from fastapi import FastAPI
import os

from llama_cpp import Llama
from transformers import AutoTokenizer

# Hugging Face access token (required for the gated Gemma model)
access_token = os.getenv("access_token")

# Tokenizers are loaded only to apply each model's chat template;
# text generation itself runs through the GGUF models below.
tokenizer1 = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
tokenizer2 = AutoTokenizer.from_pretrained("google/gemma-2-2b-it", token=access_token)
tokenizer3 = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

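# Note: Llama.from_pretrained fetches the matching GGUF file from the
# Hugging Face Hub (the filename argument accepts a glob pattern to
# select a quantization) and caches it locally.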
llm1 = Llama.from_pretrained(
    repo_id="Qwen/Qwen2-1.5B-Instruct-GGUF",
    filename="*q8_0.gguf",
    verbose=False
)

llm2 = Llama.from_pretrained(
    repo_id="NexaAIDev/gemma-2-2b-it-GGUF",
    filename="*q4_K_S.gguf",
    verbose=False
)

llm3 = Llama.from_pretrained(
    repo_id="microsoft/Phi-3-mini-4k-instruct-gguf",
    filename="*q4.gguf",
    verbose=False
)

app = FastAPI()

@app.get("/")
async def read_root():
    return {"Hello": "World!"}

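# Each modelResp* helper formats the chat history with the matching
# tokenizer's chat template, then runs a short completion through the
# corresponding GGUF model.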
def modelResp1(prompt):
    messages = [
        {"role": "system", "content": "You are a helpful assistant, Sia, developed by Sushma. You will response in polity and brief."},
        {"role": "user", "content": "Who are you?"},
        {"role": "assistant", "content": "I am Sia, a small language model created by Sushma."},
        {"role": "user", "content": f"{prompt}"}
    ]
    text = tokenizer1.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    output = llm1(
      text,
      max_tokens=64,  # Generate up to 64 tokens
      echo=False,     # Do not echo the prompt in the output
    )
    response = output['choices'][0]['text']

    return response

def modelResp2(prompt):
    # Gemma's chat template does not support a system role, so the
    # persona is set through the seeded user/assistant turns instead.
    messages = [
        {"role": "user", "content": "Who are you?"},
        {"role": "assistant", "content": "I am Sia, a small language model created by Sushma."},
        {"role": "user", "content": f"{prompt}"}
    ]
    text = tokenizer2.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    output = llm2(
      text,
      max_tokens=64,  # Generate up to 64 tokens
      echo=False,     # Do not echo the prompt in the output
    )
    response = output['choices'][0]['text']

    return response
    
def modelResp3(prompt):
    messages = [
        {"role": "system", "content": "You are a helpful assistant, Sia, developed by Sushma. You will response in polity and brief."},
        {"role": "user", "content": "Who are you?"},
        {"role": "assistant", "content": "I am Sia, a small language model created by Sushma."},
        {"role": "user", "content": f"{prompt}"}
    ]
    text = tokenizer3.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    output = llm3(
      text,
      max_tokens=64,  # Generate up to 64 tokens
      echo=False,     # Do not echo the prompt in the output
    )
    response = output['choices'][0]['text']

    return response
    
@app.post("/modelapi1")
async def modelApi1(data: dict):
    prompt = data.get("prompt")
    response = modelResp1(prompt)
    return response

@app.post("/modelapi2")
async def modelApi2(data: dict):
    prompt = data.get("prompt")
    response = modelResp2(prompt)
    return response
    
@app.post("/modelapi3")
async def modelApi3(data: dict):
    prompt = data.get("prompt")
    response = modelResp3(prompt)
    return response
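
# A minimal usage sketch (assumes this file is saved as app.py and that
# uvicorn is installed; the host, port, and example prompt below are
# illustrative, not taken from the source):
#
#   uvicorn app:app --host 0.0.0.0 --port 8000
#
#   import requests
#   r = requests.post("http://localhost:8000/modelapi1",
#                     json={"prompt": "What is FastAPI?"})
#   print(r.json())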