# ResponseModel / app.py
from fastapi import FastAPI
import os
from llama_cpp import Llama
from transformers import AutoTokenizer

# Hugging Face token for gated repositories such as google/gemma-2-2b-it
access_token = os.getenv("access_token")

# The tokenizers are only used to build chat-formatted prompt strings
tokenizer1 = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
tokenizer2 = AutoTokenizer.from_pretrained("google/gemma-2-2b-it", token=access_token)
tokenizer3 = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
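# Note (illustrative, not from the original file): with tokenize=False and
# add_generation_prompt=True, apply_chat_template returns the prompt as one plain
# string in each model's own chat format, e.g. ChatML-style
# "<|im_start|>user\n...<|im_end|>\n<|im_start|>assistant\n" for the Qwen tokenizer;
# the exact markers come from each tokenizer's chat template configuration.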
# GGUF-quantized checkpoints served through llama.cpp on CPU
llm1 = Llama.from_pretrained(
    repo_id="Qwen/Qwen2-1.5B-Instruct-GGUF",
    filename="*q8_0.gguf",
    verbose=False
)
llm2 = Llama.from_pretrained(
    repo_id="NexaAIDev/gemma-2-2b-it-GGUF",
    filename="*q4_K_S.gguf",
    verbose=False
)
llm3 = Llama.from_pretrained(
    repo_id="microsoft/Phi-3-mini-4k-instruct-gguf",
    filename="*q4.gguf",
    verbose=False
)
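# For reference (added note, not in the original): calling an llm object directly,
# e.g. llm1(text, max_tokens=64), returns a completion dict shaped roughly like
# {"choices": [{"text": "...", "finish_reason": "..."}], "usage": {...}}, which is
# why the helpers below read output['choices'][0]['text'].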
app = FastAPI()


@app.get("/")
async def read_root():
    return {"Hello": "World!"}
def modelResp1(prompt):
    messages = [
        {"role": "system", "content": "You are a helpful assistant, Sia, developed by Sushma. You will respond politely and briefly."},
        {"role": "user", "content": "Who are you?"},
        {"role": "assistant", "content": "I am Sia, a small language model created by Sushma."},
        {"role": "user", "content": f"{prompt}"}
    ]
    text = tokenizer1.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    output = llm1(
        text,
        max_tokens=64,  # Generate up to 64 tokens
        echo=False,     # Do not echo the prompt in the output
    )
    response = output['choices'][0]['text']
    return response
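# Hypothetical usage: modelResp1("What is FastAPI?") returns only the newly
# generated assistant text, since echo=False keeps the prompt out of the completion.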
def modelResp2(prompt):
    # No "system" turn here, presumably because Gemma's chat template does not
    # accept a system role; the persona is primed through the dialogue instead.
    messages = [
        {"role": "user", "content": "Who are you?"},
        {"role": "assistant", "content": "I am Sia, a small language model created by Sushma."},
        {"role": "user", "content": f"{prompt}"}
    ]
    text = tokenizer2.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    output = llm2(
        text,
        max_tokens=64,  # Generate up to 64 tokens
        echo=False,     # Do not echo the prompt in the output
    )
    response = output['choices'][0]['text']
    return response
def modelResp3(prompt):
    messages = [
        {"role": "system", "content": "You are a helpful assistant, Sia, developed by Sushma. You will respond politely and briefly."},
        {"role": "user", "content": "Who are you?"},
        {"role": "assistant", "content": "I am Sia, a small language model created by Sushma."},
        {"role": "user", "content": f"{prompt}"}
    ]
    text = tokenizer3.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    output = llm3(
        text,
        max_tokens=64,  # Generate up to 64 tokens
        echo=False,     # Do not echo the prompt in the output
    )
    response = output['choices'][0]['text']
    return response
@app.post("/modelapi1")
async def modelApi1(data: dict):
    prompt = data.get("prompt")
    response = modelResp1(prompt)
    return response


@app.post("/modelapi2")
async def modelApi2(data: dict):
    prompt = data.get("prompt")
    response = modelResp2(prompt)
    return response


@app.post("/modelapi3")
async def modelApi3(data: dict):
    prompt = data.get("prompt")
    response = modelResp3(prompt)
    return response
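# A minimal sketch for running the API directly (added example; the host, port, and
# the uvicorn launch below are assumptions, not part of the original deployment):
if __name__ == "__main__":
    import uvicorn

    # Example request once the server is up (hypothetical prompt value):
    #   curl -X POST http://localhost:8000/modelapi1 \
    #        -H "Content-Type: application/json" \
    #        -d '{"prompt": "What can you do?"}'
    uvicorn.run(app, host="0.0.0.0", port=8000)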