from fastapi import FastAPI
import os

from llama_cpp import Llama
from transformers import AutoTokenizer

device = "cpu"
access_token = os.getenv("access_token")

# Tokenizers are loaded only for their chat templates; generation runs on the GGUF models below.
tokenizer1 = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
tokenizer2 = AutoTokenizer.from_pretrained("google/gemma-2-2b-it", token=access_token)
tokenizer3 = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

llm1 = Llama.from_pretrained(
    repo_id="Qwen/Qwen2-1.5B-Instruct-GGUF",
    filename="*q8_0.gguf",
    verbose=False
)

llm2 = Llama.from_pretrained(
    repo_id="NexaAIDev/gemma-2-2b-it-GGUF",
    filename="*q4_K_S.gguf",
    verbose=False
)

llm3 = Llama.from_pretrained(
    repo_id="microsoft/Phi-3-mini-4k-instruct-gguf",
    filename="*q4.gguf",
    verbose=False
)

app = FastAPI()


@app.get("/")
async def read_root():
    return {"Hello": "World!"}


def modelResp1(prompt):
    messages = [
        {"role": "system", "content": "You are Sia, a helpful assistant developed by Sushma. Respond politely and briefly."},
        {"role": "user", "content": "Who are you?"},
        {"role": "assistant", "content": "I am Sia, a small language model created by Sushma."},
        {"role": "user", "content": f"{prompt}"}
    ]
    text = tokenizer1.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    output = llm1(
        text,
        max_tokens=64,  # Generate up to 64 tokens
        echo=False,     # Do not echo the prompt in the output
    )
    response = output['choices'][0]['text']
    return response


def modelResp2(prompt):
    # Gemma's chat template does not accept a system role, so the persona is set through the first turns.
    messages = [
        {"role": "user", "content": "Who are you?"},
        {"role": "assistant", "content": "I am Sia, a small language model created by Sushma."},
        {"role": "user", "content": f"{prompt}"}
    ]
    text = tokenizer2.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    output = llm2(
        text,
        max_tokens=64,  # Generate up to 64 tokens
        echo=False,     # Do not echo the prompt in the output
    )
    response = output['choices'][0]['text']
    return response


def modelResp3(prompt):
    messages = [
        {"role": "system", "content": "You are Sia, a helpful assistant developed by Sushma. Respond politely and briefly."},
        {"role": "user", "content": "Who are you?"},
        {"role": "assistant", "content": "I am Sia, a small language model created by Sushma."},
        {"role": "user", "content": f"{prompt}"}
    ]
    text = tokenizer3.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    output = llm3(  # use the Phi-3 model loaded above (was llm2)
        text,
        max_tokens=64,  # Generate up to 64 tokens
        echo=False,     # Do not echo the prompt in the output
    )
    response = output['choices'][0]['text']
    return response


@app.post("/modelapi1")
async def modelApi1(data: dict):
    prompt = data.get("prompt")
    response = modelResp1(prompt)
    return response


@app.post("/modelapi2")
async def modelApi2(data: dict):
    prompt = data.get("prompt")
    response = modelResp2(prompt)
    return response


@app.post("/modelapi3")
async def modelApi3(data: dict):
    prompt = data.get("prompt")
    response = modelResp3(prompt)
    return response
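

# --- Usage sketch (an assumption, not part of the original app) ---
# A minimal way to run and query this service, assuming the file is saved as
# main.py and uvicorn/requests are installed; the filename, host, port, and
# example prompt below are illustrative, not requirements of the app itself.
#
#   $ uvicorn main:app --host 0.0.0.0 --port 8000
#
#   import requests
#   resp = requests.post("http://localhost:8000/modelapi1",
#                        json={"prompt": "What is FastAPI?"})
#   print(resp.json())
#
# Optional entry point so the server can also be started with `python main.py`:
if __name__ == "__main__":
    import uvicorn
    # Bind to all interfaces on port 8000; adjust host/port as needed.
    uvicorn.run(app, host="0.0.0.0", port=8000)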