asv7j committed on
Commit 091d4d8
1 Parent(s): 8d9c7e4

Update app.py

Files changed (1):
  1. app.py +64 -32
app.py CHANGED
@@ -1,19 +1,32 @@
from fastapi import FastAPI
import torch
-
+ import os
+ from llama_cpp import Llama
from transformers import AutoModelForCausalLM, AutoTokenizer
device = "cpu"

- tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
+ access_token = os.getenv("access_token")
+
+ tokenizer1 = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
+ tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b-it", token=access_token)
+ tokenizer3 = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
+
+ llm1 = Llama.from_pretrained(
+     repo_id="Qwen/Qwen2-1.5B-Instruct-GGUF",
+     filename="*q8_0.gguf",
+     verbose=False
+ )

- model = AutoModelForCausalLM.from_pretrained(
-     "Qwen/Qwen2-0.5B-Instruct",
-     device_map="auto"
+ llm2 = Llama.from_pretrained(
+     repo_id="NexaAIDev/gemma-2-2b-it-GGUF",
+     filename="*q4_K_S.gguf",
+     verbose=False
)

- model1 = AutoModelForCausalLM.from_pretrained(
-     "Qwen/Qwen2-1.5B-Instruct",
-     device_map="auto"
+ llm3 = Llama.from_pretrained(
+     repo_id="microsoft/Phi-3-mini-4k-instruct-gguf",
+     filename="*q4.gguf",
+     verbose=False
)

app = FastAPI()
@@ -22,7 +35,7 @@ app = FastAPI()
async def read_root():
    return {"Hello": "World!"}

- def modelResp(prompt):
+ def modelResp1(prompt):
    messages = [
        {"role": "system", "content": "You are a helpful assistant, Sia, developed by Sushma. You will respond politely and briefly."},
        {"role": "user", "content": "Who are you?"},
@@ -34,20 +47,16 @@ def modelResp(prompt):
        tokenize=False,
        add_generation_prompt=True
    )
-     model_inputs = tokenizer([text], return_tensors="pt").to(device)
-     generated_ids = model.generate(
-         model_inputs.input_ids,
-         max_new_tokens=64,
-         do_sample=True
+     output = llm1(
+         text,
+         max_tokens=64,  # Generate up to 64 tokens
+         echo=False,  # Whether to echo the prompt
    )
-     generated_ids = [
-         output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
-     ]
-     response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+     response = output['choices'][0]['text']

    return response

- def modelResp1(prompt):
+ def modelResp2(prompt):
    messages = [
        {"role": "system", "content": "You are a helpful assistant, Sia, developed by Sushma. You will respond politely and briefly."},
        {"role": "user", "content": "Who are you?"},
@@ -59,27 +68,50 @@ def modelResp1(prompt):
        tokenize=False,
        add_generation_prompt=True
    )
-     model_inputs = tokenizer([text], return_tensors="pt").to(device)
-     generated_ids = model1.generate(
-         model_inputs.input_ids,
-         max_new_tokens=64,
-         do_sample=True
+     output = llm2(
+         text,
+         max_tokens=64,  # Generate up to 64 tokens
+         echo=False,  # Whether to echo the prompt
    )
-     generated_ids = [
-         output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
-     ]
-     response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+     response = output['choices'][0]['text']

    return response

- @app.post("/modelapi")
+ def modelResp3(prompt):
+     messages = [
+         {"role": "system", "content": "You are a helpful assistant, Sia, developed by Sushma. You will respond politely and briefly."},
+         {"role": "user", "content": "Who are you?"},
+         {"role": "assistant", "content": "I am Sia, a small language model created by Sushma."},
+         {"role": "user", "content": f"{prompt}"}
+     ]
+     text = tokenizer3.apply_chat_template(
+         messages,
+         tokenize=False,
+         add_generation_prompt=True
+     )
+     output = llm3(
+         text,
+         max_tokens=64,  # Generate up to 64 tokens
+         echo=False,  # Whether to echo the prompt
+     )
+     response = output['choices'][0]['text']
+
+     return response
+
+ @app.post("/modelapi1")
async def modelApi(data: dict):
    prompt = data.get("prompt")
-     response = modelResp(prompt)
+     response = modelResp1(prompt)
    return response

- @app.post("/modelapi1")
+ @app.post("/modelapi2")
+ async def modelApi2(data: dict):
+     prompt = data.get("prompt")
+     response = modelResp2(prompt)
+     return response
+
+ @app.post("/modelapi3")
async def modelApi1(data: dict):
    prompt = data.get("prompt")
-     response = modelResp1(prompt)
+     response = modelResp3(prompt)
    return response
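
For quick verification, here is a minimal client sketch for the three endpoints this commit exposes. It assumes the app is served locally (for example with uvicorn app:app --port 8000); the base URL, port, and the use of the requests library are illustrative assumptions, not part of the commit.

# Client sketch for the new endpoints; assumes a local server at 127.0.0.1:8000.
import requests

BASE_URL = "http://127.0.0.1:8000"  # assumed host/port, adjust to your deployment

def ask(endpoint: str, prompt: str) -> str:
    # Each endpoint reads a JSON body with a "prompt" key and returns the completion text.
    resp = requests.post(f"{BASE_URL}/{endpoint}", json={"prompt": prompt})
    resp.raise_for_status()
    return resp.json()

if __name__ == "__main__":
    for endpoint in ("modelapi1", "modelapi2", "modelapi3"):
        print(endpoint, "->", ask(endpoint, "What can you do?"))

The switch from full transformers checkpoints to quantized GGUF weights served through llama-cpp-python is presumably intended to keep all three models usable in the CPU-only setup (device = "cpu").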