Yhhxhfh committed on
Commit 9fcde1e
1 Parent(s): 1a34e93

Update app.py

Files changed (1)
  1. app.py +33 -76
app.py CHANGED
@@ -1,112 +1,70 @@
  from pydantic import BaseModel
  from llama_cpp import Llama
- import re
  import os
  import gradio as gr
  from dotenv import load_dotenv
  from fastapi import FastAPI, Request
  from fastapi.responses import JSONResponse
  import spaces
- import urllib3
+ import asyncio
  import random

- urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-
  app = FastAPI()
  load_dotenv()

  HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")

- global_data = {
-     'model': None,
-     'tokens': {
-         'eos': 'eos_token',
-         'pad': 'pad_token',
-         'padding': 'padding_token',
-         'unk': 'unk_token',
-         'bos': 'bos_token',
-         'sep': 'sep_token',
-         'cls': 'cls_token',
-         'mask': 'mask_token'
-     }
- }
-
- model_configs = [
-     {"repo_id": "Ffftdtd5dtft/gpt2-xl-Q2_K-GGUF", "filename": "gpt2-xl-q2_k.gguf"},
-     {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-instruct-q2_k.gguf"},
-     {"repo_id": "Ffftdtd5dtft/gemma-2-9b-it-Q2_K-GGUF", "filename": "gemma-2-9b-it-q2_k.gguf"},
-     {"repo_id": "Ffftdtd5dtft/gemma-2-27b-Q2_K-GGUF", "filename": "gemma-2-27b-q2_k.gguf"},
-     {"repo_id": "Ffftdtd5dtft/Phi-3-mini-128k-instruct-Q2_K-GGUF", "filename": "phi-3-mini-128k-instruct-q2_k.gguf"},
-     {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Q2_K-GGUF", "filename": "meta-llama-3.1-70b-q2_k.gguf"},
-     {"repo_id": "Ffftdtd5dtft/Qwen2-7B-Instruct-Q2_K-GGUF", "filename": "qwen2-7b-instruct-q2_k.gguf"},
-     {"repo_id": "Ffftdtd5dtft/starcoder2-3b-Q2_K-GGUF", "filename": "starcoder2-3b-q2_k.gguf"},
-     {"repo_id": "Ffftdtd5dtft/Qwen2-1.5B-Instruct-Q2_K-GGUF", "filename": "qwen2-1.5b-instruct-q2_k.gguf"},
-     {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-70b-instruct-q2_k.gguf"},
-     {"repo_id": "Ffftdtd5dtft/codegemma-2b-IQ1_S-GGUF", "filename": "codegemma-2b-iq1_s-imat.gguf"},
-     {"repo_id": "Ffftdtd5dtft/Phi-3.5-mini-instruct-Q2_K-GGUF", "filename": "phi-3.5-mini-instruct-q2_k.gguf"},
-     {"repo_id": "Ffftdtd5dtft/TinyLlama-1.1B-Chat-v1.0-IQ1_S-GGUF", "filename": "tinyllama-1.1b-chat-v1.0-iq1_s-imat.gguf"},
-     {"repo_id": "Ffftdtd5dtft/Mistral-Nemo-Minitron-8B-Base-IQ1_S-GGUF", "filename": "mistral-nemo-minitron-8b-base-iq1_s-imat.gguf"},
-     {"repo_id": "Ffftdtd5dtft/Mistral-Nemo-Instruct-2407-Q2_K-GGUF", "filename": "mistral-nemo-instruct-2407-q2_k.gguf"}
- ]
-
  class ModelManager:
      def __init__(self):
-         self.model = None
+         self.model = self.load_models()

      def load_models(self):
          models = []
+         model_configs = [
+             {"repo_id": "Ffftdtd5dtft/gpt2-xl-Q2_K-GGUF", "filename": "gpt2-xl-q2_k.gguf"},
+             {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-instruct-q2_k.gguf"},
+             {"repo_id": "Ffftdtd5dtft/gemma-2-9b-it-Q2_K-GGUF", "filename": "gemma-2-9b-it-q2_k.gguf"},
+             {"repo_id": "Ffftdtd5dtft/gemma-2-27b-Q2_K-GGUF", "filename": "gemma-2-27b-q2_k.gguf"},
+             {"repo_id": "Ffftdtd5dtft/Phi-3-mini-128k-instruct-Q2_K-GGUF", "filename": "phi-3-mini-128k-instruct-q2_k.gguf"},
+             {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Q2_K-GGUF", "filename": "meta-llama-3.1-70b-q2_k.gguf"},
+             {"repo_id": "Ffftdtd5dtft/Qwen2-7B-Instruct-Q2_K-GGUF", "filename": "qwen2-7b-instruct-q2_k.gguf"},
+             {"repo_id": "Ffftdtd5dtft/starcoder2-3b-Q2_K-GGUF", "filename": "starcoder2-3b-q2_k.gguf"},
+             {"repo_id": "Ffftdtd5dtft/Qwen2-1.5B-Instruct-Q2_K-GGUF", "filename": "qwen2-1.5b-instruct-q2_k.gguf"},
+             {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-70b-instruct-q2_k.gguf"},
+             {"repo_id": "Ffftdtd5dtft/codegemma-2b-IQ1_S-GGUF", "filename": "codegemma-2b-iq1_s.gguf"},
+             {"repo_id": "Ffftdtd5dtft/Phi-3.5-mini-instruct-Q2_K-GGUF", "filename": "phi-3.5-mini-instruct-q2_k.gguf"},
+             {"repo_id": "Ffftdtd5dtft/TinyLlama-1.1B-Chat-v1.0-IQ1_S-GGUF", "filename": "tinyllama-1.1b-chat-v1.0-iq1_s.gguf"},
+             {"repo_id": "Ffftdtd5dtft/Mistral-Nemo-Minitron-8B-Base-IQ1_S-GGUF", "filename": "mistral-nemo-minitron-8b-base-iq1_s.gguf"},
+             {"repo_id": "Ffftdtd5dtft/Mistral-Nemo-Instruct-2407-Q2_K-GGUF", "filename": "mistral-nemo-instruct-2407-q2_k.gguf"}
+         ]
          for config in model_configs:
-             try:
-                 model = Llama.from_pretrained(repo_id=config['repo_id'], filename=config['filename'], use_auth_token=HUGGINGFACE_TOKEN)
-                 models.append(model)
-             except Exception:
-                 continue
-         self.model = models
+             model = Llama.from_pretrained(repo_id=config['repo_id'], filename=config['filename'], use_auth_token=HUGGINGFACE_TOKEN)
+             models.append(model)
+         return models

  model_manager = ModelManager()
- model_manager.load_models()
- global_data['model'] = model_manager.model

  class ChatRequest(BaseModel):
      message: str

- def normalize_input(input_text):
-     return input_text.strip()
-
- def remove_duplicates(text):
-     text = re.sub(r'(Hello there, how are you\? \[/INST\]){2,}', 'Hello there, how are you? [/INST]', text)
-     text = re.sub(r'(How are you\? \[/INST\]){2,}', 'How are you? [/INST]', text)
-     text = text.replace('[/INST]', '')
-     lines = text.split('\n')
-     unique_lines = []
-     seen_lines = set()
-     for line in lines:
-         if line not in seen_lines:
-             unique_lines.append(line)
-             seen_lines.add(line)
-     return '\n'.join(unique_lines)
-
  @spaces.GPU()
  async def generate_combined_response(inputs):
      combined_response = ""
      top_p = round(random.uniform(0.01, 1.00), 2)
      top_k = random.randint(1, 100)
      temperature = round(random.uniform(0.01, 2.00), 2)
-     for model in global_data['model']:
-         try:
-             response = model(inputs, top_p=top_p, top_k=top_k, temperature=temperature)
-             combined_response += remove_duplicates(response['choices'][0]['text']) + "\n"
-         except Exception:
-             continue
+     tasks = []
+     for model in model_manager.model:
+         tasks.append(model(inputs, top_p=top_p, top_k=top_k, temperature=temperature))
+     responses = await asyncio.gather(*tasks)
+     for response in responses:
+         combined_response += response['choices'][0]['text'] + "\n"
      return combined_response

  async def process_message(message):
-     inputs = normalize_input(message)
+     inputs = message.strip()
      combined_response = await generate_combined_response(inputs)
-     formatted_response = ""
-     for line in combined_response.split("\n"):
-         formatted_response += f"{line}\n\n"
-     return formatted_response
+     return combined_response

  @app.post("/generate_multimodel")
  async def api_generate_multimodel(request: Request):
@@ -119,10 +77,9 @@ iface = gr.Interface(
      fn=process_message,
      inputs=gr.Textbox(lines=2, placeholder="Enter your message here..."),
      outputs=gr.Markdown(),
-     title="Multi-Model LLM API",
-     description="Enter a message and get responses from a unified model.",
+     title="Unified Multi-Model API",
+     description="Enter a message to get responses from a unified model."
  )

  if __name__ == "__main__":
-     port = int(os.environ.get("PORT", 7860))
-     iface.launch(server_port=port)
+     iface.launch()
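
Note on the rewritten generate_combined_response: llama_cpp.Llama.__call__ is a blocking, synchronous call that returns a plain dict, so the tasks list in the committed code holds already-computed results rather than awaitables, and asyncio.gather(*tasks) would raise a TypeError. Below is a minimal sketch of one way to keep the concurrent structure, assuming Python 3.9+ for asyncio.to_thread; gather_completions and its parameters are illustrative names, not part of the commit.

import asyncio
from llama_cpp import Llama

async def gather_completions(models, prompt, **sampling):
    # Run each blocking Llama call in a worker thread so gather() receives real awaitables.
    tasks = [asyncio.to_thread(model, prompt, **sampling) for model in models]
    responses = await asyncio.gather(*tasks)
    # Concatenate the first completion from each model, mirroring the committed loop.
    return "\n".join(r['choices'][0]['text'] for r in responses)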
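
The commit also drops the per-model try/except around Llama.from_pretrained, and ModelManager() now downloads every model inside __init__ at import time, so a single failed download (gated repo, bad token, network error) aborts startup. A hedged sketch of a guarded loader in the spirit of the removed code; load_models_safely is an illustrative name, and use_auth_token is taken verbatim from the commit.

def load_models_safely(model_configs, token):
    models = []
    for config in model_configs:
        try:
            models.append(Llama.from_pretrained(repo_id=config['repo_id'], filename=config['filename'], use_auth_token=token))
        except Exception as exc:
            # Skip models that fail to download or load instead of crashing the app.
            print(f"Skipping {config['repo_id']}: {exc}")
    return models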