CaioXapelaum committed on
Commit
58b423a
1 Parent(s): 118cb38

Create app.py

Files changed (1)
  1. app.py +181 -0
app.py ADDED
@@ -0,0 +1,181 @@
+ # Use TPU v2
+ # Install the runtime dependencies before importing them ("!pip" is notebook-only
+ # syntax and would be a syntax error in a plain app.py, so subprocess is used).
+ import subprocess
+ subprocess.run(
+     "pip install gradio spaces llama-cpp-python llama-cpp-agent huggingface_hub",
+     shell=True,
+     check=True,
+ )
+
+ import spaces
+ from llama_cpp import Llama
+ from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
+ from llama_cpp_agent.providers import LlamaCppPythonProvider
+ from llama_cpp_agent.chat_history import BasicChatHistory
+ from llama_cpp_agent.chat_history.messages import Roles
+ import gradio as gr
+ from huggingface_hub import hf_hub_download
+ from google.colab import userdata
+
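+ # The Hugging Face token is read from the Colab secrets store (key "HF_TOKEN")
+ # and forwarded to hf_hub_download for the model downloads below.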
+ huggingface_token = userdata.get("HF_TOKEN")
+
+ # Download the Meta-Llama-3.1-8B-Instruct model
+ hf_hub_download(
+     repo_id="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
+     filename="Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf",
+     local_dir="./models",
+     token=huggingface_token
+ )
+
+ hf_hub_download(
+     repo_id="bartowski/Mistral-Nemo-Instruct-2407-GGUF",
+     filename="Mistral-Nemo-Instruct-2407-Q5_K_M.gguf",
+     local_dir="./models",
+     token=huggingface_token
+ )
+
+ hf_hub_download(
+     repo_id="bartowski/gemma-2-2b-it-GGUF",
+     filename="gemma-2-2b-it-Q6_K_L.gguf",
+     local_dir="./models",
+     token=huggingface_token
+ )
+
+ hf_hub_download(
+     repo_id="bartowski/openchat-3.6-8b-20240522-GGUF",
+     filename="openchat-3.6-8b-20240522-Q6_K.gguf",
+     local_dir="./models",
+     token=huggingface_token
+ )
+
+ hf_hub_download(
+     repo_id="bartowski/Llama-3-Groq-8B-Tool-Use-GGUF",
+     filename="Llama-3-Groq-8B-Tool-Use-Q6_K.gguf",
+     local_dir="./models",
+     token=huggingface_token
+ )
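+ # All five GGUF files end up in ./models; the filenames must match the choices
+ # offered by the model dropdown in the Gradio UI below.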
+
+
+ llm = None
+ llm_model = None
+
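+ # respond() keeps the loaded Llama instance in these globals so the GGUF file is
+ # only re-loaded when a different model is selected from the dropdown.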
+ @spaces.GPU(duration=50)
+ def respond(
+     message,
+     history: list[tuple[str, str]],
+     model,
+     system_message,
+     max_tokens,
+     temperature,
+     top_p,
+     top_k,
+     repeat_penalty,
+ ):
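+     # Note: the Gemma-2 message format below is applied to every model in the
+     # dropdown, even those that normally use a different chat template.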
+     chat_template = MessagesFormatterType.GEMMA_2
+
+     global llm
+     global llm_model
+
+     # Load model only if it's not already loaded or if a new model is selected
+     if llm is None or llm_model != model:
+         try:
+             llm = Llama(
+                 model_path=f"models/{model}",
+                 flash_attn=True,
+                 n_gpu_layers=81,  # Adjust based on available GPU resources
+                 n_batch=1024,
+                 n_ctx=8192,
+             )
+             llm_model = model
+         except Exception as e:
+             # respond() is a generator, so the error has to be yielded; a plain
+             # return value would never reach the Gradio UI.
+             yield f"Error loading model: {str(e)}"
+             return
+
+     provider = LlamaCppPythonProvider(llm)
+
+     agent = LlamaCppAgent(
+         provider,
+         system_prompt=f"{system_message}",
+         predefined_messages_formatter_type=chat_template,
+         debug_output=True
+     )
+
+     settings = provider.get_provider_default_settings()
+     settings.temperature = temperature
+     settings.top_k = top_k
+     settings.top_p = top_p
+     settings.max_tokens = max_tokens
+     settings.repeat_penalty = repeat_penalty
+     settings.stream = True
+
+     messages = BasicChatHistory()
+
+     # Add user and assistant messages to the history
+     for msn in history:
+         user = {'role': Roles.user, 'content': msn[0]}
+         assistant = {'role': Roles.assistant, 'content': msn[1]}
+         messages.add_message(user)
+         messages.add_message(assistant)
+
+     # Stream the response
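+     # (returns_streaming_generator=True makes get_chat_response return a generator
+     # of partial text, which is accumulated and re-yielded so the chatbot streams.)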
+     try:
+         stream = agent.get_chat_response(
+             message,
+             llm_sampling_settings=settings,
+             chat_history=messages,
+             returns_streaming_generator=True,
+             print_output=False
+         )
+
+         outputs = ""
+         for output in stream:
+             outputs += output
+             yield outputs
+     except Exception as e:
+         yield f"Error during response generation: {str(e)}"
+
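+ # gr.ChatInterface passes the additional_inputs to respond() in order, after
+ # (message, history): model, system_message, max_tokens, temperature, top_p,
+ # top_k, repeat_penalty.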
+ demo = gr.ChatInterface(
+     respond,
+     additional_inputs=[
+         gr.Dropdown([
+                 'Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf',
+                 'Mistral-Nemo-Instruct-2407-Q5_K_M.gguf',
+                 'gemma-2-2b-it-Q6_K_L.gguf',
+                 'openchat-3.6-8b-20240522-Q6_K.gguf',
+                 'Llama-3-Groq-8B-Tool-Use-Q6_K.gguf'
+             ],
+             value="gemma-2-2b-it-Q6_K_L.gguf",
+             label="Model"
+         ),
+         gr.Textbox(value="You are a helpful assistant.", label="System message"),
+         gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max tokens"),
+         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+         gr.Slider(
+             minimum=0.1,
+             maximum=1.0,
+             value=0.95,
+             step=0.05,
+             label="Top-p",
+         ),
+         gr.Slider(
+             minimum=0,
+             maximum=100,
+             value=40,
+             step=1,
+             label="Top-k",
+         ),
+         gr.Slider(
+             minimum=0.0,
+             maximum=2.0,
+             value=1.1,
+             step=0.1,
+             label="Repetition penalty",
+         ),
+     ],
+     retry_btn="Retry",
+     undo_btn="Undo",
+     clear_btn="Clear",
+     submit_btn="Send",
+     title="Chat with multiple LLMs using llama.cpp",
+     chatbot=gr.Chatbot(
+         scale=1,
+         likeable=False,
+         show_copy_button=True
+     )
+ )
+
+ if __name__ == "__main__":
+     demo.launch()