# Gradio chat app for Qwen1.5-0.5B-Chat served locally with llama-cpp-python.
# Requires: pip install gradio llama-cpp-python huggingface-hub transformers
import gradio as gr
import llama_cpp
import llama_cpp.llama_tokenizer

# Load the GGUF weights from the Hugging Face Hub, pairing them with the
# original Hugging Face tokenizer so chat templating matches the base model.
llama = llama_cpp.Llama.from_pretrained(
    repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
    filename="*q8_0.gguf",
    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
        "Qwen/Qwen1.5-0.5B"
    ),
    verbose=False,
)

# llama-cpp-python's OpenAI-compatible API accepts a model name but ignores
# it; this placeholder is never sent to OpenAI.
model = "gpt-3.5-turbo"


def predict(message, history, system_prompt, max_new_tokens, temperature):
    # Rebuild the conversation in OpenAI chat format. `history` arrives as
    # (user, assistant) pairs; the values of the `additional_inputs` defined
    # on the ChatInterface are passed in after it.
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    for user_message, assistant_message in history:
        messages.append({"role": "user", "content": user_message})
        messages.append({"role": "assistant", "content": assistant_message})
    messages.append({"role": "user", "content": message})

    response = llama.create_chat_completion_openai_v1(
        model=model,
        messages=messages,
        max_tokens=int(max_new_tokens),
        temperature=temperature,
        stream=True,
    )

    # Stream the reply chunk by chunk, yielding the accumulated text so the
    # chatbot message updates in place.
    text = ""
    for chunk in response:
        content = chunk.choices[0].delta.content
        if content:
            text += content
            yield text


chat_interface = gr.ChatInterface(
    fn=predict,
    chatbot=gr.Chatbot(height=400),
    textbox=gr.Textbox(placeholder="Enter message here", container=False, scale=7),
    title="Chat with Qwen1.5-0.5B-Chat (GGUF)",
    description="Qwen1.5-0.5B-Chat running locally via llama-cpp-python on a q8_0-quantized GGUF build.",
    theme="soft",
    additional_inputs=[
        gr.Textbox(
            value="You are a helpful assistant.",
            label="System Prompt",
            placeholder="Enter a system prompt",
        ),
        gr.Slider(minimum=50, maximum=1000, step=50, value=150, label="Max New Tokens"),
        gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=0.7, label="Temperature"),
    ],
)

chat_interface.launch()