# Gradio chat app for Qwen1.5-0.5B-Chat served locally with llama-cpp-python.
# Requires: pip install gradio llama-cpp-python huggingface-hub transformers
import gradio as gr
import llama_cpp
import llama_cpp.llama_tokenizer

# Load the GGUF weights from the Hugging Face Hub, pairing them with the
# original Hugging Face tokenizer so chat templating matches the base model.
llama = llama_cpp.Llama.from_pretrained(
    repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
    filename="*q8_0.gguf",
    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
        "Qwen/Qwen1.5-0.5B"
    ),
    verbose=False,
)

# llama-cpp-python's OpenAI-compatible API accepts a model name but ignores
# it; this placeholder is never sent to OpenAI.
model = "gpt-3.5-turbo"


def predict(message, history, system_prompt, max_new_tokens, temperature):
    # Rebuild the conversation in OpenAI chat format. `history` arrives as
    # (user, assistant) pairs; the values of the `additional_inputs` defined
    # on the ChatInterface are passed in after it.
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    for user_message, assistant_message in history:
        messages.append({"role": "user", "content": user_message})
        messages.append({"role": "assistant", "content": assistant_message})
    messages.append({"role": "user", "content": message})

    response = llama.create_chat_completion_openai_v1(
        model=model,
        messages=messages,
        max_tokens=int(max_new_tokens),
        temperature=temperature,
        stream=True,
    )

    # Stream the reply chunk by chunk, yielding the accumulated text so the
    # chatbot message updates in place.
    text = ""
    for chunk in response:
        content = chunk.choices[0].delta.content
        if content:
            text += content
            yield text


chat_interface = gr.ChatInterface(
    fn=predict,
    chatbot=gr.Chatbot(height=400),
    textbox=gr.Textbox(placeholder="Enter message here", container=False, scale=7),
    title="Chat with Qwen1.5-0.5B-Chat (GGUF)",
    description="Qwen1.5-0.5B-Chat running locally via llama-cpp-python on a q8_0-quantized GGUF build.",
    theme="soft",
    additional_inputs=[
        gr.Textbox(
            value="You are a helpful assistant.",
            label="System Prompt",
            placeholder="Enter a system prompt",
        ),
        gr.Slider(minimum=50, maximum=1000, step=50, value=150, label="Max New Tokens"),
        gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=0.7, label="Temperature"),
    ],
)

chat_interface.launch()