import gradio as gr
import spaces  # Hugging Face Spaces SDK; unused here, kept for Spaces deployments
import llama_cpp
import llama_cpp.llama_tokenizer

llama = llama_cpp.Llama.from_pretrained(
    repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
    filename="*q8_0.gguf",
    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B"),
    verbose=False
)
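# Note: Llama.from_pretrained fetches the matching GGUF file from the Hugging
# Face Hub (the filename argument accepts a glob pattern); LlamaHFTokenizer
# wraps the upstream Qwen tokenizer and requires the `transformers` package.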

# This name is only echoed back in the OpenAI-style response metadata;
# generation always uses the local GGUF model loaded above.
model = "Qwen/Qwen1.5-0.5B-Chat-GGUF"

def predict(message, history, system_prompt, max_new_tokens, temperature):
    # Gradio passes the values of additional_inputs after (message, history).
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})

    # Replay prior turns from the (user, assistant) tuple history.
    for user_message, assistant_message in history:
        messages.append({"role": "user", "content": user_message})
        messages.append({"role": "assistant", "content": assistant_message})

    messages.append({"role": "user", "content": message})

    response = llama.create_chat_completion_openai_v1(
        model=model,
        messages=messages,
        max_tokens=int(max_new_tokens),
        temperature=temperature,
        stream=True
    )
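    # create_chat_completion_openai_v1 wraps llama.cpp responses in OpenAI
    # client objects (this requires the `openai` package), so each streamed
    # chunk exposes .choices[0].delta.content just like the OpenAI SDK.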

    text = ""
    for chunk in response:
        content = chunk.choices[0].delta.content
        if content:
            text += content
            yield text
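
# gr.ChatInterface accepts a generator fn for streaming: each yielded string
# replaces the in-progress assistant reply in the chatbot.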

chat_interface = gr.ChatInterface(
    fn=predict,
    chatbot=gr.Chatbot(height=400),
    textbox=gr.Textbox(placeholder="Enter message here", container=False, scale=7),
    title="Chat with Qwen1.5-0.5B-Chat (GGUF)",
    description="Chat with Qwen1.5-0.5B-Chat, served locally as a q8_0 GGUF via llama-cpp-python.",
    theme="soft",
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System Prompt", placeholder="Enter a system prompt"),
        gr.Slider(minimum=50, maximum=1000, step=50, value=150, label="Max New Tokens"),
        gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=0.7, label="Temperature")
    ]
)

chat_interface.launch()
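
# To run locally (a sketch; package list assumed, versions may differ):
#   pip install gradio llama-cpp-python huggingface-hub transformers openai
#   python app.py
# Gradio serves the UI at http://127.0.0.1:7860 by default.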