import gradio as gr
import spaces
import llama_cpp
import llama_cpp.llama_tokenizer

import gradio as gr

REPO_ID = "jordigonzm/gemma-2b-it"
MODEL_FILENAME = "gemma-2b-it_v1p1-Q4_K_M.gguf"

llama = llama_cpp.Llama.from_pretrained(
    repo_id=REPO_ID,
    filename=MODEL_FILENAME,
    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(REPO_ID),
    verbose=False
)

model = "gpt-3.5-turbo"

def predict(message, history):
    messages = []
        
    # Añadir el nuevo mensaje del usuario.
    messages.append({"role": "user", "content": message})
    
    # Crear la respuesta del modelo.
    response = llama.create_chat_completion_openai_v1(
        model=model,
        messages=messages,
        stream=True
    )
    
    # Recopilar y devolver la respuesta.
    text = ""
    for chunk in response:
        content = chunk.choices[0].delta.content
        if content:
            text += content
            yield text

chat_interface = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(lines=6, placeholder="Ask"),
    ],
    outputs="text",
    title="Chat with Gemma-2B-it Model, LlamaCPP",
    description=" \
    Welcome to the Gemma-2B-IT LlamaCPP Assistant, an interactive platform powered by the state-of-the-art \
    Gemma-2B-it language model, seamlessly integrated with Llama-CPP-Python.<br>" + MODEL_FILENAME,
    theme="soft",
)

chat_interface.launch()