import gradio as gr import spaces import llama_cpp import llama_cpp.llama_tokenizer import gradio as gr REPO_ID = "jordigonzm/gemma-2b-it" MODEL_FILENAME = "gemma-2b-it_v1p1-Q4_K_M.gguf" llama = llama_cpp.Llama.from_pretrained( repo_id=REPO_ID, filename=MODEL_FILENAME, tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(REPO_ID), verbose=False ) model = "gpt-3.5-turbo" def predict(message, history): messages = [] # AƱadir el nuevo mensaje del usuario. messages.append({"role": "user", "content": message}) # Crear la respuesta del modelo. response = llama.create_chat_completion_openai_v1( model=model, messages=messages, stream=True ) # Recopilar y devolver la respuesta. text = "" for chunk in response: content = chunk.choices[0].delta.content if content: text += content yield text chat_interface = gr.Interface( fn=predict, inputs=[ gr.Textbox(lines=6, placeholder="Ask"), ], outputs="text", title="Chat with Gemma-2B-it Model, LlamaCPP", description=" \ Welcome to the Gemma-2B-IT LlamaCPP Assistant, an interactive platform powered by the state-of-the-art \ Gemma-2B-it language model, seamlessly integrated with Llama-CPP-Python.
" + MODEL_FILENAME, theme="soft", ) chat_interface.launch()