import gradio as gr
import spaces
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
def get_device():
    """Return 'cuda' if a CUDA-capable GPU is available, otherwise 'cpu'."""
    return "cuda" if torch.cuda.is_available() else "cpu"
model_name = "microsoft/Phi-3-mini-128k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Pick the device before loading so the dtype can match it: bfloat16 on GPU,
# float32 on CPU (the dynamic quantization below expects float32 weights).
device = get_device()
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
    trust_remote_code=True,
)
model.to(device)
# Apply dynamic quantization only when running on CPU: eligible Linear layers
# are converted to int8, cutting memory use and speeding up CPU inference.
if device == 'cpu':
    model = torch.quantization.quantize_dynamic(
        model, {torch.nn.Linear}, dtype=torch.qint8
    )
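# Optional sanity check, a minimal sketch (not required by the app): dynamic
# quantization swaps eligible torch.nn.Linear modules for their dynamically
# quantized counterparts, so counting them confirms the conversion ran.
# The torch.ao.nn.quantized.dynamic path assumes a reasonably recent PyTorch.
if device == 'cpu':
    n_quantized = sum(
        1 for m in model.modules()
        if isinstance(m, torch.ao.nn.quantized.dynamic.Linear)
    )
    print(f"Dynamically quantized Linear layers: {n_quantized}")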
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
    trust_remote_code=True
)
@spaces.GPU  # On ZeroGPU Spaces, GPU work must run inside a decorated function; elsewhere this is a no-op
def chat_function(message, history, system_prompt, max_new_tokens, temperature):
    # Flatten the system prompt and message history into a plain-text prompt.
    prompt = system_prompt + "\n"
    for msg in history:
        prompt += f'{msg["role"]}: {msg["content"]}\n'
    prompt += f'user: {message}\n'
    # do_sample=True rejects a temperature of 0, so clamp to a small positive
    # value instead of unconditionally shifting the user's setting.
    temp = max(temperature, 0.1)
    outputs = pipeline(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temp,
        top_p=0.9,
    )
    # The pipeline returns prompt + completion; strip the prompt so only the
    # newly generated text is shown in the chat.
    generated_text = outputs[0]['generated_text']
    new_text = generated_text[len(prompt):]
    return new_text
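# A minimal sketch of an alternative prompt builder that uses the tokenizer's
# built-in chat template instead of manual "role: content" concatenation.
# Phi-3 checkpoints ship a chat template, so this usually matches the format
# the model was trained on. The helper name build_chat_prompt is an
# illustration, not part of the original app.
def build_chat_prompt(message, history, system_prompt):
    messages = [{"role": "system", "content": system_prompt}]
    messages.extend(history)  # history already holds {"role": ..., "content": ...} dicts
    messages.append({"role": "user", "content": message})
    # tokenize=False returns the formatted prompt string; add_generation_prompt=True
    # appends the marker that cues the model to answer as the assistant.
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )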
chat_interface = gr.ChatInterface(
    fn=chat_function,
    type="messages",  # pass history as {"role", "content"} dicts, matching chat_function
    chatbot=gr.Chatbot(height=400),
    textbox=gr.Textbox(placeholder="Enter message here", container=False, scale=7),
    title="Chat with AI Model",
    description="""
    Phi-3-Mini-128K-Instruct is a 3.8-billion-parameter, lightweight, state-of-the-art open model trained on the Phi-3 datasets, which include both synthetic data and filtered publicly available website data, with an emphasis on high-quality, reasoning-dense content. The model belongs to the Phi-3 family; the Mini version comes in two variants, 4K and 128K, which is the context length (in tokens) it supports.
    https://huggingface.co/microsoft/Phi-3-mini-128k-instruct
    """,
    theme="soft",
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System Prompt", placeholder="Enter a system prompt"),
        gr.Slider(minimum=50, maximum=1000, step=50, value=150, label="Max New Tokens"),
        gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=0.7, label="Temperature")
    ]
)
chat_interface.launch()