import gradio as gr
import spaces
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
def get_device():
    """Return 'cuda' if a CUDA-capable GPU is available, otherwise 'cpu'."""
    return "cuda" if torch.cuda.is_available() else "cpu"
model_name = "microsoft/Phi-3-mini-128k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Load in bfloat16 to halve memory use; trust_remote_code allows the repo's
# custom modeling code to run.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, trust_remote_code=True)
# Pick the device according to GPU availability
device = get_device()
model.to(device)

# On CPU, apply dynamic int8 quantization to the Linear layers to speed up
# inference. quantize_dynamic expects float32 weights, so cast the bfloat16
# model back to float32 first.
if device == 'cpu':
    model = model.float()
    model = torch.quantization.quantize_dynamic(
        model, {torch.nn.Linear}, dtype=torch.qint8
    )
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
    trust_remote_code=True
)
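
# Quick smoke test (a sketch; uncomment to check generation locally before
# wiring up the UI — the prompt here is an arbitrary example):
# print(pipeline("Hello, world.", max_new_tokens=10)[0]["generated_text"])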
@spaces.GPU
def chat_function(message, history, system_prompt, max_new_tokens, temperature):
    # History arrives as a list of role/content dicts (the Chatbot below is
    # created with type="messages"), so build a flat "role: content" prompt.
    prompt = system_prompt + "\n"
    for msg in history:
        prompt += f'{msg["role"]}: {msg["content"]}\n'
    prompt += f'user: {message}\n'
    # Keep the temperature strictly positive: do_sample=True rejects 0.0.
    temp = temperature + 0.1
    outputs = pipeline(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temp,
        top_p=0.9,
    )
    # The pipeline returns prompt + continuation; strip the prompt so only the
    # newly generated text reaches the chat window.
    generated_text = outputs[0]['generated_text']
    new_text = generated_text[len(prompt):]
    return new_text
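
# Alternative prompt construction (a sketch, not wired into chat_function):
# Phi-3 publishes a chat template, so tokenizer.apply_chat_template can render
# the conversation in the model's expected special-token layout instead of the
# plain "role: content" lines used above.
def build_chat_prompt(message, history, system_prompt):
    """Format the conversation with the tokenizer's built-in chat template."""
    messages = [{"role": "system", "content": system_prompt}]
    messages.extend(history)  # history is already a list of role/content dicts
    messages.append({"role": "user", "content": message})
    # tokenize=False returns the formatted string; add_generation_prompt appends
    # the assistant cue so the model continues as the assistant.
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )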
chat_interface = gr.ChatInterface(
    fn=chat_function,
    # type="messages" makes the history a list of role/content dicts, matching
    # the access pattern in chat_function (requires a recent Gradio release).
    chatbot=gr.Chatbot(height=400, type="messages"),
    textbox=gr.Textbox(placeholder="Enter message here", container=False, scale=7),
    title="Chat with AI Model",
    description="""
    Phi-3-Mini-128K-Instruct is a 3.8-billion-parameter, lightweight, state-of-the-art open model trained on the Phi-3 datasets, which combine synthetic data and filtered publicly available web data with an emphasis on high quality and reasoning density. The model belongs to the Phi-3 family; the Mini version comes in two variants, 4K and 128K, which denote the supported context length in tokens.
    https://huggingface.co/microsoft/Phi-3-mini-128k-instruct
    """,
    theme="soft",
    additional_inputs=[
        gr.Textbox(value="Hello!", label="System Prompt", placeholder="Enter a system prompt"),
        gr.Slider(minimum=50, maximum=1000, step=50, value=150, label="Max New Tokens"),
        gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=0.7, label="Temperature")
    ],
)
chat_interface.launch()