"""Gradio chat demo served by the microsoft/Phi-3-mini-128k-instruct model."""

import gradio as gr
import spaces
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer


def get_device():
    """Return "cuda" when CUDA (a GPU) is available, otherwise "cpu"."""
    return "cuda" if torch.cuda.is_available() else "cpu"


model_name = "microsoft/Phi-3-mini-128k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

# Pick the device according to GPU availability.
device = get_device()

# Optional dynamic int8 quantization of the linear layers, currently disabled:
# model = torch.quantization.quantize_dynamic(
#     model, {torch.nn.Linear}, dtype=torch.qint8
# )

pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
    trust_remote_code=True,
)


def _history_lines(history):
    """Yield one ``"<role>: <content>\\n"`` prompt line per past chat turn.

    Two history layouts are accepted so the callback works regardless of how
    the chat UI is configured:

    * "messages" style: dicts carrying ``"role"`` and ``"content"`` keys;
    * "pairs" style: ``(user_text, assistant_text)`` sequences, where either
      side may be ``None`` and is then skipped.
    """
    for entry in history or []:
        if isinstance(entry, dict):
            yield f'{entry["role"]}: {entry["content"]}\n'
            continue
        user_text, assistant_text = entry
        if user_text is not None:
            yield f"user: {user_text}\n"
        if assistant_text is not None:
            yield f"assistant: {assistant_text}\n"


def _build_prompt(system_prompt, history, message):
    """Assemble the plain-text prompt: system prompt, past turns, new message."""
    parts = []
    if system_prompt:
        # Keep the system prompt on its own line; without the separator it
        # would run straight into the first turn ("Hello!user: ...").
        if not system_prompt.endswith("\n"):
            system_prompt += "\n"
        parts.append(system_prompt)
    parts.extend(_history_lines(history))
    parts.append(f"user: {message}\n")
    return "".join(parts)


@spaces.GPU
def chat_function(message, history, system_prompt, max_new_tokens, temperature):
    """Generate the model's reply to ``message``.

    Args:
        message: The new user message.
        history: Previous turns, either role/content dicts or
            (user, assistant) pairs; may be empty or ``None``.
        system_prompt: Text placed at the very start of the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature chosen in the UI (0.0 to 1.0).

    Returns:
        Only the newly generated text, without the prompt.
    """
    prompt = _build_prompt(system_prompt, history, message)

    # The UI slider starts at 0.0 but sampling needs a strictly positive
    # temperature, hence the constant offset.
    temp = temperature + 0.1

    outputs = pipeline(
        prompt,
        # Sliders may deliver a float; generation expects an integer count.
        max_new_tokens=int(max_new_tokens),
        do_sample=True,
        temperature=temp,
        top_p=0.9,
        # Let the pipeline drop the prompt instead of slicing it off by hand.
        return_full_text=False,
    )
    return outputs[0]["generated_text"]


# NOTE: `allow_screenshots` and `allow_flagging` belong to `gr.Interface`;
# `gr.ChatInterface` does not accept them, so they are not passed here.
chat_interface = gr.ChatInterface(
    fn=chat_function,
    chatbot=gr.Chatbot(height=400),
    textbox=gr.Textbox(placeholder="Enter message here", container=False, scale=7),
    title="Chat with AI Model",
    description=(
        " The Phi-3-Mini-128K-Instruct is a 3.8 billion-parameter, lightweight, "
        "state-of-the-art open model trained using the Phi-3 datasets. "
        "This dataset includes both synthetic data and filtered publicly available "
        "website data, with an emphasis on high-quality and reasoning-dense "
        "properties. The model belongs to the Phi-3 family with the Mini version "
        "in two variants 4K and 128K which is the context length (in tokens) "
        "that it can support. \n"
    ),
    theme="soft",
    additional_inputs=[
        gr.Textbox(value="Hello!", label="System Prompt", placeholder="Enter a system prompt"),
        gr.Slider(minimum=50, maximum=1000, step=50, value=150, label="Max New Tokens"),
        gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=0.7, label="Temperature"),
    ],
)

chat_interface.launch()