"""Gradio chat UI around the ``Sephfox/A.I.R.R`` causal language model."""

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Optimized loading: use half precision only when CUDA is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model and tokenizer.
model_name = "Sephfox/A.I.R.R"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# ``device_map="auto"`` already decides where the weights live, so the model
# is deliberately NOT moved again with ``.to(device)``.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
)


def generate_response(prompt):
    """Sample a reply to *prompt* from the model.

    Args:
        prompt: Text to feed to the model.

    Returns:
        The decoded continuation only (the prompt tokens are sliced off),
        with special tokens removed and surrounding whitespace stripped.
    """
    # Put the inputs on the device the model was actually placed on.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]
    outputs = model.generate(
        **inputs,
        # Budget for generated tokens only; ``max_length`` would also count
        # the prompt and leave nothing to generate for long inputs.
        max_new_tokens=200,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
    # The output sequence starts with the prompt; keep only what was generated
    # so the bot does not echo the user's message back.
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()


# Create Gradio chat interface
def chat_bot(user_input, history=None):
    """Handle one chat turn for the Gradio UI.

    Args:
        user_input: Message typed by the user.
        history: List of ``(user_message, bot_message)`` tuples for the
            current session. A fresh list is created when omitted.

    Returns:
        ``(history, history)``: the updated history twice, once for the
        ``Chatbot`` display and once for the ``State`` component.
    """
    # A ``None`` default avoids sharing one list between unrelated calls.
    if history is None:
        history = []
    # Skip blank messages rather than sending an empty prompt to the model.
    if not user_input or not user_input.strip():
        return history, history
    bot_response = generate_response(user_input)
    history.append((user_input, bot_response))
    return history, history


with gr.Blocks() as demo:
    gr.Markdown("# A.I.R.R Chatbot (Optimized)")
    chatbot = gr.Chatbot(label="Chat with A.I.R.R")
    user_input = gr.Textbox(
        show_label=False,
        placeholder="Type your message here...",
    )
    state = gr.State([])
    submit_button = gr.Button("Send")
    submit_button.click(
        fn=chat_bot,
        inputs=[user_input, state],
        outputs=[chatbot, state],
    )

demo.launch()