import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model and tokenizer
model_name = "Sephfox/A.I.R.R"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Optimized loading: half precision on CUDA, full precision on CPU.
# device_map="auto" is deliberately not passed here: it hands placement over
# to accelerate, and manually calling .to() on a model that accelerate has
# dispatched is redundant at best and rejected when layers are sharded or
# offloaded. A single explicit .to(device) keeps the weights on the same
# device the inputs are sent to.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
).to(device)

def generate_response(prompt):
    """Generate a sampled reply to *prompt* using the module-level model.

    Args:
        prompt: The user's message as plain text.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is
        not echoed back), stripped of surrounding whitespace. Returns an
        empty string when *prompt* is empty or whitespace-only.
    """
    if not prompt or not prompt.strip():
        # A blank prompt can tokenize to zero tokens, leaving generate()
        # nothing to condition on; skip the model call entirely.
        return ""

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_length = inputs["input_ids"].shape[1]

    outputs = model.generate(
        **inputs,
        # max_new_tokens bounds only the reply. max_length would also count
        # the prompt tokens, so a long prompt would leave little or no room
        # for generated text.
        max_new_tokens=200,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )

    # For a causal LM the returned sequence starts with the prompt tokens;
    # slice them off so the reply does not repeat the user's message.
    new_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# Gradio click callback: runs one chat turn and returns the updated history.
def chat_bot(user_input, history=None):
    """Append one (user message, bot reply) turn to the chat history.

    Args:
        user_input: Text typed by the user.
        history: List of ``(user, bot)`` tuples from earlier turns, or
            ``None`` to start a new conversation.

    Returns:
        A 2-tuple ``(history, history)`` holding the same updated list
        twice: one for the Chatbot display, one for the session state.
        The caller's list is never mutated; a new list is returned.
    """
    # Copy so neither a shared default nor the caller's list is mutated.
    history = list(history) if history else []

    if not user_input or not user_input.strip():
        # Ignore blank submissions rather than sending them to the model.
        return history, history

    bot_response = generate_response(user_input)
    history.append((user_input, bot_response))
    return history, history

# Assemble the page: title, chat transcript, message box, per-session
# history state, and a send button.
with gr.Blocks() as demo:
    gr.Markdown("# A.I.R.R Chatbot (Optimized)")
    chatbot = gr.Chatbot(label="Chat with A.I.R.R")
    user_input = gr.Textbox(
        placeholder="Type your message here...",
        show_label=False,
    )
    state = gr.State([])
    submit_button = gr.Button("Send")

    # On click, pass the typed message and the stored history to chat_bot,
    # then write its two return values to the transcript and the state.
    submit_button.click(
        chat_bot,
        inputs=[user_input, state],
        outputs=[chatbot, state],
    )

# Start the web server for the interface.
demo.launch()