import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaTokenizer
import torch

# Load the model and tokenizer
model_name = "cognitivecomputations/TinyDolphin-2.8-1.1b"
model = AutoModelForCausalLM.from_pretrained(model_name)

# Try to load the tokenizer, with a fallback option
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
except ValueError:
    print("Failed to load AutoTokenizer. Falling back to LlamaTokenizer.")
    tokenizer = LlamaTokenizer.from_pretrained(model_name)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


def generate_response(message, chat_history):
    # Prepare the input
    chat_history_text = ""
    for turn in chat_history:
        chat_history_text += f"Human: {turn[0]}\nAI: {turn[1]}\n"

    prompt = f"{chat_history_text}Human: {message}\nAI:"

    # Tokenize and generate
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract only the AI's response
    ai_response = response.split("AI:")[-1].strip()
    return ai_response


# Create the Gradio interface
iface = gr.ChatInterface(
    generate_response,
    chatbot=gr.Chatbot(height=300),
    textbox=gr.Textbox(placeholder="Type your message here...", container=False, scale=7),
    title="TinyDolphin-2.8-1.1b Chatbot",
    description="Chat with the TinyDolphin-2.8-1.1b model.",
    theme="soft",
    examples=["Tell me a short story", "What's the capital of France?", "Explain quantum computing"],
    cache_examples=False,
)

# Launch the interface
iface.launch()