import os

import gradio as gr
from huggingface_hub import InferenceClient

# Retrieve the Hugging Face API token from the environment
token = os.getenv("HF_TOKEN")

client = InferenceClient(
    "meta-llama/Llama-3.2-3B-Instruct",
    token=token,
)


def chat_with_llama(user_input):
    # Stream the completion and accumulate the chunks into one string
    response = ""
    for message in client.chat_completion(
        messages=[{"role": "user", "content": user_input}],
        max_tokens=500,
        stream=True,
    ):
        # delta.content can be None (e.g. on the final stop chunk), so guard it
        response += message.choices[0].delta.content or ""
    return response


# Create a Gradio interface
interface = gr.Interface(
    fn=chat_with_llama,
    inputs=gr.Textbox(label="Input Text", placeholder="Ask something..."),
    outputs="text",
    title="Chat with Llama 3",
    description="Enter your message to chat with Llama 3. Type your question or prompt below.",
)

if __name__ == "__main__":
    interface.launch()
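
# Streaming variant (a minimal sketch, not wired into the interface above):
# chat_with_llama buffers the whole completion before returning, but Gradio
# treats generator functions as streaming outputs, so yielding the
# accumulated response after each chunk updates the textbox as tokens
# arrive. To use it, define it before the gr.Interface call and pass
# fn=chat_with_llama_streaming; the function name here is illustrative.
def chat_with_llama_streaming(user_input):
    response = ""
    for message in client.chat_completion(
        messages=[{"role": "user", "content": user_input}],
        max_tokens=500,
        stream=True,
    ):
        delta = message.choices[0].delta.content
        if delta:  # skip None/empty deltas, e.g. the final stop chunk
            response += delta
            yield response  # each yield re-renders the Gradio output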