import os

import gradio as gr
from huggingface_hub import InferenceClient

# Retrieve the Hugging Face API token from the environment
token = os.getenv("HF_TOKEN")

client = InferenceClient(
    "meta-llama/Llama-3.2-3B-Instruct",
    token=token,
)


def chat_with_llama(user_input):
    # Stream the completion and accumulate the chunks into one string
    response = ""
    for message in client.chat_completion(
        messages=[{"role": "user", "content": user_input}],
        max_tokens=500,
        stream=True,
    ):
        # delta.content can be None (e.g. on the final stop chunk), so guard it
        response += message.choices[0].delta.content or ""
    return response


# Create a Gradio interface
interface = gr.Interface(
    fn=chat_with_llama,
    inputs=gr.Textbox(label="Input Text", placeholder="Ask something..."),
    outputs="text",
    title="Chat with Llama 3",
    description="Enter your message to chat with Llama 3. Type your question or prompt below.",
)

if __name__ == "__main__":
    interface.launch()
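
# Streaming variant (a minimal sketch, not wired into the interface above):
# chat_with_llama buffers the whole completion before returning, but Gradio
# treats generator functions as streaming outputs, so yielding the
# accumulated response after each chunk updates the textbox as tokens
# arrive. To use it, define it before the gr.Interface call and pass
# fn=chat_with_llama_streaming; the function name here is illustrative.
def chat_with_llama_streaming(user_input):
    response = ""
    for message in client.chat_completion(
        messages=[{"role": "user", "content": user_input}],
        max_tokens=500,
        stream=True,
    ):
        delta = message.choices[0].delta.content
        if delta:  # skip None/empty deltas, e.g. the final stop chunk
            response += delta
            yield response  # each yield re-renders the Gradio output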