import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM
import time
import torch
from pynvml import *  # from the nvidia-ml-py3 package; may need an IDE restart after installing

# Get device
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Streamlit setup
st.title("Telco Chat Bot")
st.page_link("https://github.com/Ali-maatouk/Tele-LLMs", label="Tele-LLMs backend", icon="📱")  # credit the model source
col1, col2 = st.columns(2)

if 'conversation' not in st.session_state:
    st.session_state.conversation = []

user_input = st.text_input("You:", "")  # user input


# Resource monitoring:
def print_gpu_utilization():
    """Print the current GPU memory usage via NVML (skipped when no GPU is available)."""
    if not torch.cuda.is_available():
        return
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used // 1024**2} MB.")


# Model functions:
@st.cache_resource(show_spinner=False)
def load_model():
    """Load the model and tokenizer from Hugging Face."""
    print_gpu_utilization()
    success_placeholder = st.empty()
    with st.spinner("Loading model... please wait"):
        # model_name = "AliMaatouk/TinyLlama-1.1B-Tele"
        # model_name = "AliMaatouk/LLama-3-8B-Tele-it"
        model_name = "AliMaatouk/Gemma-2B-Tele"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # torch_dtype is a model argument, not a tokenizer argument
        if str(DEVICE) == "cuda:0":
            # Load the weights in their native precision on the GPU
            model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto").to(DEVICE)
        else:
            # Default fp32 on CPU
            model = AutoModelForCausalLM.from_pretrained(model_name).to(DEVICE)
    success_placeholder.success("Model loaded successfully!", icon="🔥")
    time.sleep(2)
    success_placeholder.empty()
    return model, tokenizer


def generate_response(user_input):
    """Query the model."""
    success_placeholder = st.empty()
    with st.spinner("Thinking..."):
        inputs = tokenizer(user_input, return_tensors="pt").to(DEVICE)
        # outputs = model.generate(**inputs, max_length=1000, pad_token_id=tokenizer.eos_token_id)
        outputs = model.generate(**inputs, max_new_tokens=750)
        print_gpu_utilization()
        # Keep only the newly generated tokens, dropping the echoed prompt
        generated_tokens = outputs[0, len(inputs['input_ids'][0]):]
    success_placeholder.success("Response generated!", icon="✅")
    time.sleep(2)
    success_placeholder.empty()
    text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return text


# RUNTIME EVENTS:
# Load model and tokenizer
model, tokenizer = load_model()

# Submit button to send the query
with col1:
    if st.button("send"):
        if user_input:
            st.session_state.conversation.append({"role": "user", "content": user_input})
            # Query the model (a loading spinner is shown inside generate_response)
            response = generate_response(user_input)
            # Store bot response
            st.session_state.conversation.append({"role": "bot", "content": response})

# Clear button to reset the conversation (no user input required)
with col2:
    if st.button("clear chat"):
        st.session_state.conversation = []

# Display conversation history
for chat in st.session_state.conversation:
    if chat['role'] == 'user':
        st.write(f"You: {chat['content']}")
    else:
        st.write(f"Bot: {chat['content']}")
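
# Usage note (a sketch, not part of the original script): Streamlit apps are started
# with the `streamlit run` CLI rather than plain `python`. The filename below is an
# assumption for illustration; the package names match the imports above.
#
#   pip install streamlit torch transformers nvidia-ml-py3
#   streamlit run telco_chat_bot.py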