Spaces:

thebigoed
/

LLMTesting

Running

File size: 3,328 Bytes

f6ffde9
8aec0fe
 
c41146d
 
 
 
 
f6ffde9
8aec0fe
 
 
 
 
 
 
 
 
c41146d
 
 
 
 
 
 
8aec0fe
 
 
 
 
c41146d
8aec0fe
 
c41146d
 
 
 
 
 
 
 
 
8aec0fe
 
 
 
 
 
 
 
 
c41146d
8aec0fe
c41146d
 
8aec0fe
 
 
 
c41146d
 
8aec0fe

import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM
import time
import torch
from pynvml import * # needs restart of IDE to install, from nvidia-ml-py3

# Pick the compute device: the first CUDA GPU when available, else the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda:0")
else:
    DEVICE = torch.device("cpu")

# Page header, with a link crediting the upstream Tele-LLMs project.
st.title("Telco Chat Bot")
st.page_link(
    "https://github.com/Ali-maatouk/Tele-LLMs",
    label="Tele-LLMs backend",
    icon="📱",
)

# Two side-by-side columns that host the action buttons further down.
col1, col2 = st.columns(2)

# Keep the chat history in session state so it survives Streamlit reruns.
if "conversation" not in st.session_state:
    st.session_state["conversation"] = []

# Free-text prompt typed by the user (empty by default).
user_input = st.text_input("You:", value="")

# Resource monitoring:
def print_gpu_utilization(device_index: int = 0) -> None:
    """Print how much memory is currently in use on one NVIDIA GPU.

    Monitoring is best-effort. When CUDA is not available the function
    returns silently, and when NVML cannot be initialised or queried it
    prints a short notice instead of raising, so the app keeps working on
    CPU-only hosts.

    Args:
        device_index: NVML index of the GPU to query. Defaults to 0, the
            device the app runs on.
    """
    # The app falls back to CPU when CUDA is missing; NVML would raise there.
    if not torch.cuda.is_available():
        return

    try:
        nvmlInit()
    except NVMLError as err:
        print(f"GPU memory query skipped: {err}")
        return

    try:
        handle = nvmlDeviceGetHandleByIndex(device_index)
        info = nvmlDeviceGetMemoryInfo(handle)
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    except NVMLError as err:
        print(f"GPU memory query failed: {err}")
    finally:
        # nvmlInit() is reference counted; balance it so handles don't pile up
        # across the many calls made during a session.
        nvmlShutdown()


# Model functions:
@st.cache_resource(show_spinner=False)
def load_model():
    """Load the Tele-LLM model and its tokenizer from the Hugging Face Hub.

    The function is wrapped in ``st.cache_resource`` so Streamlit reuses the
    returned objects instead of reloading the weights on every script rerun.
    Streamlit's built-in cache spinner is disabled in favour of the custom
    spinner and success banner shown here.

    Returns:
        tuple: ``(model, tokenizer)``, with the model already moved to
        ``DEVICE``.
    """
    print_gpu_utilization()
    success_placeholder = st.empty()
    with st.spinner("Loading model... please wait"):
        # Other checkpoints that have been tried:
        #   "AliMaatouk/TinyLlama-1.1B-Tele", "AliMaatouk/LLama-3-8B-Tele-it"
        model_name = "AliMaatouk/Gemma-2B-Tele"

        # Tokenizers hold no weights, so dtype/device options do not apply
        # to them; those options belong on the model load below.
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # On GPU, load the weights in the dtype stored in the checkpoint
        # rather than the default precision; on CPU keep the library default.
        model_kwargs = {"torch_dtype": "auto"} if DEVICE.type == "cuda" else {}
        model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs).to(DEVICE)

    # Show a transient confirmation, then remove it from the page.
    success_placeholder.success("Model loaded successfully!", icon="🔥")
    time.sleep(2)
    success_placeholder.empty()
    return model, tokenizer

def generate_response(user_input):
    """Generate the model's reply to a single prompt.

    The prompt is tokenized, passed to ``model.generate`` with a cap of 750
    new tokens, and only the tokens produced after the prompt are decoded.

    Args:
        user_input: Prompt text typed by the user.

    Returns:
        The decoded continuation, with special tokens removed.
    """
    status_slot = st.empty()
    with st.spinner("Thinking..."):
        encoded = tokenizer(user_input, return_tensors="pt").to(DEVICE)
        # Number of prompt tokens; generate() echoes them before the reply.
        prompt_length = encoded["input_ids"].shape[-1]
        generated = model.generate(**encoded, max_new_tokens=750)
        print_gpu_utilization()
        # Keep only the first sequence's tokens that follow the prompt.
        reply_ids = generated[0, prompt_length:]

    # Flash a short confirmation banner, then clear it.
    status_slot.success("Response generated!", icon="✅")
    time.sleep(2)
    status_slot.empty()

    return tokenizer.decode(reply_ids, skip_special_tokens=True)

# RUNTIME EVENTS:

# Load model and tokenizer (load_model is cached, so reruns reuse them)
model, tokenizer = load_model()

# Submit button to send the query
with col1:
    # st.button is evaluated first so the widget is always rendered; the
    # query is only sent when the box holds more than whitespace.
    if st.button("send") and user_input.strip():
        st.session_state.conversation.append({"role": "user", "content": user_input})
        # Query the model (generate_response shows its own spinner)
        response = generate_response(user_input)
        # Record the bot response so it is rendered below
        st.session_state.conversation.append({"role": "bot", "content": response})

# Clear button to reset
with col2:
    # Clearing must work regardless of what is in the text box.
    if st.button("clear chat"):
        st.session_state.conversation = []

# Display conversation history
for chat in st.session_state.conversation:
    speaker = "You" if chat["role"] == "user" else "Bot"
    st.write(f"{speaker}: {chat['content']}")