""" fine_tuning_app.py

Runs a basic chatbot app that can compare base and fine-tuned models from Hugging Face.

Note:
 - Run with: streamlit run fine_tuning_app.py
 - Use free -h, then sudo sysctl vm.drop_caches=2, to make sure there is cache space; note that this can mess up the venv
 - You may need to run huggingface-cli login in a terminal to enable access to the model
 - Or see https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/discussions/130 for the above
 - Hugging Face can use up a lot of disk space: cd ~/.cache/huggingface/hub, then rm -rf <subdir>

"""

import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import time
import torch
from pynvml import * # needs restart of IDE to install, from nvidia-ml-py3

# ---------------------------------------------------------------------------------------
#                                     GENERAL SETUP:
# ---------------------------------------------------------------------------------------

# Run on the first CUDA device when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Hugging Face access token. Left empty here; nothing in the visible code passes it on,
# so gated models rely on `huggingface-cli login` instead.
hf_token = ""
# Model to load. load_model() reads this name, so exactly one assignment must be active;
# with every candidate commented out the app fails with a NameError on start-up.
# The Qwen model is the default because it is not gated (hf_token is empty).
model_name = "Qwen/Qwen2.5-7B-Instruct" # working well now
# Other candidates that have been tried:
# model_name = "thebigoed/PreFineLlama-3.1-8B" # this works badly as it does not know chat structure
# model_name = "unsloth/Meta-Llama-3.1-8B-bnb-4bit" # this is what we were fine tuning - also bad without chat instruct
# model_name = "meta-llama/Meta-Llama-3-8B-Instruct" # very effective. NB: if using fine grained access token, make sure it can access gated repos
st.title("Fine Tuning Testing")
col1, col2 = st.columns(2)
# The chat history must survive Streamlit reruns, so it lives in session state.
if 'conversation' not in st.session_state:
    st.session_state.conversation = []
user_input = st.text_input("You:", "") # user input

def print_gpu_utilization():
    """Print the memory currently in use on GPU 0, for basic resource monitoring.

    Monitoring is best-effort: if NVML cannot be initialised or queried (for
    example when running on the CPU fallback without an NVIDIA driver), a short
    notice is printed instead of raising, so the app keeps working.
    """
    try:
        nvmlInit()
    except NVMLError as err:
        print(f"GPU memory usage unavailable: {err}")
        return

    try:
        handle = nvmlDeviceGetHandleByIndex(0)
        info = nvmlDeviceGetMemoryInfo(handle)
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    except NVMLError as err:
        print(f"GPU memory usage unavailable: {err}")
    finally:
        # Pair every successful nvmlInit() with a shutdown so NVML is released.
        nvmlShutdown()

# ---------------------------------------------------------------------------------------
#                                     MODEL SETUP:
# ---------------------------------------------------------------------------------------

@st.cache_resource(show_spinner=False)
def load_model():
    """Load the model and tokenizer named by ``model_name`` from Hugging Face.

    Decorated with ``st.cache_resource`` so the loaded objects are reused across
    Streamlit reruns. GPU memory usage is printed before and after loading, and
    a temporary success message is shown once loading has finished.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    print_gpu_utilization()
    # see https://huggingface.co/mlabonne/FineLlama-3.1-8B for how to run
    # https://huggingface.co/docs/transformers/main/en/chat_templating look into this to decide on how we do templating
    success_placeholder = st.empty()
    with st.spinner("Loading model... please wait"):
        # A tokenizer holds no weights, so the model-loading options
        # (torch_dtype / device_map) do not apply to it and no per-device
        # branch is needed.
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype="auto",
            device_map="auto",
        )

        # Not using terminators at the moment
        #terminator = tokenizer.eos_token if tokenizer.eos_token else "<|endoftext|>"

    # Show the confirmation briefly, then remove it again.
    success_placeholder.success("Model loaded successfully!", icon="🔥")
    time.sleep(2)
    success_placeholder.empty()
    print_gpu_utilization()
    return model, tokenizer


def generate_response():
    """Ask the loaded model for the next assistant reply.

    The prompt is built from the conversation stored in Streamlit session
    state: through the tokenizer's chat template when it has one, otherwise as
    plain "role: content" lines for base models.

    Returns:
        The decoded text of the newly generated tokens (the prompt is sliced off).
    """
    status_box = st.empty()
    with st.spinner("Thinking..."):
        history = st.session_state.conversation

        # Tokenise the conversation into prompt ids on the target device.
        if not tokenizer.chat_template:
            # Base models ship without a chat template, so build a flat transcript.
            print("Assuming base model.")
            transcript = "".join(
                f"{turn['role']}: {turn['content']}\n" for turn in history
            )
            encoded = tokenizer(transcript + "assistant: ", return_tensors="pt")
            prompt_ids = encoded["input_ids"].to(DEVICE)
        else:
            prompt_ids = tokenizer.apply_chat_template(
                history,
                tokenize=True,
                add_generation_prompt=True,
                return_tensors="pt",
            ).to(DEVICE)

        generated = model.generate(prompt_ids, max_new_tokens=512)

        # Keep only the tokens produced after the prompt before decoding.
        prompt_length = prompt_ids.shape[1]
        decoded = tokenizer.batch_decode(
            generated[:, prompt_length:], skip_special_tokens=True
        )
        reply = decoded[0]
        print_gpu_utilization()

    # Show the confirmation briefly, then remove it again.
    status_box.success("Response generated!", icon="✅")
    time.sleep(2)
    status_box.empty()
    return reply

# ---------------------------------------------------------------------------------------
#                                     RUNTIME EVENTS:
# ---------------------------------------------------------------------------------------

model, tokenizer = load_model()

# Submit button: record the user's message, then the model's reply to it.
with col1:
    if st.button("send") and user_input:
        st.session_state.conversation.append({"role": "user", "content": user_input})
        # generate_response() reads the conversation, so the user turn must be stored first.
        reply = generate_response()
        st.session_state.conversation.append({"role": "assistant", "content": reply})

# Clear button: reset the chat history whether or not the input box has text in it.
with col2:
    if st.button("clear chat"):
        st.session_state.conversation = []

# Display conversation history
for chat in st.session_state.conversation:
    if chat['role'] == 'user':
        st.write(f"You: {chat['content']}")
    else:
        st.write(f"Assistant: {chat['content']}")