Spaces:

thebigoed
/

LLMTesting

Running

App Files Files Community

LLMTesting / fine_tuning_app.py

thebigoed

updated app

c41146d 29 days ago

raw

history blame

5.48 kB

	""" fine_tuning_app.py

	Runs a basic chatbot app for comparing base and fine-tuned models from Hugging Face.

	Note:
	- run using streamlit run fine_tuning_app.py
	- use free -h to check memory, then sudo sysctl vm.drop_caches=2 to free cached memory if needed (warning: this can mess up the venv)
	- may need to run huggingface-cli login in terminal to enable access to model
	- Or: https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/discussions/130 for above
	- Hugging Face can use up a lot of disk space; to reclaim it: cd ~/.cache/huggingface/hub then rm -rf <subdir>

	"""

	import streamlit as st
	from transformers import AutoTokenizer, AutoModelForCausalLM
	import transformers
	import time
	import torch
	from pynvml import * # needs restart of IDE to install, from nvidia-ml-py3

	# ---------------------------------------------------------------------------------------
	# GENERAL SETUP:
	# ---------------------------------------------------------------------------------------

	# Prefer the first CUDA device when torch can see one; otherwise run on the CPU.
	DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
	hf_token = ""

	# Candidate checkpoints tried so far. load_model() reads `model_name`, so exactly
	# one assignment must be active or the app fails with a NameError.
	# model_name = "thebigoed/PreFineLlama-3.1-8B" # this works badly as it does not know chat structure
	# model_name = "unsloth/Meta-Llama-3.1-8B-bnb-4bit" # this is what we were fine tuning - also bad without chat instruct
	# model_name = "meta-llama/Meta-Llama-3-8B-Instruct" # very effective. NB: if using fine grained access token, make sure it can access gated repos
	# NOTE(review): Qwen is the active default because hf_token is empty and the
	# Llama entry above needs gated-repo access - switch the active line as needed.
	model_name = "Qwen/Qwen2.5-7B-Instruct" # working well now

	st.title("Fine Tuning Testing")
	col1, col2 = st.columns(2)

	# The chat history lives in session state so it survives Streamlit reruns.
	if 'conversation' not in st.session_state:
	    st.session_state.conversation = []
	user_input = st.text_input("You:", "") # user input

	def print_gpu_utilization():
	    """Print the memory currently in use on GPU 0 (basic resource monitoring).

	    The figure is NVML's ``used`` memory for device index 0, reported in MB.
	    On hosts where torch sees no CUDA device the function prints a short note
	    and returns without touching NVML, so this monitoring helper cannot crash
	    a CPU-only run.
	    """
	    if not torch.cuda.is_available():
	        print("GPU memory occupied: n/a (no CUDA device available).")
	        return

	    nvmlInit()
	    try:
	        handle = nvmlDeviceGetHandleByIndex(0)
	        info = nvmlDeviceGetMemoryInfo(handle)
	    finally:
	        # Pair every nvmlInit() with a shutdown so repeated calls do not
	        # keep accumulating NVML initialisations.
	        nvmlShutdown()
	    print(f"GPU memory occupied: {info.used//1024**2} MB.")

	# ---------------------------------------------------------------------------------------
	# MODEL SETUP:
	# ---------------------------------------------------------------------------------------

	@st.cache_resource(show_spinner=False)
	def load_model():
	    """Load the model and tokenizer named by ``model_name`` from Hugging Face.

	    Wrapped in ``st.cache_resource`` so Streamlit reruns reuse the loaded
	    objects. GPU memory usage is printed before and after loading, and a
	    success banner is shown for two seconds once loading finishes.

	    Returns:
	        A ``(model, tokenizer)`` tuple.
	    """
	    print_gpu_utilization()
	    # see https://huggingface.co/mlabonne/FineLlama-3.1-8B for how to run
	    # https://huggingface.co/docs/transformers/main/en/chat_templating look into this to decide on how we do templating
	    success_placeholder = st.empty()

	    # An empty hf_token means "fall back to locally cached credentials"
	    # (e.g. from `huggingface-cli login`), so pass None rather than "".
	    auth_token = hf_token or None

	    with st.spinner("Loading model... please wait"):
	        # torch_dtype / device_map are model-loading options, not tokenizer
	        # options, so the tokenizer is loaded the same way on CPU and GPU.
	        tokenizer = AutoTokenizer.from_pretrained(model_name, token=auth_token)

	        model = AutoModelForCausalLM.from_pretrained(
	            model_name,
	            torch_dtype="auto",
	            device_map="auto",
	            token=auth_token,
	        )

	    # Not using explicit terminators (EOS overrides) at the moment.

	    success_placeholder.success("Model loaded successfully!", icon="🔥")
	    time.sleep(2)  # keep the banner visible briefly before removing it
	    success_placeholder.empty()
	    print_gpu_utilization()
	    return model, tokenizer


	def generate_response():
	    """Query the model for the assistant's next turn.

	    Reads the chat history from ``st.session_state.conversation`` and uses
	    the module-level ``model`` and ``tokenizer``. When the tokenizer has a
	    chat template it is applied; otherwise the history is flattened into
	    plain ``role: content`` lines followed by an ``assistant: `` cue.

	    Returns:
	        The decoded text of the newly generated tokens only (the prompt
	        tokens are sliced off before decoding).
	    """
	    success_placeholder = st.empty()
	    with st.spinner("Thinking..."):

	        # Tokenising the conversation
	        if tokenizer.chat_template:
	            # return_dict=True also yields the attention mask, which is
	            # forwarded to generate() below.
	            inputs = tokenizer.apply_chat_template(
	                st.session_state.conversation,
	                tokenize=True,
	                add_generation_prompt=True,
	                return_dict=True,
	                return_tensors="pt",
	            )
	        else: # base models do not have chat templates
	            print("Assuming base model.")
	            history = "".join(
	                f"{entry['role']}: {entry['content']}\n"
	                for entry in st.session_state.conversation
	            )
	            inputs = tokenizer(history + "assistant: ", return_tensors="pt")

	        inputs = inputs.to(DEVICE)
	        input_ids = inputs["input_ids"]
	        outputs = model.generate(
	            input_ids=input_ids,
	            # .get(): tolerate tokenizers that do not emit an attention mask.
	            attention_mask=inputs.get("attention_mask"),
	            max_new_tokens=512,
	        )
	        # generate() returns prompt + continuation; keep only the new tokens.
	        reply = tokenizer.batch_decode(
	            outputs[:, input_ids.shape[1]:], skip_special_tokens=True
	        )[0]
	        print_gpu_utilization()

	    success_placeholder.success("Response generated!", icon="✅")
	    time.sleep(2)  # keep the banner visible briefly before removing it
	    success_placeholder.empty()
	    return reply

	# ---------------------------------------------------------------------------------------
	# RUNTIME EVENTS:
	# ---------------------------------------------------------------------------------------

	# Load (or fetch from Streamlit's resource cache) the model and tokenizer.
	model, tokenizer = load_model()

	# Submit button to send the query; an empty text box is ignored.
	with col1:
	    if st.button("send") and user_input:
	        st.session_state.conversation.append({"role": "user", "content": user_input})
	        # The user turn is stored first so generate_response() sees it.
	        st.session_state.conversation.append({"role": "assistant", "content": generate_response()})

	# Clear button to reset. This must work even when the text box is empty,
	# so it is deliberately not gated on user_input.
	with col2:
	    if st.button("clear chat"):
	        st.session_state.conversation = []

	# Display conversation history
	for chat in st.session_state.conversation:
	    speaker = "You" if chat['role'] == 'user' else "Assistant"
	    st.write(f"{speaker}: {chat['content']}")