thebigoed committed on
Commit
c41146d
1 Parent(s): 143b53b

updated app

README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: gray
 colorTo: pink
 sdk: streamlit
 sdk_version: 1.38.0
-app_file: app.py
+app_file: ./app/fine_tuning_app.py
 pinned: false
 license: apache-2.0
 ---
__init__.py ADDED
File without changes
data/__init__.py ADDED
File without changes
data/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (133 Bytes)
 
data/__pycache__/fine_tune_dataset.cpython-310.pyc ADDED
Binary file (1.13 kB)
 
data/fine_tune_dataset.py ADDED
@@ -0,0 +1,31 @@
+# import torch
+# from trl import SFTTrainer
+from datasets import load_dataset
+# from transformers import TrainingArguments, TextStreamer
+from unsloth.chat_templates import get_chat_template
+# from unsloth import FastLanguageModel, is_bfloat16_supported
+
+
+def load_data(dataset, tokenizer, samples=None):
+    print("Loading finetuning dataset.")
+
+    # Base models don't have chat templates so we can choose any - ChatML is popular
+    tokenizer = get_chat_template(tokenizer,
+                                  mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},
+                                  chat_template="chatml",
+                                  )
+
+    def apply_template(examples):
+        # Ensure ShareGPT-format datasets are parsed into the format we want
+        messages = examples["conversations"]
+        text = [tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=False) for message in messages]
+        return {"text": text}
+
+
+    if samples is not None:
+        # Reduce the training load by only training on a subset
+        dataset = load_dataset(dataset, split=f"train[:{int(samples)}]")
+    else:
+        dataset = load_dataset(dataset, split="train")
+
+    return dataset.map(apply_template, batched=True)
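
For reference, a minimal sketch of how `load_data` can be driven, assuming an unsloth model/tokenizer pair is already loaded as in `scripts/finetune.py`; the model name, dataset name, and sample count below are illustrative, not enforced anywhere in the repo.

```python
# Illustrative usage of data/fine_tune_dataset.py (example values only).
from unsloth import FastLanguageModel
from data.fine_tune_dataset import load_data

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-bnb-4bit",  # same base model as scripts/finetune.py
    max_seq_length=2048,
    load_in_4bit=True,
)

# samples=1000 keeps the run light by taking only the first 1000 rows of the train split.
dataset = load_data("mlabonne/FineTome-100k", tokenizer, samples=1000)
print(dataset[0]["text"])  # one ChatML-formatted conversation string
```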
fine_tuning_app.py ADDED
@@ -0,0 +1,126 @@
+""" fine_tuning_app.py
+
+Running a basic chatbot app that can compare base and fine-tuned models from Hugging Face.
+
+Note:
+    - run using: streamlit run fine_tuning_app.py
+    - use `free -h` then `sudo sysctl vm.drop_caches=2` to free cache space, but this can mess up the venv
+    - may need to run `huggingface-cli login` in the terminal to enable access to the model
+    - or see: https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/discussions/130 for the above
+    - Hugging Face can use up a lot of disk space - cd ~/.cache/huggingface/hub then rm -rf <subdir>
+
+"""
+
+import streamlit as st
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import transformers
+import time
+import torch
+from pynvml import *  # needs a restart of the IDE to install, from nvidia-ml-py3
+
+# ---------------------------------------------------------------------------------------
+# GENERAL SETUP:
+# ---------------------------------------------------------------------------------------
+
+DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+hf_token = ""
+# model_name = "thebigoed/PreFineLlama-3.1-8B"  # works badly as it does not know the chat structure
+# model_name = "unsloth/Meta-Llama-3.1-8B-bnb-4bit"  # this is what we were fine-tuning - also bad without chat instruct
+model_name = "Qwen/Qwen2.5-7B-Instruct"  # working well now
+# model_name = "meta-llama/Meta-Llama-3-8B-Instruct"  # very effective. NB: if using a fine-grained access token, make sure it can access gated repos
+st.title("Fine Tuning Testing")
+col1, col2 = st.columns(2)
+if 'conversation' not in st.session_state:
+    st.session_state.conversation = []
+user_input = st.text_input("You:", "")  # user input
+
+def print_gpu_utilization():
+    # Used for basic resource monitoring.
+    nvmlInit()
+    handle = nvmlDeviceGetHandleByIndex(0)
+    info = nvmlDeviceGetMemoryInfo(handle)
+    print(f"GPU memory occupied: {info.used//1024**2} MB.")
+
+# ---------------------------------------------------------------------------------------
+# MODEL SETUP:
+# ---------------------------------------------------------------------------------------
+
+@st.cache_resource(show_spinner=False)
+def load_model():
+    """ Load model from Hugging Face. """
+    print_gpu_utilization()
+    # see https://huggingface.co/mlabonne/FineLlama-3.1-8B for how to run
+    # https://huggingface.co/docs/transformers/main/en/chat_templating - look into this to decide how we do templating
+    success_placeholder = st.empty()
+    with st.spinner("Loading model... please wait"):
+        if str(DEVICE) == "cuda:0":  # may not need this, need to test on CPU if device map is okay anyway
+            tokenizer = AutoTokenizer.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
+        else:
+            tokenizer = AutoTokenizer.from_pretrained(model_name, torch_dtype="auto")
+
+        model = AutoModelForCausalLM.from_pretrained(model_name,
+                                                     torch_dtype="auto",
+                                                     device_map="auto"
+                                                     )
+
+        # Not using terminators at the moment
+        # terminator = tokenizer.eos_token if tokenizer.eos_token else "<|endoftext|>"
+
+    success_placeholder.success("Model loaded successfully!", icon="🔥")
+    time.sleep(2)
+    success_placeholder.empty()
+    print_gpu_utilization()
+    return model, tokenizer
+
+
+def generate_response():
+    """ Query the model. """
+
+    success_placeholder = st.empty()
+    with st.spinner("Thinking..."):
+
+        # Tokenising the conversation
+        if tokenizer.chat_template:
+            text = tokenizer.apply_chat_template(st.session_state.conversation, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(DEVICE)
+        else:  # base models do not have chat templates
+            print("Assuming base model.")
+            model_input = ""
+            for entry in st.session_state.conversation:
+                model_input += f"{entry['role']}: {entry['content']}\n"
+            text = tokenizer(model_input + "assistant: ", return_tensors="pt")["input_ids"].to(DEVICE)
+        outputs = model.generate(text,
+                                 max_new_tokens=512,
+                                 )
+        outputs = tokenizer.batch_decode(outputs[:, text.shape[1]:], skip_special_tokens=True)[0]
+        print_gpu_utilization()
+
+    success_placeholder.success("Response generated!", icon="✅")
+    time.sleep(2)
+    success_placeholder.empty()
+    return outputs
+
+# ---------------------------------------------------------------------------------------
+# RUNTIME EVENTS:
+# ---------------------------------------------------------------------------------------
+
+model, tokenizer = load_model()
+
+# Submit button to send the query
+with col1:
+    if st.button("send"):
+        if user_input:
+            st.session_state.conversation.append({"role": "user", "content": user_input})
+            st.session_state.conversation.append({"role": "assistant", "content": generate_response()})
+
+# Clear button to reset
+with col2:
+    if st.button("clear chat"):
+        if user_input:
+            st.session_state.conversation = []
+
+# Display conversation history
+for chat in st.session_state.conversation:
+    if chat['role'] == 'user':
+        st.write(f"You: {chat['content']}")
+    else:
+        st.write(f"Assistant: {chat['content']}")
models/__init__.py ADDED
File without changes
requirements.txt CHANGED
@@ -1,3 +1,11 @@
-streamlit
 transformers
-torch
+torch
+unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git
+xformers
+trl
+bitsandbytes
+peft
+accelerate
+streamlit
+nvidia-ml-py3
+huggingface_hub[cli]
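
The updated requirements are unpinned, so a throwaway import check is an easy way to confirm the environment resolves; a sketch only (unsloth is left out to keep the check lightweight, and `pynvml` is the module provided by the nvidia-ml-py3 package).

```python
# Quick import check for the updated requirements.txt.
import torch, transformers, trl, peft, bitsandbytes, accelerate, xformers, streamlit
from pynvml import nvmlInit            # provided by nvidia-ml-py3
from huggingface_hub import login      # huggingface_hub[cli]

print(torch.__version__, transformers.__version__)
```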
scripts/__init__.py ADDED
File without changes
scripts/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (136 Bytes)
 
scripts/__pycache__/finetune.cpython-310.pyc ADDED
Binary file (1.95 kB)
 
scripts/finetune.py ADDED
@@ -0,0 +1,86 @@
+# run as a module using: python3 -m scripts.finetune
+
+# Using: https://huggingface.co/blog/mlabonne/sft-llama3
+
+import torch
+from trl import SFTTrainer
+from datasets import load_dataset
+from transformers import TrainingArguments, TextStreamer
+from unsloth.chat_templates import get_chat_template
+from unsloth import FastLanguageModel, is_bfloat16_supported
+
+from data.fine_tune_dataset import load_data
+
+def finetune(model="unsloth/Meta-Llama-3.1-8B-bnb-4bit", dataset="mlabonne/FineTome-100k"):
+
+    hf_token = ""
+
+    # Loading the model and restricting the context window
+    max_seq_length = 2048
+    model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name=model,
+        max_seq_length=max_seq_length,
+        load_in_4bit=True,
+        dtype=None,
+    )
+
+    # Loading the prepared dataset
+    dataset = load_data(dataset, tokenizer)
+
+    # Loading the model for fine-tuning - only set to fine-tune 42 million of the 8 billion parameters
+    model = FastLanguageModel.get_peft_model(
+        model,
+        r=16,  # rank determines the LoRA (low-rank adaptation - freezing much of the model for fine-tuning) matrix size; higher increases memory and compute cost
+        lora_alpha=16,  # scaling factor for updates
+        lora_dropout=0,  # not used, for speedup
+        target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"],  # where LoRA targets
+        use_rslora=True,  # rank stabilised
+        use_gradient_checkpointing="unsloth"
+    )
+
+    # Saving the untrained model; save_method can be "lora" to only save adapters, or merged (16 or 4 bit)
+    model.save_pretrained_merged("models/PreFineLlama-3.1-8B", tokenizer, save_method="merged_16bit")  # save to the models directory locally
+    model.push_to_hub_merged("thebigoed/PreFineLlama-3.1-8B", tokenizer, token=hf_token, save_method="merged_16bit")
+
+    trainer = SFTTrainer(
+        model=model,
+        tokenizer=tokenizer,
+        train_dataset=dataset,
+        dataset_text_field="text",
+        max_seq_length=max_seq_length,
+        dataset_num_proc=2,
+        packing=True,
+        args=TrainingArguments(
+            learning_rate=3e-4,  # too low = slow and local minima, too high = unstable
+            lr_scheduler_type="linear",  # adjusts the learning rate (linear and cosine are the most popular)
+            per_device_train_batch_size=8,
+            gradient_accumulation_steps=2,
+            num_train_epochs=1,
+            fp16=not is_bfloat16_supported(),
+            bf16=is_bfloat16_supported(),
+            logging_steps=1,
+            optim="adamw_8bit",
+            weight_decay=0.01,
+            warmup_steps=10,
+            output_dir="output",
+            seed=0,
+        ),
+    )
+
+    trainer.train()
+
+    # Saving the model; save_method can be "lora" to only save adapters, or merged (16 or 4 bit)
+    model.save_pretrained_merged("models/FineLlama-3.1-8B", tokenizer, save_method="merged_16bit")  # save to the models directory locally
+    model.push_to_hub_merged("thebigoed/FineLlama-3.1-8B", tokenizer, token=hf_token, save_method="merged_16bit")
+
+    # Use to save in GGUF quantised format
+    # quant_methods = ["q2_k", "q3_k_m", "q4_k_m", "q5_k_m", "q6_k", "q8_0"]
+    # for quant in quant_methods:
+    #     model.push_to_hub_gguf("", tokenizer, quant)
+
+    return
+
+if __name__ == "__main__":
+    finetune()
+
+
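
For reference, `finetune()` can also be called with explicit arguments; a minimal sketch using the same keyword interface defined above (the values shown are just the existing defaults spelled out).

```python
# Run from the repo root so the data/ package resolves:
#   python3 -m scripts.finetune
# Or call the function directly with explicit arguments (these are the defaults):
from scripts.finetune import finetune

finetune(
    model="unsloth/Meta-Llama-3.1-8B-bnb-4bit",  # an unsloth 4-bit base model
    dataset="mlabonne/FineTome-100k",            # expects a ShareGPT-style "conversations" column
)
```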
app.py → telco_app.py RENAMED
@@ -1,6 +1,11 @@
 import streamlit as st
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import time
+import torch
+from pynvml import *  # needs a restart of the IDE to install, from nvidia-ml-py3
+
+# Get device
+DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
 # Streamlit setup
 st.title("Telco Chat Bot")
@@ -11,16 +16,30 @@ if 'conversation' not in st.session_state:
     st.session_state.conversation = []
 user_input = st.text_input("You:", "")  # user input
 
+# Resource monitoring:
+def print_gpu_utilization():
+    nvmlInit()
+    handle = nvmlDeviceGetHandleByIndex(0)
+    info = nvmlDeviceGetMemoryInfo(handle)
+    print(f"GPU memory occupied: {info.used//1024**2} MB.")
+
 
 # Model functions:
 @st.cache_resource(show_spinner=False)
 def load_model():
     """ Load model from Hugging face."""
+    print_gpu_utilization()
     success_placeholder = st.empty()
     with st.spinner("Loading model... please wait"):
-        model_name = "AliMaatouk/TinyLlama-1.1B-Tele"  # Replace with the correct model name
-        tokenizer = AutoTokenizer.from_pretrained(model_name, torch_dtype="auto")
-        model = AutoModelForCausalLM.from_pretrained(model_name)
+        #model_name = "AliMaatouk/TinyLlama-1.1B-Tele"  # Replace with the correct model name
+        #model_name = "AliMaatouk/LLama-3-8B-Tele-it"
+        model_name = "AliMaatouk/Gemma-2B-Tele"
+        if str(DEVICE) == "cuda:0":  # may not need this, need to test on CPU if device map is okay anyway
+            tokenizer = AutoTokenizer.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
+        else:
+            tokenizer = AutoTokenizer.from_pretrained(model_name, torch_dtype="auto")
+        model = AutoModelForCausalLM.from_pretrained(model_name).to(DEVICE)
+
         success_placeholder.success("Model loaded successfully!", icon="🔥")
         time.sleep(2)
         success_placeholder.empty()
@@ -30,14 +49,16 @@ def generate_response(user_input):
     """ Query the model. """
     success_placeholder = st.empty()
     with st.spinner("Thinking..."):
-        inputs = tokenizer(user_input, return_tensors="pt")
+        inputs = tokenizer(user_input, return_tensors="pt").to(DEVICE)
         #outputs = model.generate(**inputs, max_length=1000, pad_token_id=tokenizer.eos_token_id)
-        outputs = model.generate(**inputs, max_new_tokens=100)
+        outputs = model.generate(**inputs, max_new_tokens=750)
+        print_gpu_utilization()
         generated_tokens = outputs[0, len(inputs['input_ids'][0]):]
         success_placeholder.success("Response generated!", icon="✅")
         time.sleep(2)
         success_placeholder.empty()
-        return tokenizer.decode(generated_tokens, skip_special_tokens=True)
+        text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
+        return text
 
 # RUNTIME EVENTS:
 
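
Both apps now share the same NVML-based memory check; a standalone sketch of those calls is below, which assumes an NVIDIA driver is present (it will raise an NVML error on CPU-only machines).

```python
# Standalone version of the print_gpu_utilization helper added to both apps.
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo

nvmlInit()                              # must be called once before any other NVML query
handle = nvmlDeviceGetHandleByIndex(0)  # first visible GPU
info = nvmlDeviceGetMemoryInfo(handle)
print(f"GPU memory occupied: {info.used // 1024**2} MB.")
```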