thebigoed committed on
Commit
c41146d
1 Parent(s): 143b53b

updated app

README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: gray
 colorTo: pink
 sdk: streamlit
 sdk_version: 1.38.0
-app_file: app.py
+app_file: ./app/fine_tuning_app.py
 pinned: false
 license: apache-2.0
 ---
__init__.py ADDED
File without changes
data/__init__.py ADDED
File without changes
data/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (133 Bytes)
 
data/__pycache__/fine_tune_dataset.cpython-310.pyc ADDED
Binary file (1.13 kB)
 
data/fine_tune_dataset.py ADDED
@@ -0,0 +1,31 @@
+# import torch
+# from trl import SFTTrainer
+from datasets import load_dataset
+# from transformers import TrainingArguments, TextStreamer
+from unsloth.chat_templates import get_chat_template
+# from unsloth import FastLanguageModel, is_bfloat16_supported
+
+
+def load_data(dataset, tokenizer, samples=None):
+    print("Loading finetuning dataset.")
+
+    # Base models don't have chat templates so we can choose any - ChatML is popular
+    tokenizer = get_chat_template(tokenizer,
+                                  mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},
+                                  chat_template="chatml",
+                                  )
+
+    def apply_template(examples):
+        # Ensure ShareGPT-format datasets are parsed into the format we want
+        messages = examples["conversations"]
+        text = [tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=False) for message in messages]
+        return {"text": text}
+
+
+    if samples is not None:
+        # Reduce the training load by only training on a subset
+        dataset = load_dataset(dataset, split=f"train[:{int(samples)}]")
+    else:
+        dataset = load_dataset(dataset, split="train")
+
+    return dataset.map(apply_template, batched=True)
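
For reference, a minimal sketch of how `load_data` can be driven, assuming an unsloth model/tokenizer pair is already loaded as in `scripts/finetune.py`; the model name, dataset name, and sample count below are illustrative, not enforced anywhere in the repo.

```python
# Illustrative usage of data/fine_tune_dataset.py (example values only).
from unsloth import FastLanguageModel
from data.fine_tune_dataset import load_data

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-bnb-4bit",  # same base model as scripts/finetune.py
    max_seq_length=2048,
    load_in_4bit=True,
)

# samples=1000 keeps the run light by taking only the first 1000 rows of the train split.
dataset = load_data("mlabonne/FineTome-100k", tokenizer, samples=1000)
print(dataset[0]["text"])  # one ChatML-formatted conversation string
```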
fine_tuning_app.py ADDED
@@ -0,0 +1,126 @@
+""" fine_tuning_app.py
+
+Running a basic chatbot app that can compare base and fine-tuned models from Hugging Face.
+
+Note:
+    - run using: streamlit run fine_tuning_app.py
+    - use `free -h` then `sudo sysctl vm.drop_caches=2` to free cache space, but this can mess up the venv
+    - may need to run `huggingface-cli login` in the terminal to enable access to the model
+    - or see: https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/discussions/130 for the above
+    - Hugging Face can use up a lot of disk space - cd ~/.cache/huggingface/hub then rm -rf <subdir>
+
+"""
+
+import streamlit as st
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import transformers
+import time
+import torch
+from pynvml import *  # needs a restart of the IDE to install, from nvidia-ml-py3
+
+# ---------------------------------------------------------------------------------------
+# GENERAL SETUP:
+# ---------------------------------------------------------------------------------------
+
+DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+hf_token = ""
+# model_name = "thebigoed/PreFineLlama-3.1-8B"  # works badly as it does not know the chat structure
+# model_name = "unsloth/Meta-Llama-3.1-8B-bnb-4bit"  # this is what we were fine-tuning - also bad without chat instruct
+model_name = "Qwen/Qwen2.5-7B-Instruct"  # working well now
+# model_name = "meta-llama/Meta-Llama-3-8B-Instruct"  # very effective. NB: if using a fine-grained access token, make sure it can access gated repos
+st.title("Fine Tuning Testing")
+col1, col2 = st.columns(2)
+if 'conversation' not in st.session_state:
+    st.session_state.conversation = []
+user_input = st.text_input("You:", "")  # user input
+
+def print_gpu_utilization():
+    # Used for basic resource monitoring.
+    nvmlInit()
+    handle = nvmlDeviceGetHandleByIndex(0)
+    info = nvmlDeviceGetMemoryInfo(handle)
+    print(f"GPU memory occupied: {info.used//1024**2} MB.")
+
+# ---------------------------------------------------------------------------------------
+# MODEL SETUP:
+# ---------------------------------------------------------------------------------------
+
+@st.cache_resource(show_spinner=False)
+def load_model():
+    """ Load model from Hugging Face. """
+    print_gpu_utilization()
+    # see https://huggingface.co/mlabonne/FineLlama-3.1-8B for how to run
+    # https://huggingface.co/docs/transformers/main/en/chat_templating - look into this to decide how we do templating
+    success_placeholder = st.empty()
+    with st.spinner("Loading model... please wait"):
+        if str(DEVICE) == "cuda:0":  # may not need this, need to test on CPU if device map is okay anyway
+            tokenizer = AutoTokenizer.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
+        else:
+            tokenizer = AutoTokenizer.from_pretrained(model_name, torch_dtype="auto")
+
+        model = AutoModelForCausalLM.from_pretrained(model_name,
+                                                     torch_dtype="auto",
+                                                     device_map="auto"
+                                                     )
+
+        # Not using terminators at the moment
+        # terminator = tokenizer.eos_token if tokenizer.eos_token else "<|endoftext|>"
+
+    success_placeholder.success("Model loaded successfully!", icon="🔥")
+    time.sleep(2)
+    success_placeholder.empty()
+    print_gpu_utilization()
+    return model, tokenizer
+
+
+def generate_response():
+    """ Query the model. """
+
+    success_placeholder = st.empty()
+    with st.spinner("Thinking..."):
+
+        # Tokenising the conversation
+        if tokenizer.chat_template:
+            text = tokenizer.apply_chat_template(st.session_state.conversation, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(DEVICE)
+        else:  # base models do not have chat templates
+            print("Assuming base model.")
+            model_input = ""
+            for entry in st.session_state.conversation:
+                model_input += f"{entry['role']}: {entry['content']}\n"
+            text = tokenizer(model_input + "assistant: ", return_tensors="pt")["input_ids"].to(DEVICE)
+        outputs = model.generate(text,
+                                 max_new_tokens=512,
+                                 )
+        outputs = tokenizer.batch_decode(outputs[:, text.shape[1]:], skip_special_tokens=True)[0]
+        print_gpu_utilization()
+
+    success_placeholder.success("Response generated!", icon="✅")
+    time.sleep(2)
+    success_placeholder.empty()
+    return outputs
+
+# ---------------------------------------------------------------------------------------
+# RUNTIME EVENTS:
+# ---------------------------------------------------------------------------------------
+
+model, tokenizer = load_model()
+
+# Submit button to send the query
+with col1:
+    if st.button("send"):
+        if user_input:
+            st.session_state.conversation.append({"role": "user", "content": user_input})
+            st.session_state.conversation.append({"role": "assistant", "content": generate_response()})
+
+# Clear button to reset
+with col2:
+    if st.button("clear chat"):
+        if user_input:
+            st.session_state.conversation = []
+
+# Display conversation history
+for chat in st.session_state.conversation:
+    if chat['role'] == 'user':
+        st.write(f"You: {chat['content']}")
+    else:
+        st.write(f"Assistant: {chat['content']}")
models/__init__.py ADDED
File without changes
requirements.txt CHANGED
@@ -1,3 +1,11 @@
-streamlit
 transformers
-torch
+torch
+unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git
+xformers
+trl
+bitsandbytes
+peft
+accelerate
+streamlit
+nvidia-ml-py3
+huggingface_hub[cli]
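
The updated requirements are unpinned, so a throwaway import check is an easy way to confirm the environment resolves; a sketch only (unsloth is left out to keep the check lightweight, and `pynvml` is the module provided by the nvidia-ml-py3 package).

```python
# Quick import check for the updated requirements.txt.
import torch, transformers, trl, peft, bitsandbytes, accelerate, xformers, streamlit
from pynvml import nvmlInit            # provided by nvidia-ml-py3
from huggingface_hub import login      # huggingface_hub[cli]

print(torch.__version__, transformers.__version__)
```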
scripts/__init__.py ADDED
File without changes
scripts/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (136 Bytes)
 
scripts/__pycache__/finetune.cpython-310.pyc ADDED
Binary file (1.95 kB)
 
scripts/finetune.py ADDED
@@ -0,0 +1,86 @@
+# run as a module using: python3 -m scripts.finetune
+
+# Using: https://huggingface.co/blog/mlabonne/sft-llama3
+
+import torch
+from trl import SFTTrainer
+from datasets import load_dataset
+from transformers import TrainingArguments, TextStreamer
+from unsloth.chat_templates import get_chat_template
+from unsloth import FastLanguageModel, is_bfloat16_supported
+
+from data.fine_tune_dataset import load_data
+
+def finetune(model="unsloth/Meta-Llama-3.1-8B-bnb-4bit", dataset="mlabonne/FineTome-100k"):
+
+    hf_token = ""
+
+    # Loading the model and restricting the context window
+    max_seq_length = 2048
+    model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name=model,
+        max_seq_length=max_seq_length,
+        load_in_4bit=True,
+        dtype=None,
+    )
+
+    # Loading the prepared dataset
+    dataset = load_data(dataset, tokenizer)
+
+    # Loading the model for fine-tuning - only set to fine-tune 42 million of the 8 billion parameters
+    model = FastLanguageModel.get_peft_model(
+        model,
+        r=16,  # rank determines the LoRA (low-rank adaptation - freezing much of the model for fine-tuning) matrix size; higher increases memory and compute cost
+        lora_alpha=16,  # scaling factor for updates
+        lora_dropout=0,  # not used, for speedup
+        target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"],  # where LoRA targets
+        use_rslora=True,  # rank stabilised
+        use_gradient_checkpointing="unsloth"
+    )
+
+    # Saving the untrained model; save_method can be "lora" to only save adapters, or merged (16 or 4 bit)
+    model.save_pretrained_merged("models/PreFineLlama-3.1-8B", tokenizer, save_method="merged_16bit")  # save to the models directory locally
+    model.push_to_hub_merged("thebigoed/PreFineLlama-3.1-8B", tokenizer, token=hf_token, save_method="merged_16bit")
+
+    trainer = SFTTrainer(
+        model=model,
+        tokenizer=tokenizer,
+        train_dataset=dataset,
+        dataset_text_field="text",
+        max_seq_length=max_seq_length,
+        dataset_num_proc=2,
+        packing=True,
+        args=TrainingArguments(
+            learning_rate=3e-4,  # too low = slow and local minima, too high = unstable
+            lr_scheduler_type="linear",  # adjusts the learning rate (linear and cosine are the most popular)
+            per_device_train_batch_size=8,
+            gradient_accumulation_steps=2,
+            num_train_epochs=1,
+            fp16=not is_bfloat16_supported(),
+            bf16=is_bfloat16_supported(),
+            logging_steps=1,
+            optim="adamw_8bit",
+            weight_decay=0.01,
+            warmup_steps=10,
+            output_dir="output",
+            seed=0,
+        ),
+    )
+
+    trainer.train()
+
+    # Saving the model; save_method can be "lora" to only save adapters, or merged (16 or 4 bit)
+    model.save_pretrained_merged("models/FineLlama-3.1-8B", tokenizer, save_method="merged_16bit")  # save to the models directory locally
+    model.push_to_hub_merged("thebigoed/FineLlama-3.1-8B", tokenizer, token=hf_token, save_method="merged_16bit")
+
+    # Use to save in GGUF quantised format
+    # quant_methods = ["q2_k", "q3_k_m", "q4_k_m", "q5_k_m", "q6_k", "q8_0"]
+    # for quant in quant_methods:
+    #     model.push_to_hub_gguf("", tokenizer, quant)
+
+    return
+
+if __name__ == "__main__":
+    finetune()
+
+
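
For reference, `finetune()` can also be called with explicit arguments; a minimal sketch using the same keyword interface defined above (the values shown are just the existing defaults spelled out).

```python
# Run from the repo root so the data/ package resolves:
#   python3 -m scripts.finetune
# Or call the function directly with explicit arguments (these are the defaults):
from scripts.finetune import finetune

finetune(
    model="unsloth/Meta-Llama-3.1-8B-bnb-4bit",  # an unsloth 4-bit base model
    dataset="mlabonne/FineTome-100k",            # expects a ShareGPT-style "conversations" column
)
```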
app.py → telco_app.py RENAMED
@@ -1,6 +1,11 @@
 import streamlit as st
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import time
+import torch
+from pynvml import *  # needs a restart of the IDE to install, from nvidia-ml-py3
+
+# Get device
+DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
 # Streamlit setup
 st.title("Telco Chat Bot")
@@ -11,16 +16,30 @@ if 'conversation' not in st.session_state:
     st.session_state.conversation = []
 user_input = st.text_input("You:", "")  # user input
 
+# Resource monitoring:
+def print_gpu_utilization():
+    nvmlInit()
+    handle = nvmlDeviceGetHandleByIndex(0)
+    info = nvmlDeviceGetMemoryInfo(handle)
+    print(f"GPU memory occupied: {info.used//1024**2} MB.")
+
 
 # Model functions:
 @st.cache_resource(show_spinner=False)
 def load_model():
     """ Load model from Hugging face."""
+    print_gpu_utilization()
     success_placeholder = st.empty()
     with st.spinner("Loading model... please wait"):
-        model_name = "AliMaatouk/TinyLlama-1.1B-Tele"  # Replace with the correct model name
-        tokenizer = AutoTokenizer.from_pretrained(model_name, torch_dtype="auto")
-        model = AutoModelForCausalLM.from_pretrained(model_name)
+        #model_name = "AliMaatouk/TinyLlama-1.1B-Tele"  # Replace with the correct model name
+        #model_name = "AliMaatouk/LLama-3-8B-Tele-it"
+        model_name = "AliMaatouk/Gemma-2B-Tele"
+        if str(DEVICE) == "cuda:0":  # may not need this, need to test on CPU if device map is okay anyway
+            tokenizer = AutoTokenizer.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
+        else:
+            tokenizer = AutoTokenizer.from_pretrained(model_name, torch_dtype="auto")
+        model = AutoModelForCausalLM.from_pretrained(model_name).to(DEVICE)
+
         success_placeholder.success("Model loaded successfully!", icon="🔥")
         time.sleep(2)
         success_placeholder.empty()
@@ -30,14 +49,16 @@ def generate_response(user_input):
     """ Query the model. """
     success_placeholder = st.empty()
     with st.spinner("Thinking..."):
-        inputs = tokenizer(user_input, return_tensors="pt")
+        inputs = tokenizer(user_input, return_tensors="pt").to(DEVICE)
         #outputs = model.generate(**inputs, max_length=1000, pad_token_id=tokenizer.eos_token_id)
-        outputs = model.generate(**inputs, max_new_tokens=100)
+        outputs = model.generate(**inputs, max_new_tokens=750)
+        print_gpu_utilization()
         generated_tokens = outputs[0, len(inputs['input_ids'][0]):]
         success_placeholder.success("Response generated!", icon="✅")
         time.sleep(2)
         success_placeholder.empty()
-        return tokenizer.decode(generated_tokens, skip_special_tokens=True)
+        text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
+        return text
 
 # RUNTIME EVENTS:
 
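
Both apps now share the same NVML-based memory check; a standalone sketch of those calls is below, which assumes an NVIDIA driver is present (it will raise an NVML error on CPU-only machines).

```python
# Standalone version of the print_gpu_utilization helper added to both apps.
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo

nvmlInit()                              # must be called once before any other NVML query
handle = nvmlDeviceGetHandleByIndex(0)  # first visible GPU
info = nvmlDeviceGetMemoryInfo(handle)
print(f"GPU memory occupied: {info.used // 1024**2} MB.")
```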