{"cells":[{"cell_type":"code","execution_count":null,"metadata":{"_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","trusted":true},"outputs":[],"source":["from huggingface_hub import hf_hub_download\n","import pandas as pd\n","\n","REPO_ID = \"Respair/sharegpt_chatml_compressed\" \n","FILENAME = \"data/train-00000-of-00003-a1a2fbd77d72a58e.parquet\"\n","\n","df1 = pd.read_parquet(\n"," hf_hub_download(repo_id=REPO_ID, filename=FILENAME, repo_type=\"dataset\")\n",")\n","df1.head()"]},{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":["from huggingface_hub import hf_hub_download\n","import pandas as pd\n","\n","REPO_ID = \"Respair/sharegpt_chatml_compressed\" \n","FILENAME = \"data/train-00001-of-00003-b34be86ae61c30fe.parquet\"\n","\n","df2 = pd.read_parquet(\n"," hf_hub_download(repo_id=REPO_ID, filename=FILENAME, repo_type=\"dataset\")\n",")\n","df2.head()"]},{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":["import pandas as pd\n","\n","def process_text(text):\n"," if 'Assistant:' in text:\n"," text = text.split(\"Assistant:\", 1)[1].strip()\n"," return text.rstrip('\\'\" ').strip()\n","\n","df1_processed = pd.DataFrame({\n"," 'original': df1['processed_messages'].apply(process_text),\n"," 'compressed': df1['round'].apply(process_text)\n","})\n","\n","df2_processed = pd.DataFrame({\n"," 'original': df2['processed_messages'].apply(process_text),\n"," 'compressed': df2['round'].apply(process_text)\n","})\n","\n","df3_processed = pd.DataFrame({\n"," 'original': df3['processed_messages'].apply(process_text),\n"," 'compressed': df3['round'].apply(process_text)\n","})\n","\n","final_df = pd.concat([df1_processed, df2_processed, df3_processed], ignore_index=True)\n","\n","final_df"]},{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":["from huggingface_hub import hf_hub_download\n","import pandas as pd\n","\n","REPO_ID = \"diwank/llmlingua-compressed-text\" \n","FILENAME = \"data/test-00000-of-00001.parquet\"\n","\n","df4 = pd.read_parquet(\n"," hf_hub_download(repo_id=REPO_ID, filename=FILENAME, repo_type=\"dataset\")\n",")\n","df4.head()"]},{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":["from huggingface_hub import hf_hub_download\n","import pandas as pd\n","\n","REPO_ID = \"diwank/llmlingua-compressed-text\" \n","FILENAME = \"data/train-00000-of-00001.parquet\"\n","\n","df5 = pd.read_parquet(\n"," hf_hub_download(repo_id=REPO_ID, filename=FILENAME, repo_type=\"dataset\")\n",")\n","df5.head()"]},{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":["import pandas as pd\n","\n","def process_text(text):\n"," if '#' in text:\n"," text = text.split(\"#\", 1)[1].strip() # Get the part after 'Assistant:'\n"," return text.rstrip('\\'\" ').strip() # Remove trailing single/double quotes, spaces\n","\n","# Apply the function to the processed_messages and round columns of each DataFrame\n","df4_processed = pd.DataFrame({\n"," 'original': df4['original'].apply(process_text),\n"," 'compressed': df4['compressed'].apply(process_text)\n","})\n","\n","df5_processed = pd.DataFrame({\n"," 'original': df5['original'].apply(process_text),\n"," 'compressed': df5['compressed'].apply(process_text)\n","})\n","\n","# Concatenate the three processed DataFrames into one\n","final_df2 = pd.concat([df4_processed, df5_processed], ignore_index=True)\n","\n","final_df2"]},{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":["saved_df = pd.concat([df1_processed, df2_processed, df3_processed, df4_processed, df5_processed], ignore_index=True)\n","\n","# Save to CSV\n","saved_df.to_csv('processed_data.csv', index=False)\n","\n","# Save to Parquet\n","saved_df.to_parquet('processed_data.parquet', index=False)\n","\n","print(\"DataFrames processed and saved successfully!\")\n","\n","saved_df"]},{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":["from huggingface_hub import hf_hub_download\n","import pandas as pd\n","\n","REPO_ID = \"AlexMaclean/wikipedia-deletion-compressions\" \n","FILENAME = \"wikipedia_compressions.jsonl\"\n","\n","df6 = pd.read_json(\n"," hf_hub_download(repo_id=REPO_ID, filename=FILENAME, repo_type=\"dataset\"), lines=True)\n","df6.head()"]},{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":["from huggingface_hub import hf_hub_download\n","import pandas as pd\n","\n","REPO_ID = \"AlexMaclean/all-deletion-compressions\" \n","FILENAME = \"google_compressions.jsonl\"\n","\n","df7 = pd.read_json(\n"," hf_hub_download(repo_id=REPO_ID, filename=FILENAME, repo_type=\"dataset\"), lines=True)\n","df7.head()"]},{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":["df6_processed = pd.DataFrame({\n"," 'original': df6['before'].apply(process_text),\n"," 'compressed': df6['after'].apply(process_text)\n","})\n","\n","df7_processed = pd.DataFrame({\n"," 'original': df7['before'].apply(process_text),\n"," 'compressed': df7['after'].apply(process_text)\n","})\n","\n","saved_df = pd.concat([df1_processed, df2_processed, df3_processed, df4_processed, df5_processed, df6_processed, df7_processed], ignore_index=True)\n","\n","# Save to CSV\n","saved_df.to_csv('processed_data.csv', index=False)\n","\n","# Save to Parquet\n","saved_df.to_parquet('processed_data.parquet', index=False)\n","\n","print(\"DataFrames processed and saved successfully!\")\n","\n","saved_df"]},{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":["from huggingface_hub import hf_hub_download\n","import pandas as pd\n","\n","REPO_ID = \"sentence-transformers/sentence-compression\" \n","FILENAME = \"pair/train-00000-of-00001.parquet\"\n","\n","df8 = pd.read_parquet(\n"," hf_hub_download(repo_id=REPO_ID, filename=FILENAME, repo_type=\"dataset\"))\n","df8.head()"]},{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":["df8_processed = pd.DataFrame({\n"," 'original': df8['text'].apply(process_text),\n"," 'compressed': df8['simplified'].apply(process_text)\n","})\n","\n","saved_df = pd.concat([df1_processed, df2_processed, df3_processed, df4_processed, df5_processed, df6_processed, df7_processed, df8_processed], ignore_index=True)\n","\n","# Save to CSV\n","saved_df.to_csv('processed_data.csv', index=False)\n","\n","# Save to Parquet\n","saved_df.to_parquet('processed_data.parquet', index=False)\n","\n","print(\"DataFrames processed and saved successfully!\")\n","\n","saved_df"]},{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":["!huggingface-cli login --token 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"]},{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":["from huggingface_hub import HfApi\n","\n","# Specify the file path and dataset repository name\n","file_path = \"/kaggle/working/processed_data.parquet\" # Local path to your file\n","repo_id = \"aoxo/token_compressor\" # Hugging Face dataset repo\n","\n","# Upload the Parquet file to the Hugging Face dataset repository\n","api = HfApi()\n","api.upload_file(\n"," path_or_fileobj=file_path,\n"," path_in_repo=\"processed_data.parquet\", # This is the path in the Hugging Face repo\n"," repo_id=repo_id,\n"," repo_type=\"dataset\"\n",")"]},{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":["!pip install accelerate peft bitsandbytes transformers trl\n","import torch\n","from datasets import load_dataset, Dataset\n","from peft import LoraConfig, AutoPeftModelForCausalLM\n","from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments\n","from trl import SFTTrainer\n","import os"]},{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":["model_id=\"unsloth/Meta-Llama-3.1-8B-bnb-4bit\""]},{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":["def get_model_and_tokenizer(model_id):\n"," tokenizer = AutoTokenizer.from_pretrained(model_id)\n"," tokenizer.pad_token = tokenizer.eos_token\n"," bnb_config = BitsAndBytesConfig(\n"," load_in_4bit=True, bnb_4bit_quant_type=\"nf4\", bnb_4bit_compute_dtype=\"float16\", bnb_4bit_use_double_quant=True\n"," )\n"," model = AutoModelForCausalLM.from_pretrained(\n"," model_id, quantization_config=bnb_config, device_map=\"auto\"\n"," )\n"," model.config.use_cache=False\n"," model.config.pretraining_tp=1\n"," return model, tokenizer"]},{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":["model, tokenizer = get_model_and_tokenizer(model_id)"]},{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":["from transformers import GenerationConfig\n","from time import perf_counter\n","def generate_response(user_input):\n"," prompt = formatted_prompt(user_input)\n"," inputs = tokenizer([prompt], return_tensors=\"pt\")\n"," generation_config = GenerationConfig(penalty_alpha=0.6,do_sample = True,\n"," top_k=5,temperature=0.5,repetition_penalty=1.2,\n"," max_new_tokens=60,pad_token_id=tokenizer.eos_token_id\n"," )\n"," start_time = perf_counter()\n"," inputs = tokenizer(prompt, return_tensors=\"pt\").to('cuda')\n"," outputs = model.generate(**inputs, generation_config=generation_config)\n"," theresponse = (tokenizer.decode(outputs[0], skip_special_tokens=True))\n"," print(tokenizer.decode(outputs[0], skip_special_tokens=True))\n"," output_time = perf_counter() - start_time\n"," print(f\"Time taken for inference: {round(output_time,2)} seconds\")"]},{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":["def formatted_prompt(question)-> str:\n"," return f\"<|im_start|>user\\n{question}<|im_end|>\\n<|im_start|>assistant:\""]},{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":["output_model=\"llama-token-compressor\""]},{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":["def formatted_train(input,response)->str:\n"," return f\"<|im_start|>user\\n{input}<|im_end|>\\n<|im_start|>assistant\\n{response}<|im_end|>\\n\""]},{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":["import pandas as pd\n","def prepare_train_datav2(data):\n"," # Convert the data to a Pandas DataFrame\n"," data_df = pd.read_parquet(data)\n"," # Create a new column called \"text\"\n"," data_df[\"text\"] = data_df[[\"original\", \"compressed\"]].apply(lambda x: \"<|im_start|>user\\n\" + x[\"original\"] + \" <|im_end|>\\n<|im_start|>assistant\\n\" + x[\"compressed\"] + \"<|im_end|>\\n\", axis=1)\n"," # Create a new Dataset from the DataFrame\n"," data = Dataset.from_pandas(data_df)\n"," return data"]},{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":["from huggingface_hub import hf_hub_download\n","\n","REPO_ID = \"aoxo/token_compressor\" \n","FILENAME = \"processed_data.parquet\"\n","\n","df = pd.read_parquet(\n"," hf_hub_download(repo_id=REPO_ID, filename=FILENAME, repo_type=\"dataset\")\n",")\n","\n","df.to_parquet('processed_data.parquet', index=False)\n","\n","data = prepare_train_datav2('/kaggle/working/processed_data.parquet')"]},{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":["peft_config = LoraConfig(\n"," init_lora_weights=\"gaussian\", r=8, lora_alpha=16, lora_dropout=0.05, bias=\"none\", task_type=\"CAUSAL_LM\"\n"," )"]},{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":["import re\n","pattern = r'\\((\\w+)\\): Linear'\n","linear_layers = re.findall(pattern, str(model.modules))\n","target_modules = list(set(linear_layers))\n","print(target_modules)"]},{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":["from huggingface_hub import notebook_login\n","notebook_login()"]},{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":["training_arguments = TrainingArguments(\n"," output_dir=output_model,\n"," per_device_train_batch_size=4,\n"," gradient_accumulation_steps=16,\n"," optim=\"paged_adamw_8bit\",\n"," learning_rate=2e-4,\n"," lr_scheduler_type=\"cosine\",\n"," save_strategy=\"epoch\",\n"," logging_steps=10,\n"," num_train_epochs=10,\n"," max_steps=175118,\n"," bf16=True,\n"," push_to_hub=True\n"," )"]},{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":["trainer = SFTTrainer(\n"," model=model,\n"," train_dataset=data,\n"," peft_config=peft_config,\n"," dataset_text_field=\"original\",\n"," args=training_arguments,\n"," tokenizer=tokenizer,\n"," packing=False,\n"," max_seq_length=512\n"," )\n","\n","trainer.train()"]},{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":["!zip -r token_compressor.zip /kaggle/working/llama-token-compressor"]}],"metadata":{"kaggle":{"accelerator":"gpu","dataSources":[],"dockerImageVersionId":30761,"isGpuEnabled":true,"isInternetEnabled":true,"language":"python","sourceType":"notebook"},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.14"}},"nbformat":4,"nbformat_minor":4}