"""Preprocess raw chat data: render each row with the model's chat template
and save the result as JSONL for fine-tuning."""

import os
from random import randrange  # NOTE(review): only used by a commented-out debug line below
from datetime import datetime
from dotenv import (
    load_dotenv,
    find_dotenv,
    dotenv_values,
    set_key,
)
from datasets import load_dataset
from transformers import AutoTokenizer

# --------------------------------------------------- #
# ----------------- Load env vars ------------------- #
# --------------------------------------------------- #
print("Loading env variables.........................")
_ = load_dotenv(find_dotenv())

CHATBOT_NAME = os.environ.get('CHATBOT_NAME')
ENV_FILE = os.environ.get("ENV_FILE")
MODEL_NAME = os.environ.get("MODEL_NAME")

# ----------------------------------------------- #
# ----------------- Load data ------------------- #
# ----------------------------------------------- #
print("Loading data.........................")
DATA_PATH = 'data/raw_data.json'

if DATA_PATH.endswith((".json", ".jsonl")):
    dataset = load_dataset("json", data_files=DATA_PATH)
else:
    # Fail fast: without this branch, `dataset` would be left unbound and
    # the script would crash later with a confusing NameError.
    raise ValueError(
        f"Unsupported data file '{DATA_PATH}': expected a .json or .jsonl file"
    )

# ----------------------------------------------------- #
# ----------------- Preprocess data ------------------- #
# ----------------------------------------------------- #
print("Cleaning data.........................")

# ----------------- TO DO ------------------ #
# For the prompt, implement the same using   #
# langchain's system / human prompt          #
# templates.                                 #
# ------------------------------------------ #

# ----------------- TO DO ------------------ #
# For the conversational data, adjust the    #
# prompt to have a chain of user-ai query.
# # ------------------------------------------ #

# Tokenizer is used only to render each dataset row through the model's
# chat template (tokenize=False below), not to produce token ids here.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=MODEL_NAME,
    use_fast=True,
)
# Causal-LM bases often ship without a pad token; reuse EOS and pad on
# the right — the conventional setup for fine-tuning.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'


def create_text_row(data):
    """Add a 'text' field to *data*: the user/assistant exchange rendered
    via the tokenizer's chat template.

    Expects the row to carry 'user' and 'response' keys. The raw rows may
    also carry an 'input' key, but it is not used here — TODO confirm
    whether it should be folded into the user message.
    """
    chat_prompt = [
        {
            "role": "user",
            "content": data['user'],
        },
        {
            "role": "assistant",
            "content": data['response'],
        },
    ]
    # add_generation_prompt must be False for training text: the assistant
    # reply is already part of the conversation, so appending an extra
    # generation prompt after it would corrupt the target sequence.
    data["text"] = tokenizer.apply_chat_template(
        chat_prompt,
        tokenize=False,
        add_generation_prompt=False,
    )
    return data


# idx = randrange(len(dataset['train']))
train_dataset = dataset['train'].shuffle(seed=2023).map(create_text_row)

# ----------------------------------------------------------- #
# ----------------- Saving processed data ------------------- #
# ----------------------------------------------------------- #
print("Saving processed data.........................")
now = datetime.now().strftime(format='%Y-%m-%d-%Hh%Mm')
data_cleaned_path = f"data/cleaned/{now}_data_cleaned.jsonl"
train_dataset.to_json(data_cleaned_path)
print(f"\tProcessed data saved to '{data_cleaned_path}'")

# ---------------------------------------------------------- #
# ----------------- Setting new env vars ------------------- #
# ---------------------------------------------------------- #
# Record where the cleaned data landed so downstream steps can find it.
# (The original also read dotenv_values(ENV_FILE) into an unused variable
# followed by a no-op expression statement; that dead code is removed.)
set_key(ENV_FILE, "CLEANED_DATA_PATH", data_cleaned_path)

print("Data processing completed")