from datasets import load_dataset

DATA_PATH = "Account_Balance-main/Account Balance Queries/Account _Balance.json"

# Speaker markers that tag each turn inside a record's raw "input" text.
_USER_MARKER = "[|User|]"
_AI_MARKER = "[|AI|]"

# File extensions transform_raw_data is able to load (matched case-sensitively).
_SUPPORTED_EXTENSIONS = (".json", ".jsonl")


def dash_line():
    """Print a 100-character row of dashes to separate progress messages."""
    print("-" * 100)


def _parse_conversation(raw_text):
    """Split raw dialogue text into a list of ``{"role", "content"}`` turns.

    The text is split on newlines. A line containing the user marker becomes
    a ``"user"`` turn, a line containing the AI marker becomes an ``"AI"``
    turn, and the matched marker is removed before the content is stripped.
    """
    turns = []
    for line in raw_text.split("\n"):
        # The user marker is tested first, so a line that happens to contain
        # both markers is attributed to the user.
        if _USER_MARKER in line:
            role, marker = "user", _USER_MARKER
        elif _AI_MARKER in line:
            role, marker = "AI", _AI_MARKER
        else:
            # Lines without any speaker marker are not part of the output.
            continue
        turns.append({"role": role, "content": line.replace(marker, "").strip()})
    return turns


def transform_raw_data(file_path: str):
    """Load a JSON/JSONL dialogue file and convert it to role-tagged turns.

    The file is loaded with ``load_dataset("json", ..., split="train")``.
    Each record is read through its ``"topic"`` and ``"input"`` keys; the
    ``"input"`` text is parsed line by line into conversation turns.
    Progress messages are printed to stdout along the way.

    Args:
        file_path: Path to the data file; must end with ``.json`` or
            ``.jsonl``.

    Returns:
        A list with one dict per record, each of the form
        ``{"topic": ..., "conversation": [{"role": ..., "content": ...}, ...]}``
        where ``role`` is ``"user"`` or ``"AI"``.

    Raises:
        ValueError: If ``file_path`` does not have a supported extension.
    """
    dash_line()
    print("Loading Data")
    # Fail with a clear message instead of falling through to an unbound
    # ``data`` variable when the extension is not recognised.
    if not file_path.endswith(_SUPPORTED_EXTENSIONS):
        raise ValueError(
            f"Unsupported data file {file_path!r}: expected a path ending in "
            f"one of {_SUPPORTED_EXTENSIONS}"
        )
    data = load_dataset("json", data_files=file_path, split="train")

    dash_line()
    print("Transforming Data")
    formatted_data = [
        {
            "topic": feature["topic"],
            "conversation": _parse_conversation(feature["input"]),
        }
        for feature in data
    ]

    dash_line()
    print("Data transformation completed!")
    dash_line()
    return formatted_data


if __name__ == "__main__":
    transform_raw_data(DATA_PATH)