xtrade_bot / archived /data_transform.py
Josh-Ola's picture
Upload folder using huggingface_hub
65976bc verified
raw
history blame contribute delete
No virus
1.44 kB
from datasets import load_dataset
# Default dataset location, passed to transform_raw_data when this module is
# run as a script.
# NOTE(review): the space in "Account _Balance.json" looks unusual - confirm
# it matches the actual file name on disk.
DATA_PATH = "Account_Balance-main/Account Balance Queries/Account _Balance.json"
def dash_line():
    """Print a horizontal rule made of 100 dashes to standard output."""
    rule = "-" * 100
    print(rule)
def transform_raw_data(file_path: str):
    """Load a JSON / JSON Lines dataset and reshape it into role-tagged turns.

    Every record is read for its ``topic`` and ``input`` fields. ``input`` is
    split on newlines; each line containing the ``[|User|]`` marker becomes a
    ``"user"`` turn and each line containing the ``[|AI|]`` marker becomes an
    ``"AI"`` turn, with the marker removed and surrounding whitespace
    stripped. Lines containing neither marker are dropped.

    Args:
        file_path: Path to the dataset file. Must end in ``.json`` or
            ``.jsonl``.

    Returns:
        A list with one dict per record, shaped as
        ``{"topic": ..., "conversation": [{"role": ..., "content": ...}, ...]}``.

    Raises:
        ValueError: If ``file_path`` does not end in ``.json`` or ``.jsonl``.
    """
    # Fail fast with a clear message. Previously an unsupported extension
    # left ``data`` unbound and surfaced later as an UnboundLocalError.
    if not file_path.endswith((".json", ".jsonl")):
        raise ValueError(
            f"Unsupported file type for {file_path!r}; "
            "expected a '.json' or '.jsonl' file."
        )

    dash_line()
    print("Loading Data")
    data = load_dataset("json", data_files=file_path, split="train")

    dash_line()
    print("Transforming Data")
    # Order matters: a line containing both markers is treated as a user
    # turn, because the user marker is checked first.
    role_markers = (
        ("[|User|]", "user"),
        ("[|AI|]", "AI"),
    )
    formatted_data = []
    for feature in data:
        turns = []
        for line in feature["input"].split("\n"):
            for marker, role in role_markers:
                if marker in line:
                    turns.append(
                        {
                            "role": role,
                            "content": line.replace(marker, "").strip(),
                        }
                    )
                    break
        formatted_data.append(
            {"topic": feature["topic"], "conversation": turns}
        )

    dash_line()
    print("Data transformation completed!")
    dash_line()
    return formatted_data
if __name__ == "__main__":
    # Script entry point: transform the default dataset; the result is discarded.
    transform_raw_data(file_path=DATA_PATH)