Spaces:

towardsai-tutors
/

buster

Running

buster / data /preprocess_chunks.py

add preprocess file (#2)

fbf9436 unverified about 1 year ago

No virus

1.41 kB

	import pandas as pd


	def combine(x):
	    """Merge the chunk rows of one group into a single-row DataFrame.

	    Args:
	        x: DataFrame holding at least the columns ``content``, ``source``
	            and ``title``. The script passes one ``groupby("ID")`` group
	            at a time.

	    Returns:
	        A DataFrame with the columns ``content``, ``url``, ``source`` and
	        ``title``. It has exactly one row (index 0) when the group has at
	        least one non-null ``content`` value, and no rows otherwise.
	    """
	    columns = ["content", "url", "source", "title"]

	    # A list-valued ``subset`` is accepted by every pandas release.
	    rows = x.dropna(subset=["content"])

	    if rows.empty:
	        # Nothing left to join; indexing ``unique()[0]`` below would raise
	        # IndexError, so hand back an empty frame with the same schema.
	        return pd.DataFrame(columns=columns)

	    return pd.DataFrame(
	        {
	            "content": " ".join(rows["content"].to_list()),
	            # The original ``source`` column holds the URL; the output
	            # ``source`` column is replaced by a fixed label.
	            "url": rows["source"].unique()[0],
	            "source": "towardsai_blog",
	            "title": rows["title"].unique()[0],
	        },
	        index=[0],
	    )


	# Rebuild one row per "ID" by merging that group's chunks with combine().
	filename = "output.csv"
	df = pd.read_csv(filename)

	# reset_index() turns the group key into an "ID" column and the inner
	# per-group index into "level_1"; only the latter is discarded.
	df_combined = (
	    df.groupby("ID")
	    .apply(func=combine)
	    .reset_index()
	    .drop(columns=["level_1"])
	)
	df_combined.to_csv("chunks_preprocessed_combined.csv", index=False)

	# Naive re-chunking: cut each combined row's content into consecutive
	# pieces of at most MAX_WORDS whitespace-separated words.
	MAX_WORDS = 500
	new_rows = []
	for _, record in df_combined.iterrows():
	    words = record["content"].split()

	    # Stepping by MAX_WORDS visits the start offset of every piece; a row
	    # whose content has no words yields no pieces at all.
	    for offset in range(0, len(words), MAX_WORDS):
	        piece = record.copy()
	        piece["content"] = " ".join(words[offset : offset + MAX_WORDS])
	        new_rows.append(piece)

	# Assemble the split rows into a DataFrame. Every piece copied its parent
	# row's index label, so replace the index with a fresh 0..n-1 range
	# instead of keeping the old labels as a column.
	new_df = pd.DataFrame(new_rows).reset_index(drop=True)

	new_df.to_csv("chunks_preprocessed.csv", index=False)