import pandas as pd
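
# Overview (added note): this script recombines pre-split article chunks from
# output.csv into one row per article (grouped by "ID"), then naively re-splits
# each article into chunks of at most MAX_WORDS words. The input CSV is assumed
# to contain at least the columns "ID", "title", "content", and "source" (the
# article URL).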


def combine(x):
    # Drop chunk rows whose "content" is missing before joining.
    x = x.dropna(subset=["content"])
    # Merge all chunks of one article into a single row: concatenate the chunk
    # texts and take the url/title from the (identical within a group) values.
    return pd.DataFrame(
        {
            "content": " ".join(x.content.to_list()),
            "url": x.source.unique()[0],
            "source": "towardsai_blog",
            "title": x.title.unique()[0],
        },
        index=[0],
    )


# Recombine the chunks: group by article ID and merge each group into one row
filename = "output.csv"
df = pd.read_csv(filename)
df_combined = df.groupby("ID").apply(combine)
df_combined = df_combined.reset_index()

# Drop the leftover inner index column created by groupby().apply(), then save
# the recombined articles as an intermediate file
df_combined = df_combined.drop(columns=["level_1"])
df_combined.to_csv("chunks_preprocessed_combined.csv", index=False)

# Naively split each article's content back into rows of at most MAX_WORDS words
MAX_WORDS = 500
new_rows = []
for _, row in df_combined.iterrows():
    content = row["content"].split()
    # Ceiling division: number of MAX_WORDS-sized chunks needed for this article
    num_chunks = (len(content) - 1) // MAX_WORDS + 1

    for i in range(num_chunks):
        start_idx = i * MAX_WORDS
        end_idx = (i + 1) * MAX_WORDS
        new_content = " ".join(content[start_idx:end_idx])
        # Copy the article's metadata and replace the content with this chunk
        new_row = row.copy()
        new_row["content"] = new_content
        new_rows.append(new_row)

# Create a new DataFrame from the split rows and discard the duplicated index
new_df = pd.DataFrame(new_rows)
new_df = new_df.reset_index(drop=True)

new_df.to_csv("chunks_preprocessed.csv", index=False)
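
# Optional sanity check (not part of the original script): confirm that no
# chunk exceeds MAX_WORDS words after the naive split.
assert new_df["content"].str.split().str.len().max() <= MAX_WORDS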