buster / data /scrapper_to_csv.py
Louis-François Bouchard
Advanced rag course update (#44)
0f06abd unverified
raw
history blame
No virus
2.88 kB
import pandas as pd
import re
def parse_markdown_file(file_path):
    """Parse a scraped markdown dump into a DataFrame of content sections.

    The file is read as a sequence of pages. A line consisting solely of
    ``--`` starts a new page and is followed by four header lines: the URL
    line, the title line, and two further lines that are skipped (the
    description and keywords). For the URL and title lines, everything after
    the first space is kept as the value. Every following line up to the next
    separator belongs to the page body, which is handed to
    ``process_content`` together with the page's URL and title.

    Lines that appear before the first separator are ignored.

    Args:
        file_path: Path of the UTF-8 encoded markdown file to read.

    Returns:
        A ``pandas.DataFrame`` built from the entries collected by
        ``process_content``; it is empty when the file contains no page.
    """
    entries = []
    current_url, current_title = "", ""
    # Collect body lines in a list and join once per page instead of growing
    # a string with ``+=`` for every line.
    content_lines = []
    inside_page = False

    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            if line.strip() == "--":  # Page separator
                if inside_page:
                    # Flush the page that just ended.
                    process_content(
                        entries, current_url, current_title, "".join(content_lines)
                    )
                    content_lines = []
                inside_page = True

                # Header lines are consumed from the same iterator. The
                # default of "" keeps a file that ends inside a header from
                # leaking StopIteration, and partition() yields "" (rather
                # than raising IndexError) when a header has no value.
                current_url = next(file, "").strip().partition(" ")[2]
                current_title = next(file, "").strip().partition(" ")[2]

                # Skip the description and keywords lines.
                next(file, "")
                next(file, "")
            elif inside_page:
                content_lines.append(line)

    if inside_page:
        # Flush the final page, which has no trailing separator.
        process_content(entries, current_url, current_title, "".join(content_lines))

    return pd.DataFrame(entries)
def process_content(entries, url, title, content):
    """Split one page body into sections and record each of them.

    The body is split on level-2 markdown headers (lines of the form
    ``## <heading>``). Non-blank text that precedes the first such header is
    recorded under the section name ``"Main"``; the text following each
    header is recorded under that header's heading.

    Args:
        entries: List that receives the generated records; it is passed on
            to ``add_content_section``.
        url: URL of the page the content belongs to.
        title: Title of the page the content belongs to.
        content: Markdown body of the page.
    """
    header_pattern = re.compile(r"^## (.+)$", re.MULTILINE)

    # The pattern has one capture group, so split() returns
    # [preamble, heading_1, body_1, heading_2, body_2, ...].
    sections = header_pattern.split(content)

    # The preamble can never contain a matched "## " header, so no header
    # check is needed. It may still start with a deeper header such as
    # "### ...", which is real content and must not be dropped.
    preamble = sections[0]
    if preamble.strip():
        add_content_section(entries, title, url, "Main", preamble)

    # Pair every captured heading with the text that follows it.
    for heading, body in zip(sections[1::2], sections[2::2]):
        add_content_section(
            entries,
            title,
            url,
            heading.strip(),
            body.strip().replace("\n", " "),  # Flatten the section to one line
        )
def add_content_section(
    entries,
    title,
    url,
    section_title,
    section_text,
    *,
    source="langchain",
    chunk_size=6000,
):
    """Append one section to ``entries``, split into fixed-size chunks.

    The section is rendered as ``"<section_title>: <section_text>"`` with
    every newline replaced by a space, then cut into consecutive pieces of at
    most ``chunk_size`` characters. One record is appended per piece.

    Args:
        entries: List to extend in place with the generated records.
        title: Page title stored in each record.
        url: Page URL stored in each record.
        section_title: Heading used as the prefix of the rendered section.
        section_text: Text of the section.
        source: Value stored under the ``"source"`` key of each record.
        chunk_size: Maximum number of characters per record.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        # A negative step would silently produce no records at all.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    full_section = f"{section_title}: {section_text}".replace("\n", " ")

    entries.extend(
        {
            "title": title,
            "url": url,
            "source": source,
            "content": full_section[start : start + chunk_size],
        }
        for start in range(0, len(full_section), chunk_size)
    )
# Convert the scraped markdown dump into a CSV file of content sections.
markdown_file_path = "data/langchain_scrape.md"
df = parse_markdown_file(markdown_file_path)

# Show the first few rows so the result can be checked by eye.
print("Final DataFrame:", df.head(), sep="\n")

# Write the sections without the DataFrame's index column.
df.to_csv(path_or_buf="data/langchain.csv", index=False)