"""Convert a scraped markdown dump into a CSV of fixed-size content chunks.

The input file is read as a sequence of pages.  Each page starts with a
separator line, followed by a URL line, a title line and two further lines
that are skipped (description and keywords).  Everything up to the next
separator is the page body, which is split on level-2 markdown headers and
then cut into chunks of at most ``MAX_CHUNK_CHARS`` characters.
"""

import re

import pandas as pd

# A line consisting solely of this marker (after stripping) starts a new page.
PAGE_SEPARATOR = "--"

# Level-2 markdown headers; the capture group is the header text, so
# ``re.split`` returns [preamble, header, body, header, body, ...].
HEADER_PATTERN = re.compile(r"^## (.+)$", re.MULTILINE)

# Maximum number of characters stored in a single "content" cell.
MAX_CHUNK_CHARS = 6000

# Value written to the "source" column of every row.
SOURCE_NAME = "langchain"

# Column order of the resulting DataFrame / CSV.
ENTRY_COLUMNS = ["title", "url", "source", "content"]


def _header_value(line: str) -> str:
    """Return the text after the first space of a stripped metadata line.

    Yields an empty string when the line has no space (e.g. ``"title:"``) or
    is empty, instead of raising ``IndexError``.
    """
    return line.strip().partition(" ")[2]


def parse_markdown_file(file_path: str) -> pd.DataFrame:
    """Parse the scraped markdown file at *file_path* into a DataFrame.

    Args:
        file_path: Path of the UTF-8 encoded markdown dump.

    Returns:
        A DataFrame with the columns ``title``, ``url``, ``source`` and
        ``content``; one row per content chunk.  The columns are present even
        when no page produced any chunk.

    Raises:
        OSError: If the file cannot be opened.
    """
    entries = []
    with open(file_path, "r", encoding="utf-8") as file:
        # One shared iterator: the header lines are consumed with next()
        # while the for-loop walks the remaining lines.
        lines = iter(file)
        current_url, current_title = "", ""
        content_lines = []
        inside_page = False

        for line in lines:
            if line.strip() == PAGE_SEPARATOR:
                if inside_page:
                    # A new separator closes the previous page.
                    process_content(
                        entries,
                        current_url,
                        current_title,
                        "".join(content_lines),
                    )
                    content_lines = []
                inside_page = True

                # next(..., "") keeps a file that ends in the middle of a
                # page header from raising StopIteration.
                current_url = _header_value(next(lines, ""))
                current_title = _header_value(next(lines, ""))
                # Skip the next two lines (description and keywords).
                next(lines, "")
                next(lines, "")
            elif inside_page:
                content_lines.append(line)

        if inside_page:
            # Flush the last page, which has no trailing separator.
            process_content(
                entries, current_url, current_title, "".join(content_lines)
            )

    return pd.DataFrame(entries, columns=ENTRY_COLUMNS)


def process_content(entries: list, url: str, title: str, content: str) -> None:
    """Split one page body into sections and append their chunks to *entries*.

    Text before the first level-2 header becomes a section named ``"Main"``
    (only if it is not blank); every ``## Header`` starts a section named
    after the header text.

    Args:
        entries: List that receives the row dictionaries (mutated in place).
        url: URL of the page.
        title: Title of the page.
        content: Raw markdown body of the page.
    """
    sections = HEADER_PATTERN.split(content)

    preamble = sections[0]
    # NOTE(review): the startswith("##") guard is kept from the original for
    # output compatibility.  It drops a preamble that begins with e.g. a
    # "###" header, which may not be intended -- confirm.
    if preamble.strip() and not preamble.startswith("##"):
        add_content_section(entries, title, url, "Main", preamble.strip())

    # With one capture group the split result alternates header, body after
    # the preamble, so pairing the two slices walks every headed section.
    for section_header, section_body in zip(sections[1::2], sections[2::2]):
        add_content_section(
            entries, title, url, section_header.strip(), section_body.strip()
        )


def add_content_section(
    entries: list, title: str, url: str, section_title: str, section_text: str
) -> None:
    """Append ``"<section_title>: <section_text>"`` to *entries* in chunks.

    Newlines in the combined text are replaced by spaces, then the text is
    cut into consecutive pieces of at most ``MAX_CHUNK_CHARS`` characters;
    each piece becomes one row dictionary.

    Args:
        entries: List that receives the row dictionaries (mutated in place).
        title: Title of the page the section belongs to.
        url: URL of the page the section belongs to.
        section_title: Name of the section (header text or ``"Main"``).
        section_text: Body text of the section.
    """
    full_section = f"{section_title}: {section_text}".replace("\n", " ")
    entries.extend(
        {
            "title": title,
            "url": url,
            "source": SOURCE_NAME,
            "content": full_section[start : start + MAX_CHUNK_CHARS],
        }
        for start in range(0, len(full_section), MAX_CHUNK_CHARS)
    )


markdown_file_path = "data/langchain_scrape.md"

# Only run the conversion when executed as a script, so importing this module
# does not read or write any files.
if __name__ == "__main__":
    df = parse_markdown_file(markdown_file_path)
    print("Final DataFrame:")
    print(df.head())  # Print the first few rows for verification
    df.to_csv("data/langchain.csv", index=False)