File size: 2,880 Bytes
0f06abd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import pandas as pd
import re


def parse_markdown_file(file_path):
    """Parse a scraped markdown dump into a DataFrame of content chunks.

    The file is read as a sequence of pages.  A page starts at a line that
    contains only ``--`` and is followed by four header lines: a URL line and
    a title line (for each, only the text after the first space is kept),
    then two more lines (description and keywords) that are skipped.  All
    lines up to the next separator form the page body, which is passed to
    ``process_content`` to be split into sections.  Text that appears before
    the first separator is ignored.

    A header that is cut short by the end of the file, or a URL/title line
    without a value, yields an empty string instead of raising.

    Args:
        file_path: Path of the UTF-8 encoded markdown file to read.

    Returns:
        A ``pandas.DataFrame`` built from the entries collected by
        ``process_content``.
    """
    entries = []
    with open(file_path, "r", encoding="utf-8") as file:
        lines = iter(file)
        current_url, current_title = "", ""
        body_lines = []  # joined once per page instead of repeated "+="
        inside_page = False

        for line in lines:
            if line.strip() == "--":  # Page separator
                if inside_page:
                    # Flush the page that just ended.
                    process_content(
                        entries, current_url, current_title, "".join(body_lines)
                    )
                    body_lines = []

                inside_page = True
                # Header lines look like "<label> <value>"; keep the value.
                # next(..., "") tolerates a header truncated by end of file,
                # and partition() tolerates a line that has no value part.
                current_url = next(lines, "").strip().partition(" ")[2]
                current_title = next(lines, "").strip().partition(" ")[2]
                # Skip the next two lines (description and keywords).
                next(lines, None)
                next(lines, None)

            elif inside_page:
                body_lines.append(line)

        if inside_page:
            # Flush the last page, which has no trailing separator.
            process_content(
                entries, current_url, current_title, "".join(body_lines)
            )

    return pd.DataFrame(entries)


def process_content(entries, url, title, content):
    """Split a page body on ``## `` headings and record each section.

    Text that precedes the first ``## `` heading is recorded under the
    section title ``"Main"``.  Every heading then contributes one section
    made of its heading text and the body that follows it, with newlines in
    the body replaced by spaces.  Sections are appended to ``entries`` via
    ``add_content_section``.

    Args:
        entries: List that receives the generated entry dicts (mutated).
        url: URL of the page the content belongs to.
        title: Title of the page the content belongs to.
        content: Raw markdown body of the page.
    """
    # Only level-2 headings ("## Heading") start a new section.
    header_pattern = re.compile(r"^## (.+)$", re.MULTILINE)

    # Splitting on a pattern with one capture group yields
    # [preamble, header1, body1, header2, body2, ...].
    sections = header_pattern.split(content)

    # The preamble can never contain a matched header, so it is kept whenever
    # it has any text -- even if it begins with a deeper heading such as
    # "### Sub", which the pattern above does not treat as a section break.
    preamble = sections[0].strip()
    if preamble:
        add_content_section(entries, title, url, "Main", preamble)

    # Walk the (header, body) pairs that follow the preamble.
    for header, body in zip(sections[1::2], sections[2::2]):
        section_text = body.strip().replace("\n", " ")
        add_content_section(entries, title, url, header.strip(), section_text)


def add_content_section(
    entries,
    title,
    url,
    section_title,
    section_text,
    *,
    source="langchain",
    chunk_size=6000,
):
    """Append one section to ``entries``, split into fixed-size chunks.

    The section is rendered as ``"<section_title>: <section_text>"`` with
    every newline replaced by a space, then cut into consecutive pieces of at
    most ``chunk_size`` characters.  One dict per piece is appended.

    Args:
        entries: List that receives the entry dicts (mutated in place).
        title: Page title stored under the ``"title"`` key.
        url: Page URL stored under the ``"url"`` key.
        section_title: Heading text placed in front of the section body.
        section_text: Body text of the section.
        source: Value stored under the ``"source"`` key.
        chunk_size: Maximum number of characters per entry; must be positive.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        # A negative step would make range() empty and silently drop the text.
        raise ValueError("chunk_size must be a positive integer")

    full_section = f"{section_title}: {section_text}".replace("\n", " ")
    entries.extend(
        {
            "title": title,
            "url": url,
            "source": source,
            "content": full_section[start : start + chunk_size],
        }
        for start in range(0, len(full_section), chunk_size)
    )


# Script body: parse the scraped markdown dump and export the chunks as CSV.
markdown_file_path = "data/langchain_scrape.md"
df = parse_markdown_file(markdown_file_path)

# Show the first few rows so the result can be checked by eye.
print("Final DataFrame:", df.head(), sep="\n")

# Write every row to disk without the DataFrame index column.
df.to_csv("data/langchain.csv", index=False)