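"""Split exported course markdown files into chunks for retrieval.

Walks a folder of exported .md files, splits each on markdown headers, caps
chunk length by word count, attaches title/link metadata from a JSON lookup,
drops abnormally long chunks by token count, and writes everything to CSV.
"""
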
import json
import os

import pandas as pd
import tiktoken
from langchain.text_splitter import MarkdownHeaderTextSplitter
from tqdm import tqdm


def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
    """Return the number of tokens in `string` under the given tiktoken encoding."""
    encoding = tiktoken.get_encoding(encoding_name)
    return len(encoding.encode(string))
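
# A quick sanity check (the exact count is encoding-specific; the number below
# assumes tiktoken's cl100k_base tokenizer, under which "hello world" encodes
# to two tokens):
# >>> num_tokens_from_string("hello world")
# 2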


def drop_outlier_chunks(df: pd.DataFrame, max_tokens_by_chunk: int = 4500):
    """Drop chunks with abnormally high token counts; these usually contain lots of links."""
    # Compute token counts once and reuse them for both masks.
    token_counts = df.content.apply(num_tokens_from_string)
    filtered_df = df[token_counts < max_tokens_by_chunk]
    outliers_df = df[token_counts >= max_tokens_by_chunk]
    print(f"Dropping {len(df) - len(filtered_df)} outlier chunks")
    print(f"Dropped outliers: {outliers_df.content.to_list()}")
    return filtered_df


def find_md_files(folder_path):
    """Recursively find .md files, extract content and use filename as title."""
    md_files = []

    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".md"):
                file_path = os.path.join(root, file)
                title = os.path.splitext(file)[0]
                # Remove the trailing junk (the last word is some kind of hash)
                title = " ".join(title.split()[:-1])
                with open(file_path, "r", encoding="utf-8") as md_file:
                    content = md_file.read()
                md_files.append({"title": title, "content": content})

    return md_files


def split_string_by_max_words(input_string, max_words):
    """Split a string into consecutive pieces of at most `max_words` words each."""
    words = input_string.split()
    return [" ".join(words[i : i + max_words]) for i in range(0, len(words), max_words)]
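
# For example:
# >>> split_string_by_max_words("a b c d e", max_words=2)
# ['a b', 'c d', 'e']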


def get_title_link_from_md_title(md_title: str, title_link_data: list):
    """Look up the full title and link whose normalized title contains `md_title`."""
    for data in title_link_data:
        title = data["title"]
        # Strip punctuation from the catalogue title so it can match the
        # filename-derived markdown title.
        if md_title in title.replace(":", "").replace(".", " ").replace("? ", ""):
            return data["title"], data["link"]
    # Fall back to the course link if no match is found.
    print("\nNot found: ", md_title)
    return md_title, "https://learn.activeloop.ai/courses/langchain/"
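
# Expected shape of title_link_langchaincourse.json, inferred from the lookup
# above rather than from a documented schema; the entries shown are illustrative:
# [{"title": "Intro to LangChain", "link": "https://..."}, ...]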


if __name__ == "__main__":
    folder_path = "/path/to/folder/with/md_content/"

    md_files = find_md_files(folder_path)

    # The second element of each tuple is the metadata key the splitter records
    # the header under; using the literal "#"/"##" makes it easy to prepend the
    # headers back onto each chunk later.
    headers_to_split_on = [
        ("#", "#"),
        ("##", "##"),
    ]

    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on
    )

    chunks = []

    with open("title_link_langchaincourse.json", "r") as f:
        title_link_data = json.load(f)

    for md_file in tqdm(md_files):
        md_title = md_file["title"]
        md_raw_content = md_file["content"]
        md_header_splits = markdown_splitter.split_text(md_raw_content)
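        # Each split carries its headers in metadata, keyed by the names set in
        # headers_to_split_on, e.g. {"#": "Intro", "##": "Setup"}.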

        title, link = get_title_link_from_md_title(
            md_title, title_link_data=title_link_data
        )

        for split in md_header_splits:
            # Prepend the headers back onto the chunk content.
            headers = "\n".join(k + " " + v for k, v in split.metadata.items())

            substrings = split_string_by_max_words(split.page_content, max_words=600)
            for substring in substrings:
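                # One output row per sub-chunk; "source" and "url" travel with
                # the text, presumably so downstream consumers can cite where a
                # chunk came from.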
                chunk = {
                    "title": title,
                    "content": headers + "\n" + substring,
                    "source": "TAI Course",
                    "url": link,
                }
                chunks.append(chunk)

    df = pd.DataFrame(chunks)

    df = drop_outlier_chunks(df, max_tokens_by_chunk=2000)

    df.to_csv("langchain_course.csv", index=False)
    print(f"Exported {len(df)} chunks from {len(md_files)} articles.")