# buster/markdown_parser.py
import json
import os

import pandas as pd
import tiktoken
from langchain.text_splitter import MarkdownHeaderTextSplitter
from tqdm import tqdm

def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
    """Count the tokens in `string` under the given tiktoken encoding."""
    encoding = tiktoken.get_encoding(encoding_name)
    return len(encoding.encode(string))
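
# Illustrative usage (token counts vary by encoding; with cl100k_base,
# "hello world" encodes to two tokens):
#
#     num_tokens_from_string("hello world")  # -> 2
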
def drop_outlier_chunks(
    df: pd.DataFrame, max_tokens_by_chunk: int = 4500
) -> pd.DataFrame:
    """Drop chunks with abnormally high token counts; they usually contain lots of links."""
    token_counts = df.content.apply(num_tokens_from_string)
    filtered_df = df[token_counts < max_tokens_by_chunk]
    outliers_df = df[token_counts >= max_tokens_by_chunk]
    print(f"Dropping {len(outliers_df)} outlier chunks")
    print(f"Dropped outliers: {outliers_df.content.to_list()}")
    return filtered_df
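
# Illustrative usage (hypothetical data): rows whose `content` exceeds the
# token budget are dropped.
#
#     df = pd.DataFrame({"content": ["short chunk", "very long chunk ..."]})
#     df = drop_outlier_chunks(df, max_tokens_by_chunk=100)
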
def find_md_files(folder_path):
"""Recursively find .md files, extract content and use filename as title."""
md_files = []
for root, _, files in os.walk(folder_path):
for file in files:
if file.endswith(".md"):
file_path = os.path.join(root, file)
title = os.path.splitext(file)[0]
                # Remove the trailing junk: the exporter appends a hash as the last word of the filename.
title = " ".join(title.split()[:-1])
with open(file_path, "r", encoding="utf-8") as md_file:
content = md_file.read()
md_files.append({"title": title, "content": content})
return md_files
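
# Each returned record pairs the cleaned filename title with the raw markdown
# body, e.g. (hypothetical):
#
#     {"title": "Intro to LangChain", "content": "# Intro ..."}
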
def split_string_by_max_words(input_string, max_words):
    """Split a string into pieces of at most `max_words` whitespace-delimited words."""
    words = input_string.split()
    return [" ".join(words[i : i + max_words]) for i in range(0, len(words), max_words)]
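
# For example:
#
#     split_string_by_max_words("a b c d e", max_words=2)
#     # -> ["a b", "c d", "e"]
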
def get_title_link_from_md_title(md_title: str, title_link_data: list[dict]):
    """Return the (title, link) pair whose normalized title contains `md_title`."""
    for data in title_link_data:
        title = data["title"]
        if md_title in title.replace(":", "").replace(".", " ").replace("? ", ""):
            return data["title"], data["link"]
    # Default back to the course link if no match is found...
    print("\nNot found: ", md_title)
    return md_title, "https://learn.activeloop.ai/courses/langchain/"
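
# title_link_langchaincourse.json (loaded below) is expected to hold a list of
# dicts pairing course titles with their URLs, e.g. (hypothetical entry):
#
#     [{"title": "Intro to LangChain", "link": "https://learn.activeloop.ai/..."}]
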
if __name__ == "__main__":
folder_path = "/path/to/folder/with/md_content/"
folder_path = "/Users/jeremypinto/Downloads/d22d1e98-345f-490d-870e-3b082938741c_Export-0a33c13f-6d42-4a94-8f23-7459e7b2c024"
md_files = find_md_files(folder_path)
    # Split on H1 and H2 headers; naming each level by its own marker ("#", "##")
    # lets the headers be re-attached verbatim to each split below.
    headers_to_split_on = [
        ("#", "#"),
        ("##", "##"),
    ]
markdown_splitter = MarkdownHeaderTextSplitter(
headers_to_split_on=headers_to_split_on
)
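    # Each split is a Document: .page_content holds the section body and
    # .metadata maps the header names above ("#", "##") to their text.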
chunks = []
with open("title_link_langchaincourse.json", "r") as f:
title_link_data = json.load(f)
for md_file in tqdm(md_files):
md_title = md_file["title"]
md_raw_content = md_file["content"]
md_header_splits = markdown_splitter.split_text(md_raw_content)
title, link = get_title_link_from_md_title(
md_title, title_link_data=title_link_data
)
for split in md_header_splits:
            # Re-attach the section headers to the content of each split.
            headers = "\n".join(f"{k} {v}" for k, v in split.metadata.items())
            # Cap each split at 600 words so chunks stay near the token budget enforced below.
            substrings = split_string_by_max_words(split.page_content, max_words=600)
for substring in substrings:
chunk = {
"title": title,
"content": headers + "\n" + substring,
"source": "TAI Course",
"url": link,
}
chunks.append(chunk)
df = pd.DataFrame(chunks)
df = drop_outlier_chunks(df, max_tokens_by_chunk=2000)
print(f"Exported {len(df)} chunks from {len(md_files)} articles.")
df.to_csv("langchain_course.csv")