# buster/markdown_parser.py
import json
import os

import pandas as pd
import tiktoken
from langchain.text_splitter import MarkdownHeaderTextSplitter
from tqdm import tqdm

def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
    """Count the tokens in `string` under the given tiktoken encoding."""
    encoding = tiktoken.get_encoding(encoding_name)
    return len(encoding.encode(string))
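
# Illustrative usage (token counts vary by encoding; with cl100k_base,
# "hello world" encodes to two tokens):
#
#     num_tokens_from_string("hello world")  # -> 2
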
def drop_outlier_chunks(
    df: pd.DataFrame, max_tokens_by_chunk: int = 4500
) -> pd.DataFrame:
    """Drop chunks with abnormally high token counts; they usually contain lots of links."""
    token_counts = df.content.apply(num_tokens_from_string)
    filtered_df = df[token_counts < max_tokens_by_chunk]
    outliers_df = df[token_counts >= max_tokens_by_chunk]
    print(f"Dropping {len(outliers_df)} outlier chunks")
    print(f"Dropped outliers: {outliers_df.content.to_list()}")
    return filtered_df
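
# Illustrative usage (hypothetical data): rows whose `content` exceeds the
# token budget are dropped.
#
#     df = pd.DataFrame({"content": ["short chunk", "very long chunk ..."]})
#     df = drop_outlier_chunks(df, max_tokens_by_chunk=100)
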
def find_md_files(folder_path):
"""Recursively find .md files, extract content and use filename as title."""
md_files = []
for root, _, files in os.walk(folder_path):
for file in files:
if file.endswith(".md"):
file_path = os.path.join(root, file)
title = os.path.splitext(file)[0]
                # Remove the trailing junk: the exporter appends a hash as the last word of the filename.
title = " ".join(title.split()[:-1])
with open(file_path, "r", encoding="utf-8") as md_file:
content = md_file.read()
md_files.append({"title": title, "content": content})
return md_files
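
# Each returned record pairs the cleaned filename title with the raw markdown
# body, e.g. (hypothetical):
#
#     {"title": "Intro to LangChain", "content": "# Intro ..."}
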
def split_string_by_max_words(input_string, max_words):
    """Split a string into pieces of at most `max_words` whitespace-delimited words."""
    words = input_string.split()
    return [" ".join(words[i : i + max_words]) for i in range(0, len(words), max_words)]
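
# For example:
#
#     split_string_by_max_words("a b c d e", max_words=2)
#     # -> ["a b", "c d", "e"]
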
def get_title_link_from_md_title(md_title: str, title_link_data: list[dict]):
    """Return the (title, link) pair whose normalized title contains `md_title`."""
    for data in title_link_data:
        title = data["title"]
        if md_title in title.replace(":", "").replace(".", " ").replace("? ", ""):
            return data["title"], data["link"]
    # Default back to the course link if no match is found...
    print("\nNot found: ", md_title)
    return md_title, "https://learn.activeloop.ai/courses/langchain/"
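
# title_link_langchaincourse.json (loaded below) is expected to hold a list of
# dicts pairing course titles with their URLs, e.g. (hypothetical entry):
#
#     [{"title": "Intro to LangChain", "link": "https://learn.activeloop.ai/..."}]
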
if __name__ == "__main__":
folder_path = "/path/to/folder/with/md_content/"
folder_path = "/Users/jeremypinto/Downloads/d22d1e98-345f-490d-870e-3b082938741c_Export-0a33c13f-6d42-4a94-8f23-7459e7b2c024"
md_files = find_md_files(folder_path)
    # Split on H1 and H2 headers; naming each level by its own marker ("#", "##")
    # lets the headers be re-attached verbatim to each split below.
    headers_to_split_on = [
        ("#", "#"),
        ("##", "##"),
    ]
markdown_splitter = MarkdownHeaderTextSplitter(
headers_to_split_on=headers_to_split_on
)
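    # Each split is a Document: .page_content holds the section body and
    # .metadata maps the header names above ("#", "##") to their text.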
chunks = []
with open("title_link_langchaincourse.json", "r") as f:
title_link_data = json.load(f)
for md_file in tqdm(md_files):
md_title = md_file["title"]
md_raw_content = md_file["content"]
md_header_splits = markdown_splitter.split_text(md_raw_content)
title, link = get_title_link_from_md_title(
md_title, title_link_data=title_link_data
)
for split in md_header_splits:
            # Re-attach the section headers to the content of each split.
            headers = "\n".join(f"{k} {v}" for k, v in split.metadata.items())
            # Cap each split at 600 words so chunks stay near the token budget enforced below.
            substrings = split_string_by_max_words(split.page_content, max_words=600)
for substring in substrings:
chunk = {
"title": title,
"content": headers + "\n" + substring,
"source": "TAI Course",
"url": link,
}
chunks.append(chunk)
df = pd.DataFrame(chunks)
df = drop_outlier_chunks(df, max_tokens_by_chunk=2000)
print(f"Exported {len(df)} chunks from {len(md_files)} articles.")
df.to_csv("langchain_course.csv")