import glob
import os

from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_text_splitters import MarkdownHeaderTextSplitter

path_to_data = "/data/"

# The original used the IPython magic `%pwd`, which is a syntax error in
# plain Python; os.getcwd() prints the same working directory portably.
print(os.getcwd())


def process_markdown():
    """Load every ``*.md`` file under ``path_to_data`` and split it by headers.

    Each file is loaded with ``UnstructuredMarkdownLoader``; the text of every
    resulting document is then split on markdown headers (levels 1-5) with
    ``MarkdownHeaderTextSplitter``. The number of processed documents and the
    chunks of the first one (if any) are printed.

    Files that fail to load are reported on stdout and skipped, so a single
    bad file does not abort the whole run.

    Returns:
        A list with one entry per loaded document; each entry is the list of
        chunks produced by ``split_text`` for that document. Empty when no
        file was found or every load failed.
    """
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
        ("####", "Header 4"),
        ("#####", "Header 5"),
    ]
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on
    )

    # os.path.join works whether or not path_to_data ends with a separator;
    # sorting makes the processing order (and the preview below) deterministic.
    files = sorted(glob.glob(os.path.join(path_to_data, "*.md")))

    docs = []
    for file in files:
        try:
            # load() returns a list of documents, so extend (not append) to
            # keep `docs` a flat list of documents rather than a list of lists.
            docs.extend(UnstructuredMarkdownLoader(file).load())
        except Exception as e:
            # Best-effort: report the failure and continue with other files.
            print("Exception: ", e)

    # split_text() expects the markdown text itself, not a document object.
    # NOTE(review): the loader may strip the leading '#' markers from the
    # text it returns; if the header metadata comes out empty, read the raw
    # file contents instead -- confirm against real data.
    docs_processed = [
        markdown_splitter.split_text(doc.page_content) for doc in docs
    ]

    print(len(docs_processed))
    # Guard the preview: indexing [0] on an empty result raised IndexError.
    if docs_processed:
        print(docs_processed[0])
    return docs_processed