File size: 909 Bytes
44f0ae2
1de5e6e
44f0ae2
 
4ac53ed
1de5e6e
 
44f0ae2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import glob
import os
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_community.document_loaders import UnstructuredMarkdownLoader
# Directory that process_markdown() globs for "*.md" files.
path_to_data = "/data/"
# Capture the current working directory and print it when the module is loaded.
cwd = os.getcwd()
print(cwd)
def process_markdown():
    """Load every ``*.md`` file under ``path_to_data`` and split it by headers.

    Each file is loaded with ``UnstructuredMarkdownLoader``; the text of every
    resulting document is then split with ``MarkdownHeaderTextSplitter`` on
    header levels 1-5. The number of processed documents is printed, followed
    by the chunks of the first document when there is at least one.

    Files that fail to load are reported on stdout and skipped, so a single
    bad file does not abort the whole run.
    """
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
        ("####", "Header 4"),
        ("#####", "Header 5"),
    ]
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

    # os.path.join keeps the pattern valid whether or not path_to_data ends
    # with a path separator.
    files = glob.glob(os.path.join(path_to_data, "*.md"))
    docs = []
    for file in files:
        try:
            # load() returns a list of Document objects; extend() keeps
            # ``docs`` a flat list of documents instead of a list of lists.
            docs.extend(UnstructuredMarkdownLoader(file).load())
        except Exception as e:
            # Deliberate best-effort: report the failure and continue.
            print("Exception: ", e)

    # split_text() operates on a string, so hand it the document text rather
    # than the Document object itself.
    # NOTE(review): the Unstructured loader may strip the leading "#" markers
    # from headings; if the splitter finds no headers, feed it the raw file
    # text instead -- confirm against real data.
    docs_processed = [markdown_splitter.split_text(doc.page_content) for doc in docs]
    print(len(docs_processed))
    # Nothing to preview when no file matched or every load failed.
    if docs_processed:
        print(docs_processed[0])