audit_assistant / auditqa /doc_process.py
ppsingh's picture
Update auditqa/doc_process.py
1de5e6e verified
raw
history blame
No virus
909 Bytes
import glob
import os
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_community.document_loaders import UnstructuredMarkdownLoader
# Directory prefix searched by process_markdown() for "*.md" files. It is
# concatenated directly with the glob pattern, so the trailing slash matters.
path_to_data = "/data/"
# Capture and print the current working directory when the module is imported
# (presumably a debugging aid for checking where the app is running from).
cwd = os.getcwd()
print(cwd)
def process_markdown():
    """Split every Markdown file under ``path_to_data`` into header-based chunks.

    Each ``*.md`` file directly inside ``path_to_data`` is read as raw text and
    passed to a ``MarkdownHeaderTextSplitter`` configured for header levels
    1 through 5. The raw text is used (instead of a document loader) because
    ``split_text`` expects the Markdown source string, not a list of parsed
    ``Document`` objects.

    Files that cannot be read or decoded are reported on stdout and skipped,
    so one bad file does not abort the whole run.

    Returns:
        list: One entry per successfully read file (in sorted path order);
        each entry is the list of chunks produced by ``split_text`` for that
        file. Every chunk's ``metadata["source"]`` holds the originating
        file path. The list is empty when no file could be processed.
    """
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
        ("####", "Header 4"),
        ("#####", "Header 5"),
    ]
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

    # glob returns paths in arbitrary order; sort so results are reproducible.
    files = sorted(glob.glob(path_to_data + "*.md"))

    docs_processed = []
    for file in files:
        try:
            with open(file, encoding="utf-8") as handle:
                text = handle.read()
        except (OSError, UnicodeDecodeError) as e:
            # Best effort: report the unreadable file and keep going.
            print("Exception: ", e)
            continue

        chunks = markdown_splitter.split_text(text)
        for chunk in chunks:
            # Keep track of which file each chunk came from.
            chunk.metadata["source"] = file
        docs_processed.append(chunks)

    print(len(docs_processed))
    # Guard the preview: indexing an empty list would raise IndexError.
    if docs_processed:
        print(docs_processed[0])
    return docs_processed