import glob
import os

from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    SentenceTransformersTokenTextSplitter,
)
from langchain_community.document_loaders import PyMuPDFLoader
from transformers import AutoTokenizer

path_to_data = "./data/"


def process_pdf():
    """Load the configured PDF reports and build a tokenizer-based text splitter.

    Each entry of the hard-coded ``files`` mapping (label -> PDF path under
    ``path_to_data``) is loaded with ``PyMuPDFLoader``; the loaded documents
    are stored in ``docs`` under the same label. A file that fails to load is
    reported on stdout and simply has no entry in ``docs``.

    A ``RecursiveCharacterTextSplitter`` is then created from the
    "BAAI/bge-small-en-v1.5" Hugging Face tokenizer with ``chunk_size`` 256
    and an overlap of one tenth of that (25).

    NOTE(review): in the section visible here neither ``docs`` nor
    ``text_splitter`` is returned or used afterwards -- confirm against the
    rest of the function.
    """
    # Build the paths from path_to_data so the data directory is defined in
    # one place; with the current value these resolve to the same
    # './data/<name>.pdf' strings as before.
    files = {
        'ABC': os.path.join(path_to_data, 'MWTS2021.pdf'),
        'XYZ': os.path.join(path_to_data, 'Consolidated2021.pdf'),
    }
    docs = {}
    for file, value in files.items():
        try:
            docs[file] = PyMuPDFLoader(value).load()
        except Exception as e:
            # Deliberately best-effort: one unreadable PDF must not prevent
            # the remaining files from being loaded.
            print("Exception: ", e)

    # text splitter based on the tokenizer of a model of your choosing
    # to make texts fit exactly a transformer's context window size
    # langchain text splitters: https://python.langchain.com/docs/modules/data_connection/document_transformers/
    chunk_size = 256
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5"),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        # Tried in order, from coarsest (paragraph break) to finest.
        separators=["\n\n", "\n", ".", " ", ""],
    )