ppsingh committed on
Commit
9ca031b
1 Parent(s): 84c6152

Update auditqa/doc_process.py

Browse files
Files changed (1) hide show
  1. auditqa/doc_process.py +2 -2
auditqa/doc_process.py CHANGED
@@ -27,11 +27,11 @@ def process_pdf():
27
  # text splitter based on the tokenizer of a model of your choosing
28
  # to make texts fit exactly a transformer's context window size
29
  # langchain text splitters: https://python.langchain.com/docs/modules/data_connection/document_transformers/
30
- chunk_size = 256
31
  text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
32
  AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5"),
33
  chunk_size=chunk_size,
34
- chunk_overlap=int(chunk_size / 10),
35
  add_start_index=True,
36
  strip_whitespace=True,
37
  separators=["\n\n", "\n"],
 
27
  # text splitter based on the tokenizer of a model of your choosing
28
  # to make texts fit exactly a transformer's context window size
29
  # langchain text splitters: https://python.langchain.com/docs/modules/data_connection/document_transformers/
30
+ chunk_size = 512
31
  text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
32
  AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5"),
33
  chunk_size=chunk_size,
34
+ chunk_overlap=10,
35
  add_start_index=True,
36
  strip_whitespace=True,
37
  separators=["\n\n", "\n"],