dinhquangson committed on
Commit
142ca34
1 Parent(s): 16a13e4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -10
app.py CHANGED
@@ -9,7 +9,7 @@ import os
9
  import streamlit as st
10
  from dotenv import load_dotenv
11
  from PyPDF2 import PdfReader
12
- from langchain.text_splitter import CharacterTextSplitter
13
  from langchain.embeddings import HuggingFaceBgeEmbeddings
14
  from langchain.vectorstores import FAISS
15
  from langchain.chat_models import ChatOpenAI
@@ -42,7 +42,7 @@ def get_pdf_text(pdf_docs):
42
  return text
43
 
44
 
45
- def get_text_chunks(text):
46
  """
47
  Split the input text into chunks.
48
 
@@ -57,11 +57,11 @@ def get_text_chunks(text):
57
  List of text chunks.
58
 
59
  """
60
- text_splitter = CharacterTextSplitter(
61
- separator="\n", chunk_size=1500, chunk_overlap=300, length_function=len
62
  )
63
- chunks = text_splitter.split_text(text)
64
- return chunks
65
 
66
 
67
  def get_vectorstore(text_chunks):
@@ -170,11 +170,8 @@ def main():
170
  )
171
  if st.button("Process"):
172
  with st.spinner("Processing"):
173
- # get pdf text
174
- raw_text = get_pdf_text(pdf_docs)
175
-
176
  # get the text chunks
177
- text_chunks = get_text_chunks(raw_text)
178
 
179
  # create vector store
180
  vectorstore = get_vectorstore(text_chunks)
 
9
  import streamlit as st
10
  from dotenv import load_dotenv
11
  from PyPDF2 import PdfReader
12
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
13
  from langchain.embeddings import HuggingFaceBgeEmbeddings
14
  from langchain.vectorstores import FAISS
15
  from langchain.chat_models import ChatOpenAI
 
42
  return text
43
 
44
 
45
+ def get_texts(pdf_pages):
46
  """
47
  Split the input text into chunks.
48
 
 
57
  List of text chunks.
58
 
59
  """
60
+ text_splitter = RecursiveCharacterTextSplitter(
61
+ chunk_size=1024, chunk_overlap=64
62
  )
63
+ texts = text_splitter.split_text(pdf_pages)
64
+ return texts
65
 
66
 
67
  def get_vectorstore(text_chunks):
 
170
  )
171
  if st.button("Process"):
172
  with st.spinner("Processing"):
 
 
 
173
  # get the text chunks
174
+ text_chunks = get_texts(pdf_docs)
175
 
176
  # create vector store
177
  vectorstore = get_vectorstore(text_chunks)