dinhquangson committed
Commit 4ae7ab5
1 Parent(s): 142ca34

Update app.py

Files changed (1)
  1. app.py +10 -5
app.py CHANGED
@@ -10,6 +10,7 @@ import streamlit as st
 from dotenv import load_dotenv
 from PyPDF2 import PdfReader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.document_loaders import UnstructuredPDFLoader
 from langchain.embeddings import HuggingFaceBgeEmbeddings
 from langchain.vectorstores import FAISS
 from langchain.chat_models import ChatOpenAI
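Note: UnstructuredPDFLoader lives in langchain.document_loaders, is backed by the unstructured package, and is typically constructed from a file path rather than a file-like object, whereas Streamlit's st.file_uploader (if that is where pdf_docs comes from) yields in-memory uploads. The sketch below shows one way to bridge the two; the temporary-file step and the load_uploaded_pdf helper are illustrative assumptions, not code from this commit.

# Sketch only: feeding a Streamlit upload to UnstructuredPDFLoader.
# load_uploaded_pdf is a hypothetical helper, not part of app.py.
import tempfile

from langchain.document_loaders import UnstructuredPDFLoader


def load_uploaded_pdf(uploaded_file):
    # Persist the upload to disk first, since the loader expects a file path.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(uploaded_file.read())
        tmp_path = tmp.name
    loader = UnstructuredPDFLoader(tmp_path)
    return loader.load_and_split()  # returns a list of langchain Document objects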
@@ -36,13 +37,14 @@ def get_pdf_text(pdf_docs):
     """
     text = ""
     for pdf in pdf_docs:
-        pdf_reader = PdfReader(pdf)
-        for page in pdf_reader.pages:
+        pdf_loader = UnstructuredPDFLoader(pdf)
+        pdf_pages = pdf_loader.load_and_split()
+        for page in pdf_pages:
             text += page.extract_text()
     return text


-def get_texts(pdf_pages):
+def get_text_chunks(text):
     """
     Split the input text into chunks.

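Note: load_and_split() returns langchain Document objects, which expose their text via the page_content attribute and do not implement PyPDF2's extract_text(), so the unchanged context line text += page.extract_text() would be expected to fail at runtime with an AttributeError. A minimal sketch of get_pdf_text with that attribute swapped in, assuming the loader receives a usable file path:

def get_pdf_text(pdf_docs):
    """Extract the raw text from the uploaded PDF documents."""
    text = ""
    for pdf in pdf_docs:
        pdf_loader = UnstructuredPDFLoader(pdf)
        pdf_pages = pdf_loader.load_and_split()
        for page in pdf_pages:
            text += page.page_content  # Document has page_content, not extract_text()
    return text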
 
@@ -60,7 +62,7 @@ def get_texts(pdf_pages):
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=1024, chunk_overlap=64
     )
-    texts = text_splitter.split_text(pdf_pages)
+    texts = text_splitter.split_text(text)
     return texts

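Note: RecursiveCharacterTextSplitter.split_text accepts a single string and returns a list of string chunks, which is why get_text_chunks now receives the concatenated raw text instead of the pdf_docs list that the old get_texts was handed in main(). A short usage sketch with the same parameters:

from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
chunks = splitter.split_text("some long document text ...")  # list of str chunks
print(len(chunks))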
 
@@ -170,8 +172,11 @@ def main():
         )
         if st.button("Process"):
             with st.spinner("Processing"):
+                # get the raw text
+                text = get_pdf_text(pdf_docs)
+
                 # get the text chunks
-                text_chunks = get_texts(pdf_docs)
+                text_chunks = get_text_chunks(text)

                 # create vector store
                 vectorstore = get_vectorstore(text_chunks)
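Note: after this commit the Process handler runs a three-step pipeline: extract the raw text, split it into chunks, then build the vector store. get_vectorstore itself is not part of this diff; given the HuggingFaceBgeEmbeddings and FAISS imports at the top of app.py, a plausible shape for it is sketched below. This is an assumption for illustration (including the embedding model name), not code from this repository.

# Assumed sketch of get_vectorstore, inferred only from the imports in app.py;
# the real implementation is outside this diff.
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import FAISS


def get_vectorstore(text_chunks):
    embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-base-en-v1.5")  # model name is a guess
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)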
 