Shreyas094 committed
Commit ddc0536
1 Parent(s): adc46bd

Update app.py

Files changed (1): app.py +88 -5
app.py CHANGED
@@ -14,7 +14,7 @@ from langchain.prompts import PromptTemplate
 from langchain.chains import LLMChain
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_community.vectorstores import FAISS
-from langchain_community.document_loaders import PyPDFLoader
+from langchain_community.document_loaders import PyPDFLoader, PDFMinerLoader
 from langchain_core.output_parsers import StrOutputParser
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_text_splitters import RecursiveCharacterTextSplitter
@@ -25,8 +25,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 from openpyxl import load_workbook
 from openpyxl.utils.dataframe import dataframe_to_rows
-
-
+import camelot
 
 huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
 
@@ -55,6 +54,86 @@ def load_and_split_document_recursive(file: NamedTemporaryFile) -> List[Document]:
     chunks = text_splitter.split_documents(pages)
     return chunks
 
+def load_and_split_document_basic(file: NamedTemporaryFile, parser: str) -> List[Document]:
+    """Loads and splits the document into pages."""
+    if parser == "PyPDF":
+        loader = PyPDFLoader(file.name)
+    elif parser == "PDFMiner":
+        loader = PDFMinerLoader(file.name)
+    elif parser == "Camelot":
+        return load_and_split_document_camelot(file)
+    else:
+        raise ValueError(f"Unknown parser: {parser}")
+
+    return loader.load_and_split()
+
+def load_and_split_document_recursive(file: NamedTemporaryFile, parser: str) -> List[Document]:
+    """Loads and splits the document into chunks using recursive character text splitter."""
+    if parser == "PyPDF":
+        loader = PyPDFLoader(file.name)
+    elif parser == "PDFMiner":
+        loader = PDFMinerLoader(file.name)
+    elif parser == "Camelot":
+        return load_and_split_document_camelot(file)
+    else:
+        raise ValueError(f"Unknown parser: {parser}")
+
+    pages = loader.load()
+
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1000,
+        chunk_overlap=200,
+        length_function=len,
+    )
+
+    chunks = text_splitter.split_documents(pages)
+    return chunks
+
+def load_and_split_document_camelot(file: NamedTemporaryFile) -> List[Document]:
+    """Loads and splits the document using Camelot for tables and charts."""
+    tables = camelot.read_pdf(file.name, pages='all')
+    documents = []
+
+    for i, table in enumerate(tables):
+        df = table.df
+        content = df.to_string(index=False)
+        documents.append(Document(page_content=content, metadata={"source": file.name, "table_number": i+1}))
+
+    return documents
+
+def load_document(file: NamedTemporaryFile, parser: str, use_recursive_splitter: bool) -> List[Document]:
+    """Loads the document using the specified parser and splitting method."""
+    if parser == "Camelot":
+        return load_and_split_document_camelot(file)
+    elif use_recursive_splitter:
+        return load_and_split_document_recursive(file, parser)
+    else:
+        return load_and_split_document_basic(file, parser)
+
+def update_vectors(files, use_recursive_splitter, selected_parser):
+    if not files:
+        return "Please upload at least one PDF file."
+
+    embed = get_embeddings()
+    total_chunks = 0
+
+    all_data = []
+    for file in files:
+        data = load_document(file, selected_parser, use_recursive_splitter)
+        all_data.extend(data)
+        total_chunks += len(data)
+
+    if os.path.exists("faiss_database"):
+        database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
+        database.add_documents(all_data)
+    else:
+        database = FAISS.from_documents(all_data, embed)
+
+    database.save_local("faiss_database")
+
+    splitting_method = "recursive splitting" if use_recursive_splitter else "page-by-page splitting"
+    return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files using {selected_parser} parser with {splitting_method}."
+
 def get_embeddings():
     return HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
 
@@ -682,7 +761,6 @@ def export_memory_db_to_excel():
 
     return excel_path
 
-# Gradio interface
 # Gradio interface
 with gr.Blocks() as demo:
     gr.Markdown("# Chat with your PDF documents and News")
@@ -691,9 +769,14 @@ with gr.Blocks() as demo:
         file_input = gr.Files(label="Upload your PDF documents", file_types=[".pdf"])
         update_button = gr.Button("Update Vector Store")
         use_recursive_splitter = gr.Checkbox(label="Use Recursive Text Splitter", value=False)
+        parser_dropdown = gr.Dropdown(
+            choices=["PyPDF", "PDFMiner", "Camelot"],
+            label="Select Parser",
+            value="PyPDF"
+        )
 
         update_output = gr.Textbox(label="Update Status")
-        update_button.click(update_vectors, inputs=[file_input, use_recursive_splitter], outputs=update_output)
+        update_button.click(update_vectors, inputs=[file_input, use_recursive_splitter, parser_dropdown], outputs=update_output)
 
     with gr.Row():
         with gr.Column(scale=2):
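Note on the Camelot branch added in this commit: camelot.read_pdf returns a TableList whose entries expose a pandas DataFrame as .df, and load_and_split_document_camelot flattens each detected table into its own Document instead of splitting by page. Below is a rough standalone sketch of that branch only; the filename sample.pdf and the langchain_core Document import path are illustrative assumptions, and camelot-py (plus Ghostscript for its default "lattice" flavor) must be installed.

import camelot
from langchain_core.documents import Document  # assumed import path; app.py may import Document differently

# camelot.read_pdf returns a TableList; each entry exposes a pandas DataFrame via .df
tables = camelot.read_pdf("sample.pdf", pages="all")

docs = []
for i, table in enumerate(tables):
    docs.append(
        Document(
            page_content=table.df.to_string(index=False),            # flatten the table to plain text
            metadata={"source": "sample.pdf", "table_number": i + 1},  # mirror the commit's metadata keys
        )
    )

print(f"Extracted {len(docs)} table documents")

Page-level text from PyPDF or PDFMiner, by contrast, still flows through load_and_split_document_basic or the chunked load_and_split_document_recursive, depending on the checkbox.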
 
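The new update_vectors also appends to an existing faiss_database folder when one is present instead of always rebuilding it. A hedged sketch of reading that store back afterwards; the query string is illustrative, and it assumes the same embedding model that get_embeddings() uses.

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Same embedding model as get_embeddings() in app.py.
embed = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# allow_dangerous_deserialization mirrors the flag used in update_vectors;
# only load an index you created yourself.
db = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)

# Illustrative query against whatever PDF pages or tables were indexed.
for doc in db.similarity_search("revenue by quarter", k=3):
    print(doc.metadata, doc.page_content[:80])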
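On the UI side, the dropdown's selected string is forwarded as the third input to update_vectors (selected_parser). A minimal sketch of that three-input wiring in isolation, with a stub callback standing in for update_vectors so it runs without the FAISS and embedding stack; the stub name and its message are illustrative, not part of this commit.

import gradio as gr

def stub_update_vectors(files, use_recursive_splitter, selected_parser):
    # Stand-in for app.py's update_vectors: just report what would be processed.
    n = len(files) if files else 0
    mode = "recursive" if use_recursive_splitter else "page-by-page"
    return f"Would process {n} file(s) with {selected_parser} ({mode} splitting)."

with gr.Blocks() as demo:
    file_input = gr.Files(label="Upload your PDF documents", file_types=[".pdf"])
    use_recursive_splitter = gr.Checkbox(label="Use Recursive Text Splitter", value=False)
    parser_dropdown = gr.Dropdown(choices=["PyPDF", "PDFMiner", "Camelot"], label="Select Parser", value="PyPDF")
    update_button = gr.Button("Update Vector Store")
    update_output = gr.Textbox(label="Update Status")
    # Checkbox -> bool, Dropdown -> its selected string, Files -> list of uploaded files
    # (file objects or paths, depending on the Gradio version).
    update_button.click(
        stub_update_vectors,
        inputs=[file_input, use_recursive_splitter, parser_dropdown],
        outputs=update_output,
    )

if __name__ == "__main__":
    demo.launch()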