ArturG9 committed on
Commit
152b9b0
1 Parent(s): 852ae92

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -36
app.py CHANGED
@@ -18,57 +18,56 @@ from langchain_community.document_loaders.directory import DirectoryLoader
18
  from HTML_templates import css, bot_template, user_template
19
 
20
 
 
21
 
22
-
23
- def retriever_from_chroma(docs, search_type, k):
24
  model_name = "sentence-transformers/all-mpnet-base-v2"
25
  model_kwargs = {'device': 'cpu'}
26
  encode_kwargs = {'normalize_embeddings': True}
 
 
27
  embeddings = HuggingFaceEmbeddings(
28
  model_name=model_name,
29
  model_kwargs=model_kwargs,
30
  encode_kwargs=encode_kwargs
31
  )
32
- vectorstore_path = "docs/chroma/"
33
- if not os.path.exists(vectorstore_path):
34
- os.makedirs(vectorstore_path)
35
- vectorstore = Chroma.from_documents(
36
- documents=docs, embedding=embeddings, persist_directory="docs/chroma/")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  retriever = vectorstore.as_retriever(search_type=search_type, search_kwargs={"k": k})
38
  return retriever
39
-
40
-
41
-
42
-
43
- data_path = "data"
44
-
45
-
46
-
47
- documents = []
48
-
49
- for filename in os.listdir(data_path):
50
-
51
- if filename.endswith('.txt'):
52
-
53
- file_path = os.path.join(data_path, filename)
54
-
55
- documents = TextLoader(file_path).load()
56
-
57
- documents.extend(documents)
58
-
59
-
60
-
61
-
62
- docs = split_docs(documents, 250, 20)
63
-
64
- retriever = retriever_from_chroma(docs,'mmr',7)
65
 
66
 
67
 
68
 
69
 
70
 
71
- def main(retriever):
72
 
73
  st.set_page_config(page_title="Chat with multiple PDFs",
74
  page_icon=":books:")
@@ -90,7 +89,8 @@ def main(retriever):
90
  handle_userinput(user_question,vectorstore)
91
 
92
 
93
- def handle_userinput(user_question,retriever):
 
94
  docs = retriever.invoke(question)
95
 
96
  doc_txt = [doc.page_content for doc in docs]
@@ -135,4 +135,4 @@ def create_conversational_rag_chain(retriever):
135
 
136
 
137
  if __name__ == "__main__":
138
- main(vectorstore)
 
18
  from HTML_templates import css, bot_template, user_template
19
 
20
 
21
+ data_path = "data"
22
 
23
def create_retriever_from_chroma(data_path, vectorstore_path="docs/chroma/", search_type='mmr', k=7, chunk_size=250, chunk_overlap=20):
    """Return a retriever backed by a persisted Chroma vectorstore.

    If ``vectorstore_path`` exists and is non-empty, the store found there is
    reopened. Otherwise every ``.txt`` file directly inside ``data_path`` is
    loaded, split into chunks, embedded, and written to a new store at
    ``vectorstore_path``.

    Args:
        data_path: Directory scanned (non-recursively) for ``.txt`` files.
        vectorstore_path: Directory where the Chroma store is persisted.
        search_type: Search strategy forwarded to ``as_retriever``.
        k: Number of documents requested per query (sent as ``search_kwargs``).
        chunk_size: Maximum chunk size handed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks.

    Returns:
        The retriever produced by ``vectorstore.as_retriever(...)``.
    """
    model_name = "sentence-transformers/all-mpnet-base-v2"
    model_kwargs = {'device': 'cpu'}
    encode_kwargs = {'normalize_embeddings': True}

    # The same embedding model is used both to build and to query the store.
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs
    )

    if os.path.exists(vectorstore_path) and os.listdir(vectorstore_path):
        # Reopen the persisted store. The constructor's keyword is
        # `embedding_function`; `embedding` is only accepted by
        # `Chroma.from_documents`.
        vectorstore = Chroma(persist_directory=vectorstore_path, embedding_function=embeddings)
    else:
        # Collect the documents from every .txt file in the data directory.
        documents = []
        for filename in os.listdir(data_path):
            if filename.endswith('.txt'):
                file_path = os.path.join(data_path, filename)
                documents.extend(TextLoader(file_path).load())

        # Split documents into chunks before embedding them.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        split_docs = text_splitter.split_documents(documents)

        # Make sure the persistence directory exists (no-op if it already does).
        os.makedirs(vectorstore_path, exist_ok=True)

        vectorstore = Chroma.from_documents(
            documents=split_docs, embedding=embeddings, persist_directory=vectorstore_path
        )

    retriever = vectorstore.as_retriever(search_type=search_type, search_kwargs={"k": k})
    return retriever
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
 
66
 
67
 
68
 
69
 
70
+ def main():
71
 
72
  st.set_page_config(page_title="Chat with multiple PDFs",
73
  page_icon=":books:")
 
89
  handle_userinput(user_question,vectorstore)
90
 
91
 
92
+ def handle_userinput(user_question):
93
+ retriever = create_retriever_from_chroma(data_path, vectorstore_path="docs/chroma/", search_type='mmr', k=7, chunk_size=250, chunk_overlap=20)
94
  docs = retriever.invoke(question)
95
 
96
  doc_txt = [doc.page_content for doc in docs]
 
135
 
136
 
137
  if __name__ == "__main__":
138
+ main()