ArturG9 committed on
Commit
096cbf6
1 Parent(s): 8ac6fa5

Update functions.py

Browse files
Files changed (1) hide show
  1. functions.py +25 -33
functions.py CHANGED
@@ -69,25 +69,22 @@ async def handle_userinput(user_question, custom_graph):
69
 
70
 
71
 
72
- def create_retriever_from_chroma(vectorstore_path="./docs/chroma/", search_type='mmr', k=7, chunk_size=300, chunk_overlap=30,lambda_mult= 0.7):
73
-
74
  model_name = "Alibaba-NLP/gte-large-en-v1.5"
75
- model_kwargs = {'device': 'cpu',
76
- "trust_remote_code" : 'False'}
77
  encode_kwargs = {'normalize_embeddings': True}
 
78
  embeddings = HuggingFaceEmbeddings(
79
  model_name=model_name,
80
  model_kwargs=model_kwargs,
81
  encode_kwargs=encode_kwargs
82
  )
83
 
84
-
85
-
86
  if os.path.exists(vectorstore_path) and os.listdir(vectorstore_path):
87
- vectorstore = Chroma(persist_directory=vectorstore_path,embedding_function=embeddings)
88
-
89
  else:
90
- st.write("Vector store doesnt exist and will be created now")
 
91
  urls = [
92
 
93
  "https://github.com/zedr/clean-code-python",
@@ -190,38 +187,33 @@ def create_retriever_from_chroma(vectorstore_path="./docs/chroma/", search_type=
190
  "https://datasciencedojo.com/blog/ensemble-methods-in-machine-learning/",
191
  "https://datasciencedojo.com/blog/langgraph-tutorial/",
192
  "https://datasciencedojo.com/blog/data-driven-marketing-in-2024/",
193
- "https://datasciencedojo.com/blog/on-device-ai/"
194
- ]
195
 
196
- def extract_sentences_from_web(links, chunk_size=500, chunk_overlap=30):
197
- data = []
198
- for link in links:
199
- loader = NewsURLLoader(urls=[link])
200
- data += loader.load()
201
- return data
 
 
202
 
 
203
 
204
- docs = extract_sentences_from_web(links=urls)
205
-
206
-
207
- text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
208
- chunk_size=chunk_size, chunk_overlap=chunk_overlap,
209
- separators=["\n\n \n\n","\n\n\n", "\n\n", r"In \[[0-9]+\]", r"\n+", r"\s+"],
210
- is_separator_regex = True
211
- )
212
- split_docs = text_splitter.split_documents(docs)
213
 
214
-
215
- vectorstore = Chroma.from_documents(
216
  documents=split_docs, embedding=embeddings, persist_directory=vectorstore_path
217
  )
218
-
219
-
220
- retriever=vectorstore.as_retriever(search_type = search_type, search_kwargs={"k": k})
221
 
 
222
 
223
-
224
-
225
  return retriever
226
 
227
 
 
69
 
70
 
71
 
72
+ def create_retriever_from_chroma(vectorstore_path="./docs/chroma/", search_type='mmr', k=7, chunk_size=300, chunk_overlap=30, lambda_mult=0.7):
 
73
  model_name = "Alibaba-NLP/gte-large-en-v1.5"
74
+ model_kwargs = {'device': 'cpu', "trust_remote_code": 'False'}
 
75
  encode_kwargs = {'normalize_embeddings': True}
76
+
77
  embeddings = HuggingFaceEmbeddings(
78
  model_name=model_name,
79
  model_kwargs=model_kwargs,
80
  encode_kwargs=encode_kwargs
81
  )
82
 
 
 
83
  if os.path.exists(vectorstore_path) and os.listdir(vectorstore_path):
84
+ vectorstore = Chroma(persist_directory=vectorstore_path, embedding_function=embeddings)
 
85
  else:
86
+ st.write("Vector store doesn't exist and will be created now")
87
+
88
  urls = [
89
 
90
  "https://github.com/zedr/clean-code-python",
 
187
  "https://datasciencedojo.com/blog/ensemble-methods-in-machine-learning/",
188
  "https://datasciencedojo.com/blog/langgraph-tutorial/",
189
  "https://datasciencedojo.com/blog/data-driven-marketing-in-2024/",
190
+ "https://datasciencedojo.com/blog/on-device-ai/",
191
+
192
 
193
+ ]
194
+
195
+ def extract_sentences_from_web(links, chunk_size=500, chunk_overlap=30):
196
+ data = []
197
+ for link in links:
198
+ loader = NewsURLLoader(urls=[link])
199
+ data += loader.load()
200
+ return data
201
 
202
+ docs = extract_sentences_from_web(links=urls)
203
 
204
+ text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
205
+ chunk_size=chunk_size, chunk_overlap=chunk_overlap,
206
+ separators=["\n\n \n\n", "\n\n\n", "\n\n", r"In \[[0-9]+\]", r"\n+", r"\s+"],
207
+ is_separator_regex=True
208
+ )
209
+ split_docs = text_splitter.split_documents(docs)
 
 
 
210
 
211
+ vectorstore = Chroma.from_documents(
 
212
  documents=split_docs, embedding=embeddings, persist_directory=vectorstore_path
213
  )
 
 
 
214
 
215
+ retriever = vectorstore.as_retriever(search_type=search_type, search_kwargs={"k": k})
216
 
 
 
217
  return retriever
218
 
219