Rajat.bans committed on
Commit
ead6614
1 Parent(s): 09c3be4

Upgraded ads rag

Browse files
rag.py CHANGED
@@ -7,19 +7,110 @@ import random
7
  import pandas as pd
8
  import os
9
  import json
10
-
11
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 
 
12
 
13
  load_dotenv(override=True)
14
- client = OpenAI()
15
- DB_FAISS_PATH = "./vectorstore/db_faiss_ads_20May_20Jun_webmd_healthline_Health_dupRemoved0.8"
16
  data_file_path = "./data/142_adclick_20May_20Jun_webmd_healthline_Health_dupRemoved0.8_someAdsCampaign.tsv"
17
  embedding_model_hf = "BAAI/bge-m3"
18
- qa_model_name = "gpt-3.5-turbo"
19
- default_threshold = 0.75
20
- relation_check_best_value_thresh = 0.6
21
- number_of_ads_to_fetch_from_db = 15
22
- bestRelationSystemPrompt = """You are an advertising concierge for text ads on websites. Given an INPUT(PAGE_TITLE) and the available ad inventory (ADS_DATA), your task is to determine whether there are some relevant ADS to INPUT are present in ADS_DATA. ADS WHICH DON'T MATCH USER'S INTENT SHOULD BE CONSIDERED IRRELEVANT
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  ---------------------------------------
25
 
@@ -43,12 +134,12 @@ Expected json output :
43
 
44
  The ADS_DATA provided to you is as follows:
45
 
46
- """
47
 
48
- bestQuestionSystemPrompt = """1. You are an advertising concierge for text ads on websites. Given an INPUT(PAGE_TITLE) and the available ad inventory (ADS_DATA), your task is to form a relevant QUESTION to ask the user visiting the webpage. This question should help identify the user's intent behind visiting the webpage.
49
- 2. From the ADS_DATA, discard all ads that are not related to the INPUT or do not match the user's intent behind visiting the page. Also, remove any ads that are distantly related to the user's intent.
50
- 3. FROM REMAINING ADS ONLY, group together those that are similar in type. For each grouped ads form an OPTION which should be both the answer for the QUESTION and related to ads in this group.
51
- 4. Try to generate QUESTION within 70 characters and keep either 2, 3 or 4 number of OPTIONS.
52
  5. Provide your REASONING behind choosing the QUESTION and the OPTIONS. Now provide the QUESTION and the OPTIONS. Along with each OPTION, provide the ads from ADS_DATA that you associated with it.
53
 
54
  ---------------------------------------
@@ -57,19 +148,13 @@ bestQuestionSystemPrompt = """1. You are an advertising concierge for text ads o
57
  The Effects of Aging on Skin
58
 
59
  <Sample ADS_DATA>
60
- Ad 1: Forget Retinol, Use This Household Item To Fill In Wrinkles - Celebrities Are Ditching Pricey Facelifts For This.
61
- Ad 2: Stop Covering Your Wrinkles with Make Up - Do This Instead.
62
- Ad 3: Living With Migraines? - Discover A Treatment Option. Learn about a type of prescription migraine treatment called CGRP receptor antagonists. Discover a range of resources that may help people dealing with migraines.
63
- Ad 4: What is Advanced Skin Cancer? - Find Disease Information Here.Find Facts About Advanced Skin Cancer and a Potential Treatment Option.
64
- Ad 5: Learn About Advanced Melanoma - Find Disease Information Here.Find Facts About Advanced Melanoma and a Potential Treatment Option.
65
- Ad 6: Treatment For CKD - Reduce Risk Of Progressing CKD. Ask About A Treatment That Can Help Reduce Your Risk Of Kidney Failure.
66
- Ad 7: Are You Living With Vitiligo? - For Patients & Caregivers.Discover An FDA-Approved Topical Cream That May Help With Nonsegmental Vitiligo Repigmentation. Learn About A Copay Savings Card For Eligible Patients With Vitiligo.
67
 
68
  <Expected json output>
69
  {
70
- "reasoning" : "Among the seven ads in **Sample ADS_DATA**, Ads 3 and 6 are irrelevant to the INPUT, so they should be discarded. Ad 1, 2 closely aligns with the user's intent and are mutually exclusive, so they can be presented as two separate options. Ads 4, 5, and 7 can be grouped into a single option, since they are similar and also relevant to INPUT. The question will be formed in a way to connect the PAGE TITLE content with the goals of these five relevant ads, making sure they appeal to both specific and general user interests.",
71
  "question": "Which of the following methods to combat aging skin are you most interested in?",
72
- "options": {"1. Retinol Alternatives for Wrinkle Treatment." : ["Ad 2: Forget Retinol, Use This Household Item To Fill In Wrinkles - Celebrities Are Ditching Pricey Facelifts For This."], "2. Reduce Wrinkles without Makeup.": ["Ad 1: Stop Covering Your Wrinkles with Make Up - Do This Instead."], "3. Information on Skin Diseases": ["Ad 7: Are You Living With Vitiligo? - For Patients & Caregivers.Discover An FDA-Approved Topical Cream That May Help With Nonsegmental Vitiligo Repigmentation. Learn About A Copay Savings Card For Eligible Patients With Vitiligo.", "Ad 4: What is Advanced Skin Cancer? - Find Disease Information Here.Find Facts About Advanced Skin Cancer and a Potential Treatment Option.", "Ad 5: Learn About Advanced Melanoma - Find Disease Information Here.Find Facts About Advanced Melanoma and a Potential Treatment Option."]}
73
  }
74
  -----------------------------------------------
75
 
@@ -88,189 +173,186 @@ Got A Rosemary Bush? Here’re 20 Brilliant & Unusual Ways To Use All That Rosem
88
  -----------------------------------------------
89
 
90
  The ADS_DATA provided to you is as follows:
91
- """
92
-
93
- old_system_prompt_additional_example = """
94
- -----------------------------------------------
95
- <Sample INPUT(PAGE_TITLE)>
96
- 7 Signs and Symptoms of Magnesium Deficiency
97
-
98
- <Sample ADS_DATA>
99
- Ad 1: 4 Warning Signs Of Dementia - Fight Dementia and Memory Loss. 100% Natural Program To Prevent Cognitive Decline. Developed By Dr. Will Mitchell. Read The Reviews-Get a Special Offer. Doctor Recommended. High Quality Standards. 60-Day Refund.
100
- Ad 2: About Hyperkalemia - Learn About The Symptoms. High Potassium Can Be A Serious Condition. Learn More About Hyperkalemia Today.
101
- Ad 3: Weak or Paralyzed Muscles? - A Common Symptom of Cataplexy. About 70% of People With Narcolepsy Are Believed to Have Cataplexy Symptoms. Learn More. Download the Doctor Discussion Guide to Have a Informed Conversation About Your Health.
102
-
103
- <Expected json output>
104
- {
105
- "reasoning" : "Given the input '7 Signs and Symptoms of Magnesium Deficiency,' it is evident that the user is looking for information specifically about magnesium deficiency. Ads 1, 2, and 3 discuss topics such as dementia, hyperkalemia, weak muscles, which are not related to magnesium deficiency in any way. Therefore, all the ads in the ADS_DATA are not suitable for the user's query and will be discarded.",
106
- "question": "No related ads available to form question and options.",
107
- "options": []
108
- }
109
- ------------------------------------------------
110
- """
111
-
112
- embeddings_hf = HuggingFaceEmbeddings(model_name=embedding_model_hf)
113
-
114
-
115
- def getBestQuestionOnTheBasisOfPageInformationAndAdsData(
116
- page_information,
117
- adsData,
118
- relationSystemPrompt,
119
- questionSystemPrompt,
120
- bestRetreivedAdValue,
121
- ):
122
- if adsData == "":
123
- return ({"reasoning": "No ads data present", "classification": 0}, 0), (
124
- {"reasoning": "", "question": "", "options": []},
125
- 0,
126
- )
127
-
128
- relation_answer = {"reasoning": "", "classification": 1}
129
- question_answer = {"reasoning": "", "question": "", "options": []}
130
- tokens_used_relation = 0
131
- tokens_used_question = 0
132
- while True:
133
- try:
134
- if bestRetreivedAdValue > relation_check_best_value_thresh:
135
- system_message = {
136
- "role": "system",
137
- "content": relationSystemPrompt + adsData,
138
- }
139
- response = client.chat.completions.create(
140
- model=qa_model_name,
141
- messages=[system_message]
142
- + [
143
- {
144
- "role": "user",
145
- "content": page_information + "\nThe JSON response: ",
146
- }
147
- ],
148
- temperature=0,
149
- seed=42,
150
- max_tokens=1000,
151
- response_format={"type": "json_object"},
152
- )
153
- tokens_used_relation = response.usage.total_tokens
154
- relation_answer = json.loads(response.choices[0].message.content)
155
- tokens_used_question = 0
156
- else:
157
- relation_answer["reasoning"] = "First retreived document value less than threshold so no need to check relation"
158
-
159
- if relation_answer["classification"] != 0:
160
- system_message = {
161
- "role": "system",
162
- "content": questionSystemPrompt + adsData,
163
- }
164
- response = client.chat.completions.create(
165
- model=qa_model_name,
166
- messages=[system_message]
167
- + [
168
- {
169
- "role": "user",
170
- "content": page_information + "\nThe JSON response: ",
171
- }
172
- ],
173
  temperature=0,
174
  seed=42,
175
  max_tokens=1000,
176
  response_format={"type": "json_object"},
177
  )
178
- tokens_used_question = response.usage.total_tokens
179
- question_answer = json.loads(response.choices[0].message.content)
180
- break
181
- except Exception as e:
182
- print("Error-: ", e.message)
183
- print("Trying Again")
184
- return (relation_answer, tokens_used_relation), (
185
- question_answer,
186
- tokens_used_question,
187
- )
188
-
189
-
190
- def changeResponseToPrintableString(response, task):
191
- if task == "relation":
192
- return f"Reasoning: {response['reasoning']}\n\nClassification: {response['classification']}\n"
193
- res = f"Reasoning: {response['reasoning']}\n\nQuestion: {response['question']}\n\nOptions: \n"
194
- for option in response["options"]:
195
- res += f"{option}\n"
196
- for ad in response["options"][option]:
197
- res += f"{ad}\n"
198
- res += "\n"
199
- return res
200
-
201
 
202
- def getRagResponse(RelationPrompt, QuestionPrompt, threshold, page_information):
203
- curr_relation_prompt = bestRelationSystemPrompt
204
- if RelationPrompt != None or len(RelationPrompt):
205
- curr_relation_prompt = RelationPrompt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
- curr_question_prompt = bestQuestionSystemPrompt
208
- if QuestionPrompt != None or len(QuestionPrompt):
209
- curr_question_prompt = QuestionPrompt
 
 
210
 
211
- retreived_documents = [
212
- doc
213
- for doc in db.similarity_search_with_score(
214
- page_information, k=number_of_ads_to_fetch_from_db
215
- )
216
- if doc[1] < threshold
217
- ]
218
- best_value = 1
219
- if len(retreived_documents):
220
- best_value = retreived_documents[0][1]
221
- relation_answer, question_answer = (
222
- getBestQuestionOnTheBasisOfPageInformationAndAdsData(
223
- page_information,
224
- ".\n".join(
225
- [
226
- "Ad " + str(i + 1) + ". " + doc[0].page_content
227
- for i, doc in enumerate(retreived_documents)
228
  ]
229
- ),
230
- curr_relation_prompt,
231
- curr_question_prompt,
232
- best_value,
 
 
 
 
 
233
  )
234
- )
235
- print("QUERY:", page_information, relation_answer, question_answer)
236
- docs_info = "\n\n".join(
237
- [
238
- # f"Publisher url: {doc[0].metadata['publisher_url']}\nKeyword Term: {doc[0].metadata['keyword_term']}\nAd Display Url: {doc[0].metadata['ad_display_url']}\nRevenue: {doc[0].metadata['revenue']}\nAd Click Count: {doc[0].metadata['ad_click_count']}\nContent: {doc[0].page_content}\nValue: {doc[1]}"
239
- f"{i+1}. Content: {doc[0].page_content}\nRevenue: {doc[0].metadata['revenue']}\nAd Click Count: {doc[0].metadata['ad_click_count']}\nValue: {doc[1]}"
240
- for i, doc in enumerate(retreived_documents)
241
- ]
242
- )
243
- try:
244
- relation_answer_string = changeResponseToPrintableString(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
  relation_answer[0], "relation"
246
  )
247
- question_answer_string = changeResponseToPrintableString(
248
  question_answer[0], "question"
249
  )
250
- full_response = f"**ANSWER**: \n Relation answer:\n {relation_answer_string}\n Question answer:\n {question_answer_string}\n\n**RETREIVED DOCUMENTS**:\n{docs_info}\n\n**TOKENS USED**:\nQuestion api call: {question_answer[1]}\nRelation api call: {relation_answer[1]}"
251
- except:
252
- full_response = f"Invalid response received"
253
- return full_response
254
 
255
 
256
- db = FAISS.load_local(
257
- DB_FAISS_PATH, embeddings_hf, allow_dangerous_deserialization=True
258
- )
259
  data = pd.read_csv(data_file_path, sep="\t")
260
  # data.dropna(axis=0, how="any", inplace=True)
261
  data.drop_duplicates(subset=["ad_title", "ad_desc"], inplace=True)
262
  ad_title_content = list(data["ad_title"].values)
 
263
  with gr.Blocks() as demo:
264
  gr.Markdown("# RAG on ads data")
265
  with gr.Row():
266
  RelationPrompt = gr.Textbox(
267
- bestRelationSystemPrompt,
268
  lines=1,
269
  placeholder="Enter the relation system prompt for relation check",
270
  label="Relation System prompt",
271
  )
272
  QuestionPrompt = gr.Textbox(
273
- bestQuestionSystemPrompt,
274
  lines=1,
275
  placeholder="Enter the question system prompt for question formulation",
276
  label="Question System prompt",
@@ -279,18 +361,18 @@ with gr.Blocks() as demo:
279
  lines=1, placeholder="Enter the page information", label="Page Information"
280
  )
281
  threshold = gr.Number(
282
- value=default_threshold, label="Threshold", interactive=True
283
  )
284
  output = gr.Textbox(label="Output")
285
  submit_btn = gr.Button("Submit")
286
 
287
  submit_btn.click(
288
- getRagResponse,
289
  inputs=[RelationPrompt, QuestionPrompt, threshold, page_information],
290
  outputs=[output],
291
  )
292
  page_information.submit(
293
- getRagResponse,
294
  inputs=[RelationPrompt, QuestionPrompt, threshold, page_information],
295
  outputs=[output],
296
  )
 
7
  import pandas as pd
8
  import os
9
  import json
10
+ from sklearn.cluster import KMeans, SpectralClustering
11
+ from scipy.spatial.distance import euclidean
12
+ import re
13
+ import numpy as np
14
+ from itertools import count
15
 
16
  load_dotenv(override=True)
17
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
18
  data_file_path = "./data/142_adclick_20May_20Jun_webmd_healthline_Health_dupRemoved0.8_someAdsCampaign.tsv"
19
  embedding_model_hf = "BAAI/bge-m3"
20
+ embeddings_hf = HuggingFaceEmbeddings(model_name=embedding_model_hf)
21
+
22
+
23
class CLUSTERING:
    """Group embeddings into clusters and pick a few member indices per cluster."""

    # Accepted values for ``clustering_algo``:
    #   'kmeans-cc'           - k-means; members ordered by distance to their cluster centre
    #   'kmeans-sp'           - k-means; members kept in input order
    #   'spectral_clustering' - spectral clustering; members kept in input order
    #   'spectral'            - alias of 'spectral_clustering' (the name the original branch tested)
    SUPPORTED_ALGOS = ("kmeans-cc", "kmeans-sp", "spectral_clustering", "spectral")

    def __init__(self):
        self.clustering_algo = "kmeans-cc"

    def cluster_embeddings(self, embeddings, no_of_clusters, no_of_points):
        """Cluster ``embeddings`` and return up to ``no_of_points`` row indices per cluster.

        Args:
            embeddings: sequence/array of embedding vectors, one row per item.
            no_of_clusters: requested number of clusters; clamped to the number
                of rows so the clustering estimators are never asked for more
                clusters than there are samples.
            no_of_points: maximum number of indices returned for each cluster.

        Returns:
            A list with one list of integer row indices per cluster. Empty
            input yields ``[]``.

        Raises:
            ValueError: if ``self.clustering_algo`` is not a supported name.
        """
        algo = self.clustering_algo
        if algo not in self.SUPPORTED_ALGOS:
            raise ValueError(
                f"Unsupported clustering algorithm {algo!r}; "
                f"expected one of {self.SUPPORTED_ALGOS}"
            )

        # Never request more clusters than samples; nothing to do for empty input.
        no_of_clusters = min(no_of_clusters, len(embeddings))
        if no_of_clusters <= 0:
            return []

        if algo in ("kmeans-cc", "kmeans-sp"):
            kmeans = KMeans(n_clusters=no_of_clusters, random_state=42)
            kmeans.fit(embeddings)
            labels = kmeans.labels_
        else:
            spectral_clustering = SpectralClustering(
                n_clusters=no_of_clusters,
                affinity="nearest_neighbors",
                random_state=42,
            )
            labels = spectral_clustering.fit_predict(embeddings)

        clusters_indices = [[] for _ in range(no_of_clusters)]

        if algo == "kmeans-cc":
            # Rank members by Euclidean distance to their own cluster centre.
            cluster_centers = kmeans.cluster_centers_
            for i, embedding in enumerate(embeddings):
                cluster_idx = labels[i]
                dist = euclidean(embedding, cluster_centers[cluster_idx])
                clusters_indices[cluster_idx].append((i, dist))
            for members in clusters_indices:
                members.sort(key=lambda member: member[1])
            return [
                [index for index, _dist in members[:no_of_points]]
                for members in clusters_indices
            ]

        # Input-order variants: members are plain indices, so slice them directly
        # (the original subscripted each int with [0], which raises TypeError).
        for i, label in enumerate(labels):
            clusters_indices[label].append(i)
        return [members[:no_of_points] for members in clusters_indices]
64
+
65
class VECTOR_DB:
    """FAISS-backed ad store that returns retrieved ads grouped into clusters."""

    # Matches any HTML-style tag so it can be stripped from ad text; compiled once.
    _HTML_TAG_RE = re.compile("<.*?>")

    def __init__(self):
        self.DB_FAISS_PATH = "./vectorstore/db_faiss_ads_20May_20Jun_webmd_healthline_Health_dupRemoved0.8"
        self.default_threshold = 0.75
        self.number_of_ads_to_fetch_from_db = 50
        self.no_of_clusters = 3
        self.no_of_ads_in_each_cluster = 6
        # NOTE(review): allow_dangerous_deserialization=True loads a pickled index;
        # only point DB_FAISS_PATH at a trusted location.
        self.db = FAISS.load_local(
            self.DB_FAISS_PATH, embeddings_hf, allow_dangerous_deserialization=True
        )

    def queryVectorDB(self, page_information, threshold):
        """Retrieve ads for ``page_information`` and group them into clusters.

        Args:
            page_information: query text searched against the vector store.
            threshold: only results whose score is strictly below this value
                are kept.

        Returns:
            ``(documents_clusters, best_value)`` where ``documents_clusters``
            is a list of clusters, each a list of ``(document, score)`` pairs,
            and ``best_value`` is the score of the first kept result, or ``1``
            when nothing was kept (in which case the cluster list is empty).
        """
        retreived_documents = [
            doc
            for doc in self.db.similarity_search_with_score(
                page_information, k=self.number_of_ads_to_fetch_from_db
            )
            if doc[1] < threshold
        ]
        # Nothing passed the threshold: skip embedding/clustering, which cannot
        # operate on an empty set.
        if not retreived_documents:
            return [], 1

        for document, _score in retreived_documents:
            document.page_content = self._HTML_TAG_RE.sub("", document.page_content)

        embeddings = np.array(
            embeddings_hf.embed_documents(
                [doc[0].page_content for doc in retreived_documents]
            )
        )

        # Clustering needs at least as many samples as clusters.
        no_of_clusters = min(self.no_of_clusters, len(retreived_documents))
        clustered_indices = CLUSTERING().cluster_embeddings(
            embeddings, no_of_clusters, self.no_of_ads_in_each_cluster
        )
        documents_clusters = [
            [retreived_documents[ind] for ind in cluster_indices]
            for cluster_indices in clustered_indices
        ]

        best_value = retreived_documents[0][1]
        return documents_clusters, best_value
106
+
107
+ class ADS_RAG:
108
+ def __init__(self):
109
+ self.client = OpenAI()
110
+ self.db = VECTOR_DB()
111
+ self.qa_model_name = "gpt-3.5-turbo"
112
+ self.relation_check_best_value_thresh = 0.6
113
+ self.bestRelationSystemPrompt = """You are an advertising concierge for text ads on websites. Given an INPUT(PAGE_TITLE) and the available ad inventory (ADS_DATA), your task is to determine whether there are some relevant ADS to INPUT are present in ADS_DATA. ADS WHICH DON'T MATCH USER'S INTENT SHOULD BE CONSIDERED IRRELEVANT
114
 
115
  ---------------------------------------
116
 
 
134
 
135
  The ADS_DATA provided to you is as follows:
136
 
137
+ """
138
 
139
+ self.bestQuestionSystemPrompt = """1. You are an advertising concierge for text ads on websites. Given an INPUT(PAGE_TITLE) and the available ad inventory (ADS_DATA), your task is to form a relevant QUESTION to ask the user visiting the webpage. This question should help identify the user's intent behind visiting the webpage.
140
+ 2. From the ADS_DATA clusters, discard all ads that are not related to the INPUT or do not match the user's intent behind visiting the page. Also, remove any ads that are distantly related to the user's intent.
141
+ 3. FROM REMAINING ADS in each ads cluster form an OPTION which should be both the answer for the QUESTION and related to ads in this cluster.
142
+ 4. Try to generate intelligent creatives for advertising and keep QUESTION within 70 characters and each OPTION with either 4, 5, or 6 words.
143
  5. Provide your REASONING behind choosing the QUESTION and the OPTIONS. Now provide the QUESTION and the OPTIONS. Along with each OPTION, provide the ads from ADS_DATA that you associated with it.
144
 
145
  ---------------------------------------
 
148
  The Effects of Aging on Skin
149
 
150
  <Sample ADS_DATA>
151
+ [{"Ad 1": "Forget Retinol, Use This Household Item To Fill In Wrinkles - Celebrities Are Ditching Pricey Facelifts For This.", "Ad 2": "Stop Covering Your Wrinkles with Make Up - Do This Instead."}, {"Ad 3": "Living With Migraines? - Discover A Treatment Option. Learn about a type of prescription migraine treatment called CGRP receptor antagonists. Discover a range of resources that may help people dealing with migraines."}, {"Ad 4": "What is Advanced Skin Cancer? - Find Disease Information Here.Find Facts About Advanced Skin Cancer and a Potential Treatment Option.", "Ad 5": "Learn About Advanced Melanoma - Find Disease Information Here.Find Facts About Advanced Melanoma and a Potential Treatment Option.", "Ad 6": "Treatment For CKD - Reduce Risk Of Progressing CKD. Ask About A Treatment That Can Help Reduce Your Risk Of Kidney Failure.", "Ad 7": "Are You Living With Vitiligo? - For Patients & Caregivers.Discover An FDA-Approved Topical Cream That May Help With Nonsegmental Vitiligo Repigmentation. Learn About A Copay Savings Card For Eligible Patients With Vitiligo."}]
 
 
 
 
 
 
152
 
153
  <Expected json output>
154
  {
155
+ "reasoning" : "Among the seven ads in **Sample ADS_DATA**, Ads 3 and 6 are irrelevant to the INPUT, so they should be discarded. Ad 1, 2, 4, 5, and 7 are relevant to INPUT. The question will be formed in a way to connect the PAGE TITLE content with the goals of these five relevant ads, making sure they appeal to both specific and general user interests.",
156
  "question": "Which of the following methods to combat aging skin are you most interested in?",
157
+ "options": {"1. Reduce Wrinkles without Makeup.": ["Ad 1: Stop Covering Your Wrinkles with Make Up - Do This Instead."], "2. Retinol Alternatives for Wrinkle Treatment." : ["Ad 2: Forget Retinol, Use This Household Item To Fill In Wrinkles - Celebrities Are Ditching Pricey Facelifts For This."], "3. Information on Skin Diseases": ["Ad 4: What is Advanced Skin Cancer? - Find Disease Information Here.Find Facts About Advanced Skin Cancer and a Potential Treatment Option.", "Ad 5: Learn About Advanced Melanoma - Find Disease Information Here.Find Facts About Advanced Melanoma and a Potential Treatment Option.", "Ad 7: Are You Living With Vitiligo? - For Patients & Caregivers.Discover An FDA-Approved Topical Cream That May Help With Nonsegmental Vitiligo Repigmentation. Learn About A Copay Savings Card For Eligible Patients With Vitiligo."]}
158
  }
159
  -----------------------------------------------
160
 
 
173
  -----------------------------------------------
174
 
175
  The ADS_DATA provided to you is as follows:
176
+ """
177
+
178
+ old_system_prompt_additional_example = """
179
+ -----------------------------------------------
180
+ <Sample INPUT(PAGE_TITLE)>
181
+ 7 Signs and Symptoms of Magnesium Deficiency
182
+
183
+ <Sample ADS_DATA>
184
+ Ad 1: 4 Warning Signs Of Dementia - Fight Dementia and Memory Loss. 100% Natural Program To Prevent Cognitive Decline. Developed By Dr. Will Mitchell. Read The Reviews-Get a Special Offer. Doctor Recommended. High Quality Standards. 60-Day Refund.
185
+ Ad 2: About Hyperkalemia - Learn About The Symptoms. High Potassium Can Be A Serious Condition. Learn More About Hyperkalemia Today.
186
+ Ad 3: Weak or Paralyzed Muscles? - A Common Symptom of Cataplexy. About 70% of People With Narcolepsy Are Believed to Have Cataplexy Symptoms. Learn More. Download the Doctor Discussion Guide to Have a Informed Conversation About Your Health.
187
+
188
+ <Expected json output>
189
+ {
190
+ "reasoning" : "Given the input '7 Signs and Symptoms of Magnesium Deficiency,' it is evident that the user is looking for information specifically about magnesium deficiency. Ads 1, 2, and 3 discuss topics such as dementia, hyperkalemia, weak muscles, which are not related to magnesium deficiency in any way. Therefore, all the ads in the ADS_DATA are not suitable for the user's query and will be discarded.",
191
+ "question": "No related ads available to form question and options.",
192
+ "options": []
193
+ }
194
+ ------------------------------------------------
195
+ """
196
+
197
def callOpenAiApi(self, messages, max_retries=None):
    """Send ``messages`` to the chat-completions API and parse the JSON reply.

    Args:
        messages: list of chat message dicts passed through to the API.
        max_retries: maximum number of retries after a failed attempt.
            ``None`` (the default) retries indefinitely, as before.

    Returns:
        ``(answer, tokens_used)`` where ``answer`` is the decoded JSON content
        of the first choice and ``tokens_used`` is ``response.usage.total_tokens``.

    Raises:
        Exception: the last error, once ``max_retries`` retries are exhausted.
    """
    failures = 0
    while True:
        try:
            response = self.client.chat.completions.create(
                model=self.qa_model_name,
                messages=messages,
                temperature=0,
                seed=42,
                max_tokens=1000,
                response_format={"type": "json_object"},
            )
            tokens_used = response.usage.total_tokens
            answer = json.loads(response.choices[0].message.content)
            return answer, tokens_used
        except Exception as e:
            failures += 1
            # Python 3 exceptions have no ``.message`` attribute; printing the
            # exception itself avoids an AttributeError that would abort the retry.
            print("Error-: ", e)
            if max_retries is not None and failures > max_retries:
                raise
            print("Trying Again")
214
+
215
def getBestQuestionOnTheBasisOfPageInformationAndAdsData(
    self,
    page_information,
    adsData,
    relationSystemPrompt,
    questionSystemPrompt,
    bestRetreivedAdValue,
):
    """Run the relation check and, if it passes, the question-generation call.

    Returns a pair ``((relation_answer, relation_tokens), (question_answer,
    question_tokens))``. With empty ``adsData`` no API call is made. The
    relation call is only made when ``bestRetreivedAdValue`` exceeds
    ``self.relation_check_best_value_thresh``; the question call is made
    whenever the relation classification is not ``0``.
    """
    if adsData == "":
        no_ads_relation = {"reasoning": "No ads data present", "classification": 0}
        no_ads_question = {"reasoning": "", "question": "", "options": []}
        return (no_ads_relation, 0), (no_ads_question, 0)

    def build_messages(system_prompt):
        # Fresh message list per call: system prompt with the ads appended,
        # followed by the page information as the user turn.
        return [
            {"role": "system", "content": system_prompt + adsData},
            {"role": "user", "content": page_information + "\nThe JSON response: "},
        ]

    relation_tokens = 0
    question_tokens = 0
    question_answer = {"reasoning": "", "question": "", "options": []}

    if bestRetreivedAdValue > self.relation_check_best_value_thresh:
        relation_answer, relation_tokens = self.callOpenAiApi(
            build_messages(relationSystemPrompt)
        )
    else:
        relation_answer = {
            "reasoning": "First retreived document value less than threshold so no need to check relation",
            "classification": 1,
        }

    if relation_answer["classification"] != 0:
        question_answer, question_tokens = self.callOpenAiApi(
            build_messages(questionSystemPrompt)
        )

    return (relation_answer, relation_tokens), (question_answer, question_tokens)
272
+
273
def convertDocumentsClustersToStringForApiCall(self, documents_clusters):
    """Serialise clusters of ``(document, score)`` pairs to a JSON string.

    Each cluster becomes one JSON object mapping ``"Ad N"`` to the document's
    ``page_content``; ``N`` keeps counting across clusters, starting at 1.
    """
    ad_numbers = count(1)
    payload = []
    for documents_cluster in documents_clusters:
        cluster_ads = {}
        for document in documents_cluster:
            cluster_ads[f"Ad {next(ad_numbers)}"] = document[0].page_content
        payload.append(cluster_ads)
    return json.dumps(payload, indent=4)
280
+
281
def changeDocumentsToPrintableString(self, documents_clusters):
    """Render clusters of ``(document, score)`` pairs as human-readable text.

    Every cluster gets a ``Cluster N-:`` header, one entry per ad (numbered
    continuously across clusters) and a trailing blank line.
    """
    # NOTE(review): the source lost its indentation; the blank line is assumed
    # to be emitted once per cluster rather than once per ad - confirm.
    pieces = []
    ad_number = 0
    for cluster_number, documents_cluster in enumerate(documents_clusters, start=1):
        pieces.append(f"Cluster {cluster_number}-:\n")
        for document in documents_cluster:
            ad_number += 1
            pieces.append(
                f"[Ad {ad_number}] Content: {document[0].page_content}\nRevenue: {document[0].metadata['revenue']}\nAd Click Count: {document[0].metadata['ad_click_count']}\nValue: {document[1]}\n"
            )
        pieces.append("\n")
    return "".join(pieces)
291
+
292
def changeResponseToPrintableString(self, response, task):
    """Format a parsed API answer as display text.

    For ``task == "relation"`` the reasoning and classification are shown.
    Otherwise the reasoning, question and every option with its ads are shown;
    ``response["options"]`` is iterated as a mapping of option -> list of ads.
    """
    if task == "relation":
        return f"Reasoning: {response['reasoning']}\n\nClassification: {response['classification']}\n"

    # NOTE(review): the source lost its indentation; the blank line is assumed
    # to follow each option's ad list - confirm.
    lines = [
        f"Reasoning: {response['reasoning']}\n\nQuestion: {response['question']}\n\nOptions: \n"
    ]
    options = response["options"]
    for option in options:
        lines.append(f"{option}\n")
        lines.extend(f"{ad}\n" for ad in options[option])
        lines.append("\n")
    return "".join(lines)
302
+
303
def logResult(self, curr_relation_prompt, curr_question_prompt, page_information, relation_answer, question_answer):
    """Print both prompts, the page information and both answers between rule lines.

    The answers are pretty-printed as JSON with a 4-space indent.
    """
    rule = "----------------------------------------------------------------------------------------------------------------------------------------------------------------"
    relation_json = json.dumps(relation_answer, indent=4)
    question_json = json.dumps(question_answer, indent=4)
    print(
        rule + "\n",
        curr_relation_prompt,
        curr_question_prompt,
        page_information,
        relation_json,
        question_json,
        "\n" + rule + "\n\n",
    )
305
+
306
def getRagResponse(
    self, RelationPrompt, QuestionPrompt, threshold, page_information
):
    """Answer a page query end to end and return the formatted report.

    Args:
        RelationPrompt: relation-check system prompt; ``None`` or an empty
            string falls back to ``self.bestRelationSystemPrompt``.
        QuestionPrompt: question-generation system prompt; ``None`` or an
            empty string falls back to ``self.bestQuestionSystemPrompt``.
        threshold: score cut-off forwarded to ``self.db.queryVectorDB``.
        page_information: the page text used as the query.

    Returns:
        A single string with the relation answer, the question answer, the
        retrieved document clusters and the token usage of both API calls.
    """
    # The original test ``X != None or len(X)`` raised on None and accepted "",
    # so the defaults could never take effect; plain truthiness covers both.
    curr_relation_prompt = (
        RelationPrompt if RelationPrompt else self.bestRelationSystemPrompt
    )
    curr_question_prompt = (
        QuestionPrompt if QuestionPrompt else self.bestQuestionSystemPrompt
    )

    documents_clusters, best_value = self.db.queryVectorDB(page_information, threshold)
    relation_answer, question_answer = (
        self.getBestQuestionOnTheBasisOfPageInformationAndAdsData(
            page_information,
            self.convertDocumentsClustersToStringForApiCall(documents_clusters),
            curr_relation_prompt,
            curr_question_prompt,
            best_value,
        )
    )
    # Log the question prompt as the second argument (the relation prompt was
    # previously passed twice).
    self.logResult(
        curr_relation_prompt,
        curr_question_prompt,
        page_information,
        relation_answer,
        question_answer,
    )

    docs_info = self.changeDocumentsToPrintableString(documents_clusters)
    relation_answer_string = self.changeResponseToPrintableString(
        relation_answer[0], "relation"
    )
    question_answer_string = self.changeResponseToPrintableString(
        question_answer[0], "question"
    )
    full_response = f"**ANSWER**: \n Relation answer:\n {relation_answer_string}\n Question answer:\n {question_answer_string}\n\n**RETREIVED DOCUMENTS CLUSTERS**:\n{docs_info}\n\n**TOKENS USED**:\nQuestion api call: {question_answer[1]}\nRelation api call: {relation_answer[1]}"
    return full_response
 
 
338
 
339
 
 
 
 
340
  data = pd.read_csv(data_file_path, sep="\t")
341
  # data.dropna(axis=0, how="any", inplace=True)
342
  data.drop_duplicates(subset=["ad_title", "ad_desc"], inplace=True)
343
  ad_title_content = list(data["ad_title"].values)
344
+ rag = ADS_RAG()
345
  with gr.Blocks() as demo:
346
  gr.Markdown("# RAG on ads data")
347
  with gr.Row():
348
  RelationPrompt = gr.Textbox(
349
+ rag.bestRelationSystemPrompt,
350
  lines=1,
351
  placeholder="Enter the relation system prompt for relation check",
352
  label="Relation System prompt",
353
  )
354
  QuestionPrompt = gr.Textbox(
355
+ rag.bestQuestionSystemPrompt,
356
  lines=1,
357
  placeholder="Enter the question system prompt for question formulation",
358
  label="Question System prompt",
 
361
  lines=1, placeholder="Enter the page information", label="Page Information"
362
  )
363
  threshold = gr.Number(
364
+ value=rag.db.default_threshold, label="Threshold", interactive=True
365
  )
366
  output = gr.Textbox(label="Output")
367
  submit_btn = gr.Button("Submit")
368
 
369
  submit_btn.click(
370
+ rag.getRagResponse,
371
  inputs=[RelationPrompt, QuestionPrompt, threshold, page_information],
372
  outputs=[output],
373
  )
374
  page_information.submit(
375
+ rag.getRagResponse,
376
  inputs=[RelationPrompt, QuestionPrompt, threshold, page_information],
377
  outputs=[output],
378
  )
requirements.txt CHANGED
@@ -4,4 +4,9 @@ langchain
4
  langchain-community
5
  langchain-openai
6
  faiss-cpu
7
- sentence-transformers
 
 
 
 
 
 
4
  langchain-community
5
  langchain-openai
6
  faiss-cpu
7
+ sentence-transformers
8
+ scikit-learn
9
+ scipy
10
+ numpy
11
+ pandas
12
+ openai
vectorDbAdsGetterForCSV.ipynb ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "/Users/lazyghost/VirtualEnvironments/langchain-rag-venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
13
+ " from .autonotebook import tqdm as notebook_tqdm\n",
14
+ "/Users/lazyghost/VirtualEnvironments/langchain-rag-venv/lib/python3.12/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
15
+ " warnings.warn(\n"
16
+ ]
17
+ }
18
+ ],
19
+ "source": [
20
+ "from langchain_community.vectorstores import FAISS\n",
21
+ "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
22
+ "import numpy as np\n",
23
+ "import os\n",
24
+ "\n",
25
+ "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
26
+ "\n",
27
+ "DB_FAISS_PATH = \"./vectorstore/db_faiss_ads_20May_20Jun_webmd_healthline_Health_dupRemoved0.8\"\n",
28
+ "embedding_model_hf = \"BAAI/bge-m3\"\n",
29
+ "embeddings_hf = HuggingFaceEmbeddings(model_name=embedding_model_hf)\n",
30
+ "db = FAISS.load_local(\n",
31
+ " DB_FAISS_PATH, embeddings_hf, allow_dangerous_deserialization=True\n",
32
+ ")"
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": 2,
38
+ "metadata": {},
39
+ "outputs": [],
40
+ "source": [
41
+ "api_data = \"\"\"\n",
42
+ "Page Title -: Shingles Symptoms, Causes, and Treatments\n",
43
+ "Page Content -: Slideshow: A Visual Guide to Shingles. Medically Reviewed by Jabeen Begum, MD on March 05, 2024. What is Shingles? 1 /18. If you've ever had the chickenpox -- and almost all adults have or have at least been exposed to it-- there's a good chance the virus is still at large in your body. The varicella zoster virus can lie dormant for decades without causing any symptoms. In some people, the virus wakes up and travels along nerve fibers to the skin. The result is a distinctive, painful rash called shingles\n",
44
+ "\"\"\"\n",
45
+ "api_data = \"\"\"\n",
46
+ "Page Title -: Best and Worst Snacks\n",
47
+ "Page Content -: 5 Snacks to Enjoy (and 5 to Avoid). Written by Amy Capetta. We all snack. But some snacks are better than others, especially if you’re managing type 2 diabetes or obesity. An ideal snack gives you protein or fiber -- or both -- to help you feel full, says Gillian Culbertson, RD, certified diabetes educator at the Cleveland Clinic. It should give you plenty of energy without too many calories. Aim for between 100 and 150 calories for women, and about 200 calories for men, with 15 to 20 grams of protein\n",
48
+ "\"\"\"\n",
49
+ "api_data = \"\"\"\n",
50
+ "Page Title -: End-Stage COPD (Stage IV)\n",
51
+ "Page Content -: End-Stage COPD (Stage IV). Medically Reviewed by Zilpah Sheikh, MD on November 13, 2023. Written by Alyson Powell Key , William Moore. What Is End-Stage COPD? End-stage, or stage IV, COPD is the final stage of chronic obstructive pulmonary disease. Most people reach it after years of living with the disease and the lung damage it causes. As a result, your quality of life is low. You’ll have exacerbations, or flares, often – one of which could be fatal\n",
52
+ "\"\"\"\n",
53
+ "api_data = \"\"\"\n",
54
+ "Page Title -: How to Lower Your A1c Level\n",
55
+ "Page Content -: English. How to Lower Your A1c Level. Medically Reviewed by Brunilda Nazario, MD on January 18, 2024. Written by Elizabeth Svoboda. When you have diabetes , you probably know you should check your blood sugar regularly. Your doctor will also recommend that you take an A1c blood test a few times a year, with a goal of lowering the results to help protect your health. And there’s a lot you can do to move toward meeting that goal\n",
56
+ "\"\"\"\n",
57
+ "ads = db.similarity_search_with_score(api_data, k = 100)\n"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "markdown",
62
+ "metadata": {},
63
+ "source": [
64
+ "# KMEANS CLUSTERING"
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "code",
69
+ "execution_count": 12,
70
+ "metadata": {},
71
+ "outputs": [
72
+ {
73
+ "name": "stdout",
74
+ "output_type": "stream",
75
+ "text": [
76
+ "0\n",
77
+ "8 =========> HOW TO LOWER A1C LEVELS INSTANTLY - Find HOW TO LOWER A1C LEVELS INSTANTLY 0.5105679\n",
78
+ "68 =========> WHAT FOODS LOWER A1C QUICKLY - Find WHAT FOODS LOWER A1C QUICKLY 0.65786356\n",
79
+ "27 =========> lower a1c levels naturally - Browse and Discover 0.5767728\n",
80
+ "37 =========> Best Snack to Help Lower A1C Levels - See Results For best snack to help lower a1c levels 0.60074186\n",
81
+ "\n",
82
+ "1\n",
83
+ "16 =========> Lower Blood Sugar Naturally - High Blood Glucose Symptoms 0.55070686\n",
84
+ "35 =========> How To Lower Blood Sugar Immediately - Type 2 Diabetes Diet 0.59327304\n",
85
+ "11 =========> Blood Sugar & Type 2 Diabetes - Get Helpful Tips & Resources 0.52739334\n",
86
+ "36 =========> Bring Down Blood Glucose Fast - Bring down Blood Sugar 0.5967244\n",
87
+ "17 =========> Reduce Your Blood Sugar Levels - Lower Blood Sugar At Home 0.55285\n",
88
+ "\n",
89
+ "2\n",
90
+ "6 =========> Low, Normal, High A1c Ranges - Blood Glucose Chart 0.47047788\n",
91
+ "1 =========> Learn More About Managing A1C - Help Reduce Your Risk 0.3698194\n",
92
+ "0 =========> A1C Levels - Help Reduce Your Risk 0.36521938\n",
93
+ "2 =========> A1C-Related Questions? - Learn More About Managing A1C 0.37610498\n",
94
+ "10 =========> Blood Glucose Chart - What is Normal A1C? 0.52313244\n",
95
+ "\n"
96
+ ]
97
+ }
98
+ ],
99
+ "source": [
100
+ "from sklearn.cluster import KMeans\n",
101
+ "from scipy.spatial.distance import euclidean\n",
102
+ "import re\n",
103
+ "import numpy as np\n",
104
+ "\n",
105
+ "\n",
106
+ "def remove_html_tags(text):  # returns text with every <...> span removed\n",
107
+ "    clean = re.compile('<.*?>')\n",
108
+ "    return re.sub(clean, '', text)\n",
109
+ "\n",
110
+ "no_of_clusters = 3\n",
111
+ "no_of_points = 5  # maximum members printed per cluster\n",
112
+ "kmeans = KMeans(n_clusters=no_of_clusters, random_state=42)\n",
113
+ "embeddings = np.array(embeddings_hf.embed_documents([remove_html_tags(doc[0].page_content) for doc in ads]))\n",
114
+ "kmeans.fit(embeddings)\n",
115
+ "cluster_centers = kmeans.cluster_centers_\n",
116
+ "labels = kmeans.labels_\n",
117
+ "closest_indices = [[] for _ in range(no_of_clusters)]  # per cluster: (index into ads, distance to that cluster's centre)\n",
118
+ "for i, embedding in enumerate(embeddings):\n",
119
+ "    cluster_idx = labels[i]\n",
120
+ "    center = cluster_centers[cluster_idx]\n",
121
+ "    dist = euclidean(embedding, center)\n",
122
+ "    closest_indices[cluster_idx].append((i, dist))\n",
123
+ "for i in range(no_of_clusters):\n",
124
+ "    closest_indices[i].sort(key=lambda x: x[1])  # smallest distance to the centre first\n",
125
+ "selected_indices = [closest_indices[i][:no_of_points] for i in range(no_of_clusters)]\n",
126
+ "\n",
127
+ "for ind, cluster in enumerate(selected_indices):\n",
128
+ "    print(ind)\n",
129
+ "    for cluster_point in cluster:\n",
130
+ "        doc_ind = cluster_point[0]\n",
131
+ "        print(f\"{doc_ind} =========> \", remove_html_tags(ads[doc_ind][0].page_content.split(\".\")[0]), ads[doc_ind][1])\n",
132
+ "    print()\n",
133
+ "\n",
134
+ "# starting_indexes = [[] for _ in range(no_of_clusters)]\n",
135
+ "# for i, label in enumerate(labels):\n",
136
+ "#     if len(starting_indexes[label]) < no_of_points:\n",
137
+ "#         starting_indexes[label].append(i)\n",
138
+ "#     if all(len(cluster) == no_of_points for cluster in starting_indexes):\n",
139
+ "#         break\n",
140
+ "\n",
141
+ "# for i, cluster in enumerate(starting_indexes):\n",
142
+ "#     print(i)\n",
143
+ "#     for id in cluster:\n",
144
+ "#         print(f\"{id} ====> {remove_html_tags(ads[id][0].page_content.split(\".\")[0])}, {ads[id][1]}\")\n",
145
+ "#     print()"
146
+ ]
147
+ },
148
+ {
149
+ "cell_type": "markdown",
150
+ "metadata": {},
151
+ "source": [
152
+ "# SPECTRAL CLUSTERING"
153
+ ]
154
+ },
155
+ {
156
+ "cell_type": "code",
157
+ "execution_count": 11,
158
+ "metadata": {},
159
+ "outputs": [
160
+ {
161
+ "name": "stdout",
162
+ "output_type": "stream",
163
+ "text": [
164
+ "0\n",
165
+ "4 ====> Lower Blood Sugar Naturally - Get Blood Glucose in Order, 0.4132922291755676\n",
166
+ "5 ====> Lower Blood Sugar Naturally - 5 Tips to Control Blood Sugar, 0.4696233868598938\n",
167
+ "9 ====> Lower Blood Sugar Naturally - Cinnamon Tackles Diabetes, 0.5150120854377747\n",
168
+ "12 ====> What Type II's Must Avoid - Lower Blood Sugar Naturally, 0.5334694385528564\n",
169
+ "16 ====> Lower Blood Sugar Naturally - High Blood Glucose Symptoms, 0.5507068634033203\n",
170
+ "\n",
171
+ "1\n",
172
+ "1 ====> Learn More About Managing A1C - Help Reduce Your Risk, 0.36981940269470215\n",
173
+ "3 ====> Learn More About Managing A1C - A Once-Daily Treatment Option, 0.3764190971851349\n",
174
+ "11 ====> Blood Sugar & Type 2 Diabetes - Get Helpful Tips & Resources, 0.5273933410644531\n",
175
+ "14 ====> Type 2 Diabetes Treatment - Official Patient Site, 0.5485913753509521\n",
176
+ "15 ====> What Foods Lower A1c Quickly - Keep Blood Sugar in Check, 0.550499677658081\n",
177
+ "\n",
178
+ "2\n",
179
+ "0 ====> A1C Levels - Help Reduce Your Risk, 0.365219384431839\n",
180
+ "2 ====> A1C-Related Questions? - Learn More About Managing A1C, 0.3761049807071686\n",
181
+ "6 ====> Low, Normal, High A1c Ranges - Blood Glucose Chart, 0.4704778790473938\n",
182
+ "7 ====> HbA1c Blood Test | Check Your Glucose Levels, 0.5101255774497986\n",
183
+ "8 ====> HOW TO LOWER A1C LEVELS INSTANTLY - Find HOW TO LOWER A1C LEVELS INSTANTLY, 0.5105679035186768\n",
184
+ "\n"
185
+ ]
186
+ }
187
+ ],
188
+ "source": [
189
+ "from sklearn.cluster import SpectralClustering\n",
190
+ "no_of_clusters = 3\n",
191
+ "no_of_points = 5  # maximum members kept per cluster\n",
192
+ "\n",
193
+ "spectral_clustering = SpectralClustering(n_clusters=no_of_clusters, affinity='nearest_neighbors', random_state=42)\n",
194
+ "labels = spectral_clustering.fit_predict(embeddings)\n",
195
+ "\n",
196
+ "starting_indexes = [[] for _ in range(no_of_clusters)]  # per cluster: indices into ads, in ascending index order\n",
197
+ "for i, label in enumerate(labels):\n",
198
+ "    if len(starting_indexes[label]) < no_of_points:\n",
199
+ "        starting_indexes[label].append(i)\n",
200
+ "    if all(len(cluster) == no_of_points for cluster in starting_indexes):\n",
201
+ "        break  # every cluster already holds no_of_points members\n",
202
+ "\n",
203
+ "# Print the members collected for each cluster: index, first sentence of the ad text, search score.\n",
204
+ "for i, cluster in enumerate(starting_indexes):\n",
205
+ "    print(i)\n",
206
+ "    for doc_id in cluster:\n",
207
+ "        print(f\"{doc_id} ====> {remove_html_tags(ads[doc_id][0].page_content.split('.')[0])}, {ads[doc_id][1]}\")\n",
208
+ "    print()\n",
209
+ "# for i in range(len(embeddings)):\n",
210
+ "#     cluster_idx = labels[i]\n",
211
+ "#     dist = distances[i, :].sum() / distances.shape[0]\n",
212
+ "#     closest_indices[cluster_idx].append((i, dist))\n",
213
+ "\n",
214
+ "# closest_indices\n",
215
+ "# # # Sort closest indices based on distance to cluster center\n",
216
+ "# for i in range(no_of_clusters):\n",
217
+ "#     closest_indices[i].sort(key=lambda x: x[0])\n"
218
+ ]
219
+ }
220
+ ],
221
+ "metadata": {
222
+ "kernelspec": {
223
+ "display_name": "langchain-rag-venv",
224
+ "language": "python",
225
+ "name": "python3"
226
+ },
227
+ "language_info": {
228
+ "codemirror_mode": {
229
+ "name": "ipython",
230
+ "version": 3
231
+ },
232
+ "file_extension": ".py",
233
+ "mimetype": "text/x-python",
234
+ "name": "python",
235
+ "nbconvert_exporter": "python",
236
+ "pygments_lexer": "ipython3",
237
+ "version": "3.12.3"
238
+ }
239
+ },
240
+ "nbformat": 4,
241
+ "nbformat_minor": 2
242
+ }
vectorDbAdsGetterForCSV.py DELETED
@@ -1,34 +0,0 @@
1
-
2
- from langchain_community.vectorstores import FAISS
3
- from langchain_community.embeddings import HuggingFaceEmbeddings
4
- import pandas as pd
5
- import os
6
-
7
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
8
-
9
-
10
- DB_FAISS_PATH = "./vectorstore/db_faiss_ads"
11
- embedding_model_hf = "BAAI/bge-m3"
12
- embeddings_hf = HuggingFaceEmbeddings(model_name=embedding_model_hf)
13
- db = FAISS.load_local(
14
- DB_FAISS_PATH, embeddings_hf, allow_dangerous_deserialization=True
15
- )
16
-
17
- df = pd.read_csv("136_results_webmd_healthline_1000 - results_webmd_healthline_1000.tsv", sep = '\t')
18
- for index, row in df.iterrows():
19
- print(index, end = ',')
20
- if row['relation_classification'] == 0:
21
- api_data = row['publisher_url']
22
- retreived_documents = [
23
- doc
24
- for doc in db.similarity_search_with_score(api_data, k = 20)
25
- if doc[1] < 1.05
26
- ]
27
- docs_info = "\n\n".join(
28
- [
29
- f"Revenue: {doc[0].metadata['revenue']}\nAd Click Count: {doc[0].metadata['ad_click_count']}\nContent: {doc[0].page_content}\nValue: {doc[1]}"
30
- for doc in retreived_documents
31
- ]
32
- )
33
- df.at[index, 'ads_data'] = docs_info
34
- df.to_csv("136_results_webmd_healthline_1000.tsv", sep='\t')