Spaces:

MediaNetAdsRag
/

Ads_Rag

Sleeping

App Files Files Community

Rajat.bans committed 24 days ago

Commit

ead6614

•

1 Parent(s): 09c3be4

Upgraded ads rag

Browse files

Files changed (4) hide show

rag.py +262 -180
requirements.txt +6 -1
vectorDbAdsGetterForCSV.ipynb +242 -0
vectorDbAdsGetterForCSV.py +0 -34

rag.py CHANGED Viewed

@@ -7,19 +7,110 @@ import random
 import pandas as pd
 import os
 import json
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
 load_dotenv(override=True)
-client = OpenAI()
-DB_FAISS_PATH = "./vectorstore/db_faiss_ads_20May_20Jun_webmd_healthline_Health_dupRemoved0.8"
 data_file_path = "./data/142_adclick_20May_20Jun_webmd_healthline_Health_dupRemoved0.8_someAdsCampaign.tsv"
 embedding_model_hf = "BAAI/bge-m3"
-qa_model_name = "gpt-3.5-turbo"
-default_threshold = 0.75
-relation_check_best_value_thresh = 0.6
-number_of_ads_to_fetch_from_db = 15
-bestRelationSystemPrompt = """You are an advertising concierge for text ads on websites. Given an INPUT(PAGE_TITLE) and the available ad inventory (ADS_DATA), your task is to determine whether there are some relevant ADS to INPUT are present in ADS_DATA. ADS WHICH DON'T MATCH USER'S INTENT SHOULD BE CONSIDERED IRRELEVANT
 ---------------------------------------
@@ -43,12 +134,12 @@ Expected json output :
 The ADS_DATA provided to you is as follows:
-"""
-bestQuestionSystemPrompt = """1. You are an advertising concierge for text ads on websites. Given an INPUT(PAGE_TITLE)  and the available ad inventory (ADS_DATA), your task is to form a relevant QUESTION to ask the user visiting the webpage. This question should help identify the user's intent behind visiting the webpage.
-2. From the ADS_DATA, discard all ads that are not related to the INPUT or do not match the user's intent behind visiting the page. Also, remove any ads that are distantly related to the user's intent.
-3. FROM REMAINING ADS ONLY, group together those that are similar in type. For each grouped ads form an OPTION which should be both the answer for the QUESTION and related to ads in this group.
-4. Try to generate QUESTION within 70 characters and keep either 2, 3 or 4 number of OPTIONS.
 5. Provide your REASONING behind choosing the QUESTION and the OPTIONS. Now provide the QUESTION and the OPTIONS. Along with each OPTION, provide the ads from ADS_DATA that you associated with it.
 ---------------------------------------
@@ -57,19 +148,13 @@ bestQuestionSystemPrompt = """1. You are an advertising concierge for text ads o
 The Effects of Aging on Skin
 <Sample ADS_DATA>
-Ad 1: Forget Retinol, Use This Household Item To Fill In Wrinkles - Celebrities Are Ditching Pricey Facelifts For This.
-Ad 2: Stop Covering Your Wrinkles with Make Up - Do This Instead.
-Ad 3: Living With Migraines? - Discover A Treatment Option. Learn about a type of prescription migraine treatment called CGRP receptor antagonists. Discover a range of resources that may help people dealing with migraines.
-Ad 4: What is Advanced Skin Cancer? - Find Disease Information Here.Find Facts About Advanced Skin Cancer and a Potential Treatment Option.
-Ad 5: Learn About Advanced Melanoma - Find Disease Information Here.Find Facts About Advanced Melanoma and a Potential Treatment Option.
-Ad 6: Treatment For CKD - Reduce Risk Of Progressing CKD. Ask About A Treatment That Can Help Reduce Your Risk Of Kidney Failure.
-Ad 7: Are You Living With Vitiligo? - For Patients & Caregivers.Discover An FDA-Approved Topical Cream That May Help With Nonsegmental Vitiligo Repigmentation. Learn About A Copay Savings Card For Eligible Patients With Vitiligo.
 <Expected json output>
 {
-"reasoning" : "Among the seven ads in **Sample ADS_DATA**, Ads 3 and 6 are irrelevant to the INPUT, so they should be discarded. Ad 1, 2 closely aligns with the user's intent and are mutually exclusive, so they can be presented as two separate options. Ads 4, 5, and 7 can be grouped into a single option, since they are similar and also relevant to INPUT. The question will be formed in a way to connect the PAGE TITLE content with the goals of these five relevant ads, making sure they appeal to both specific and general user interests.",
 "question": "Which of the following methods to combat aging skin are you most interested in?",
-"options": {"1. Retinol Alternatives for Wrinkle Treatment." : ["Ad 2: Forget Retinol, Use This Household Item To Fill In Wrinkles - Celebrities Are Ditching Pricey Facelifts For This."], "2. Reduce Wrinkles without Makeup.": ["Ad 1: Stop Covering Your Wrinkles with Make Up - Do This Instead."], "3. Information on Skin Diseases": ["Ad 7: Are You Living With Vitiligo? - For Patients & Caregivers.Discover An FDA-Approved Topical Cream That May Help With Nonsegmental Vitiligo Repigmentation. Learn About A Copay Savings Card For Eligible Patients With Vitiligo.", "Ad 4: What is Advanced Skin Cancer? - Find Disease Information Here.Find Facts About Advanced Skin Cancer and a Potential Treatment Option.", "Ad 5: Learn About Advanced Melanoma - Find Disease Information Here.Find Facts About Advanced Melanoma and a Potential Treatment Option."]}
 }
 -----------------------------------------------
@@ -88,189 +173,186 @@ Got A Rosemary Bush? Here’re 20 Brilliant & Unusual Ways To Use All That Rosem
 -----------------------------------------------
 The ADS_DATA provided to you is as follows:
-"""
-old_system_prompt_additional_example = """
------------------------------------------------
-<Sample INPUT(PAGE_TITLE)>
-7 Signs and Symptoms of Magnesium Deficiency
-<Sample ADS_DATA>
-Ad 1: 4 Warning Signs Of Dementia - Fight Dementia and Memory Loss. 100% Natural Program To Prevent Cognitive Decline. Developed By Dr. Will Mitchell. Read The Reviews-Get a Special Offer. Doctor Recommended. High Quality Standards. 60-Day Refund.
-Ad 2: About Hyperkalemia - Learn About The Symptoms. High Potassium Can Be A Serious Condition. Learn More About Hyperkalemia Today.
-Ad 3: Weak or Paralyzed Muscles? - A Common Symptom of Cataplexy. About 70% of People With Narcolepsy Are Believed to Have Cataplexy Symptoms. Learn More. Download the Doctor Discussion Guide to Have a Informed Conversation About Your Health.
-<Expected json output>
-{
-"reasoning" : "Given the input '7 Signs and Symptoms of Magnesium Deficiency,' it is evident that the user is looking for information specifically about magnesium deficiency. Ads 1, 2, and 3 discuss topics such as dementia, hyperkalemia, weak muscles, which are not related to magnesium deficiency in any way. Therefore, all the ads in the ADS_DATA are not suitable for the user's query and will be discarded.",
-"question": "No related ads available to form question and options.",
-"options": []
-}
-------------------------------------------------
-"""
-embeddings_hf = HuggingFaceEmbeddings(model_name=embedding_model_hf)
-def getBestQuestionOnTheBasisOfPageInformationAndAdsData(
-    page_information,
-    adsData,
-    relationSystemPrompt,
-    questionSystemPrompt,
-    bestRetreivedAdValue,
-):
-    if adsData == "":
-        return ({"reasoning": "No ads data present", "classification": 0}, 0), (
-            {"reasoning": "", "question": "", "options": []},
-            0,
-        )
-    relation_answer = {"reasoning": "", "classification": 1}
-    question_answer = {"reasoning": "", "question": "", "options": []}
-    tokens_used_relation = 0
-    tokens_used_question = 0
-    while True:
-        try:
-            if bestRetreivedAdValue > relation_check_best_value_thresh:
-                system_message = {
-                    "role": "system",
-                    "content": relationSystemPrompt + adsData,
-                }
-                response = client.chat.completions.create(
-                    model=qa_model_name,
-                    messages=[system_message]
-                    + [
-                        {
-                            "role": "user",
-                            "content": page_information + "\nThe JSON response: ",
-                        }
-                    ],
-                    temperature=0,
-                    seed=42,
-                    max_tokens=1000,
-                    response_format={"type": "json_object"},
-                )
-                tokens_used_relation = response.usage.total_tokens
-                relation_answer = json.loads(response.choices[0].message.content)
-                tokens_used_question = 0
-            else:
-                relation_answer["reasoning"] = "First retreived document value less than threshold so no need to check relation"
-            if relation_answer["classification"] != 0:
-                system_message = {
-                    "role": "system",
-                    "content": questionSystemPrompt + adsData,
-                }
-                response = client.chat.completions.create(
-                    model=qa_model_name,
-                    messages=[system_message]
-                    + [
-                        {
-                            "role": "user",
-                            "content": page_information + "\nThe JSON response: ",
-                        }
-                    ],
                     temperature=0,
                     seed=42,
                     max_tokens=1000,
                     response_format={"type": "json_object"},
                 )
-                tokens_used_question = response.usage.total_tokens
-                question_answer = json.loads(response.choices[0].message.content)
-            break
-        except Exception as e:
-            print("Error-: ", e.message)
-            print("Trying Again")
-    return (relation_answer, tokens_used_relation), (
-        question_answer,
-        tokens_used_question,
-    )
-def changeResponseToPrintableString(response, task):
-    if task == "relation":
-        return f"Reasoning: {response['reasoning']}\n\nClassification: {response['classification']}\n"
-    res = f"Reasoning: {response['reasoning']}\n\nQuestion: {response['question']}\n\nOptions: \n"
-    for option in response["options"]:
-        res += f"{option}\n"
-        for ad in response["options"][option]:
-            res += f"{ad}\n"
-        res += "\n"
-    return res
-def getRagResponse(RelationPrompt, QuestionPrompt, threshold, page_information):
-    curr_relation_prompt = bestRelationSystemPrompt
-    if RelationPrompt != None or len(RelationPrompt):
-        curr_relation_prompt = RelationPrompt
-    curr_question_prompt = bestQuestionSystemPrompt
-    if QuestionPrompt != None or len(QuestionPrompt):
-        curr_question_prompt = QuestionPrompt
-    retreived_documents = [
-        doc
-        for doc in db.similarity_search_with_score(
-            page_information, k=number_of_ads_to_fetch_from_db
-        )
-        if doc[1] < threshold
-    ]
-    best_value = 1
-    if len(retreived_documents):
-        best_value = retreived_documents[0][1]
-    relation_answer, question_answer = (
-        getBestQuestionOnTheBasisOfPageInformationAndAdsData(
-            page_information,
-            ".\n".join(
-                [
-                    "Ad " + str(i + 1) + ". " + doc[0].page_content
-                    for i, doc in enumerate(retreived_documents)
                 ]
-            ),
-            curr_relation_prompt,
-            curr_question_prompt,
-            best_value,
         )
-    )
-    print("QUERY:", page_information, relation_answer, question_answer)
-    docs_info = "\n\n".join(
-        [
-            # f"Publisher url: {doc[0].metadata['publisher_url']}\nKeyword Term: {doc[0].metadata['keyword_term']}\nAd Display Url: {doc[0].metadata['ad_display_url']}\nRevenue: {doc[0].metadata['revenue']}\nAd Click Count: {doc[0].metadata['ad_click_count']}\nContent: {doc[0].page_content}\nValue: {doc[1]}"
-            f"{i+1}. Content: {doc[0].page_content}\nRevenue: {doc[0].metadata['revenue']}\nAd Click Count: {doc[0].metadata['ad_click_count']}\nValue: {doc[1]}"
-            for i, doc in enumerate(retreived_documents)
-        ]
-    )
-    try:
-        relation_answer_string = changeResponseToPrintableString(
             relation_answer[0], "relation"
         )
-        question_answer_string = changeResponseToPrintableString(
             question_answer[0], "question"
         )
-        full_response = f"**ANSWER**: \n Relation answer:\n {relation_answer_string}\n Question answer:\n {question_answer_string}\n\n**RETREIVED DOCUMENTS**:\n{docs_info}\n\n**TOKENS USED**:\nQuestion api call: {question_answer[1]}\nRelation api call: {relation_answer[1]}"
-    except:
-        full_response = f"Invalid response received"
-    return full_response
-db = FAISS.load_local(
-    DB_FAISS_PATH, embeddings_hf, allow_dangerous_deserialization=True
-)
 data = pd.read_csv(data_file_path, sep="\t")
 # data.dropna(axis=0, how="any", inplace=True)
 data.drop_duplicates(subset=["ad_title", "ad_desc"], inplace=True)
 ad_title_content = list(data["ad_title"].values)
 with gr.Blocks() as demo:
     gr.Markdown("# RAG on ads data")
     with gr.Row():
         RelationPrompt = gr.Textbox(
-            bestRelationSystemPrompt,
             lines=1,
             placeholder="Enter the relation system prompt for relation check",
             label="Relation System prompt",
         )
         QuestionPrompt = gr.Textbox(
-            bestQuestionSystemPrompt,
             lines=1,
             placeholder="Enter the question system prompt for question formulation",
             label="Question System prompt",
@@ -279,18 +361,18 @@ with gr.Blocks() as demo:
             lines=1, placeholder="Enter the page information", label="Page Information"
         )
         threshold = gr.Number(
-            value=default_threshold, label="Threshold", interactive=True
         )
     output = gr.Textbox(label="Output")
     submit_btn = gr.Button("Submit")
     submit_btn.click(
-        getRagResponse,
         inputs=[RelationPrompt, QuestionPrompt, threshold, page_information],
         outputs=[output],
     )
     page_information.submit(
-        getRagResponse,
         inputs=[RelationPrompt, QuestionPrompt, threshold, page_information],
         outputs=[output],
     )

 import pandas as pd
 import os
 import json
+from sklearn.cluster import KMeans, SpectralClustering
+from scipy.spatial.distance import euclidean
+import re
+import numpy as np
+from itertools import count
 load_dotenv(override=True)
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
 data_file_path = "./data/142_adclick_20May_20Jun_webmd_healthline_Health_dupRemoved0.8_someAdsCampaign.tsv"
 embedding_model_hf = "BAAI/bge-m3"
+embeddings_hf = HuggingFaceEmbeddings(model_name=embedding_model_hf)
class CLUSTERING:
    """Group embeddings into clusters and pick a few members from each.

    Supported values for ``clustering_algo``:

    * ``"kmeans-cc"`` - K-Means; the members of every cluster are ordered by
      their Euclidean distance to the cluster centre, closest first.
    * ``"kmeans-sp"`` - K-Means; the members of every cluster are kept in
      input order.
    * ``"spectral"`` / ``"spectral_clustering"`` - spectral clustering with a
      nearest-neighbours affinity; members are kept in input order.
    """

    _KMEANS_ALGOS = {"kmeans-cc", "kmeans-sp"}
    # Both spellings are accepted: the code historically checked "spectral"
    # while the accompanying comment advertised "spectral_clustering".
    _SPECTRAL_ALGOS = {"spectral", "spectral_clustering"}

    def __init__(self, clustering_algo="kmeans-cc"):
        self.clustering_algo = clustering_algo

    @staticmethod
    def _first_members(labels, no_of_clusters, no_of_points):
        """Return, per cluster, the first ``no_of_points`` input indices."""
        members = [[] for _ in range(no_of_clusters)]
        for index, label in enumerate(labels):
            if len(members[label]) < no_of_points:
                members[label].append(index)
        return members

    def cluster_embeddings(self, embeddings, no_of_clusters, no_of_points):
        """Cluster ``embeddings`` and return selected row indices per cluster.

        Args:
            embeddings: 2-D array-like with one embedding per row.
            no_of_clusters: Requested number of clusters. It is capped at the
                number of rows, because K-Means cannot build more clusters
                than there are samples.
            no_of_points: Maximum number of indices returned per cluster.

        Returns:
            A list with one list of row indices per cluster; an empty list
            when there is nothing to cluster.

        Raises:
            ValueError: If ``clustering_algo`` is not a supported name.
        """
        algo = self.clustering_algo
        if algo not in self._KMEANS_ALGOS | self._SPECTRAL_ALGOS:
            raise ValueError(f"Unknown clustering algorithm: {algo!r}")

        embeddings = np.asarray(embeddings)
        no_of_samples = len(embeddings) if embeddings.ndim else 0
        no_of_clusters = min(no_of_clusters, no_of_samples)
        if no_of_clusters <= 0:
            return []

        if algo in self._KMEANS_ALGOS:
            kmeans = KMeans(n_clusters=no_of_clusters, random_state=42)
            kmeans.fit(embeddings)
            labels = kmeans.labels_
            if algo != "kmeans-cc":
                return self._first_members(labels, no_of_clusters, no_of_points)

            # Distance of every sample to the centre of its own cluster.
            distances = np.linalg.norm(
                embeddings - kmeans.cluster_centers_[labels], axis=1
            )
            members = [[] for _ in range(no_of_clusters)]
            for index, (label, distance) in enumerate(zip(labels, distances)):
                members[label].append((index, distance))
            return [
                [
                    index
                    for index, _ in sorted(cluster, key=lambda member: member[1])[
                        :no_of_points
                    ]
                ]
                for cluster in members
            ]

        spectral_clustering = SpectralClustering(
            n_clusters=no_of_clusters,
            affinity='nearest_neighbors',
            random_state=42,
        )
        labels = spectral_clustering.fit_predict(embeddings)
        return self._first_members(labels, no_of_clusters, no_of_points)
class VECTOR_DB:
    """FAISS-backed ad store that returns retrieved ads grouped into clusters."""

    # Compiled once instead of on every query.
    _HTML_TAG_PATTERN = re.compile("<.*?>")

    def __init__(self):
        self.DB_FAISS_PATH = "./vectorstore/db_faiss_ads_20May_20Jun_webmd_healthline_Health_dupRemoved0.8"
        self.default_threshold = 0.75
        self.number_of_ads_to_fetch_from_db = 50
        self.no_of_clusters = 3
        self.no_of_ads_in_each_cluster = 6
        self.db = FAISS.load_local(
            self.DB_FAISS_PATH, embeddings_hf, allow_dangerous_deserialization=True
        )

    def queryVectorDB(self, page_information, threshold):
        """Retrieve ads for ``page_information`` and group them into clusters.

        Args:
            page_information: Query text passed to the similarity search.
            threshold: Only results whose score is strictly below this value
                are kept (presumably the score is a distance, lower meaning
                closer - verify against the index configuration).

        Returns:
            ``(documents_clusters, best_value)``. ``documents_clusters`` is a
            list of clusters, each a list of ``(document, score)`` pairs with
            HTML tags stripped from ``page_content``. ``best_value`` is the
            score of the first kept result, or ``1`` when nothing was kept.
        """
        retreived_documents = [
            doc
            for doc in self.db.similarity_search_with_score(
                page_information, k=self.number_of_ads_to_fetch_from_db
            )
            if doc[1] < threshold
        ]
        if not retreived_documents:
            # Nothing to embed or cluster; clustering an empty set would raise.
            return [], 1

        for document, _score in retreived_documents:
            document.page_content = self._HTML_TAG_PATTERN.sub(
                "", document.page_content
            )

        embeddings = np.array(
            embeddings_hf.embed_documents(
                [document.page_content for document, _score in retreived_documents]
            )
        )
        # Clustering cannot produce more clusters than there are documents.
        no_of_clusters = min(self.no_of_clusters, len(retreived_documents))
        clustered_indices = CLUSTERING().cluster_embeddings(
            embeddings, no_of_clusters, self.no_of_ads_in_each_cluster
        )
        documents_clusters = [
            [retreived_documents[ind] for ind in cluster_indices]
            for cluster_indices in clustered_indices
        ]
        return documents_clusters, retreived_documents[0][1]
+class ADS_RAG:
+    def __init__(self):
+        self.client = OpenAI()
+        self.db = VECTOR_DB()
+        self.qa_model_name = "gpt-3.5-turbo"
+        self.relation_check_best_value_thresh = 0.6
+        self.bestRelationSystemPrompt = """You are an advertising concierge for text ads on websites. Given an INPUT(PAGE_TITLE) and the available ad inventory (ADS_DATA), your task is to determine whether there are some relevant ADS to INPUT are present in ADS_DATA. ADS WHICH DON'T MATCH USER'S INTENT SHOULD BE CONSIDERED IRRELEVANT
 ---------------------------------------
 The ADS_DATA provided to you is as follows:
+        """
+        self.bestQuestionSystemPrompt = """1. You are an advertising concierge for text ads on websites. Given an INPUT(PAGE_TITLE)  and the available ad inventory (ADS_DATA), your task is to form a relevant QUESTION to ask the user visiting the webpage. This question should help identify the user's intent behind visiting the webpage.
+2. From the ADS_DATA clusters, discard all ads that are not related to the INPUT or do not match the user's intent behind visiting the page. Also, remove any ads that are distantly related to the user's intent.
+3. FROM REMAINING ADS  in each ads cluster form an OPTION which should be both the answer for the QUESTION and related to ads in this cluster.
+4. Try to generate intelligent creatives for advertising and keep QUESTION within 70 characters and each OPTION with either 4, 5, or 6 words.
 5. Provide your REASONING behind choosing the QUESTION and the OPTIONS. Now provide the QUESTION and the OPTIONS. Along with each OPTION, provide the ads from ADS_DATA that you associated with it.
 ---------------------------------------
 The Effects of Aging on Skin
 <Sample ADS_DATA>
+[{"Ad 1": "Forget Retinol, Use This Household Item To Fill In Wrinkles - Celebrities Are Ditching Pricey Facelifts For This.", "Ad 2": "Stop Covering Your Wrinkles with Make Up - Do This Instead."}, {"Ad 3": "Living With Migraines? - Discover A Treatment Option. Learn about a type of prescription migraine treatment called CGRP receptor antagonists. Discover a range of resources that may help people dealing with migraines."}, {"Ad 4": "What is Advanced Skin Cancer? - Find Disease Information Here.Find Facts About Advanced Skin Cancer and a Potential Treatment Option.", "Ad 5": "Learn About Advanced Melanoma - Find Disease Information Here.Find Facts About Advanced Melanoma and a Potential Treatment Option.", "Ad 6": "Treatment For CKD - Reduce Risk Of Progressing CKD. Ask About A Treatment That Can Help Reduce Your Risk Of Kidney Failure.", "Ad 7": "Are You Living With Vitiligo? - For Patients & Caregivers.Discover An FDA-Approved Topical Cream That May Help With Nonsegmental Vitiligo Repigmentation. Learn About A Copay Savings Card For Eligible Patients With Vitiligo."}]
 <Expected json output>
 {
+"reasoning" : "Among the seven ads in **Sample ADS_DATA**, Ads 3 and 6 are irrelevant to the INPUT, so they should be discarded. Ad 1, 2, 4, 5, and 7 are relevant to INPUT. The question will be formed in a way to connect the PAGE TITLE content with the goals of these five relevant ads, making sure they appeal to both specific and general user interests.",
 "question": "Which of the following methods to combat aging skin are you most interested in?",
+"options": {"1. Reduce Wrinkles without Makeup.": ["Ad 1: Stop Covering Your Wrinkles with Make Up - Do This Instead."], "2. Retinol Alternatives for Wrinkle Treatment." : ["Ad 2: Forget Retinol, Use This Household Item To Fill In Wrinkles - Celebrities Are Ditching Pricey Facelifts For This."], "3. Information on Skin Diseases": ["Ad 4: What is Advanced Skin Cancer? - Find Disease Information Here.Find Facts About Advanced Skin Cancer and a Potential Treatment Option.", "Ad 5: Learn About Advanced Melanoma - Find Disease Information Here.Find Facts About Advanced Melanoma and a Potential Treatment Option.", "Ad 7: Are You Living With Vitiligo? - For Patients & Caregivers.Discover An FDA-Approved Topical Cream That May Help With Nonsegmental Vitiligo Repigmentation. Learn About A Copay Savings Card For Eligible Patients With Vitiligo."]}
 }
 -----------------------------------------------
 -----------------------------------------------
 The ADS_DATA provided to you is as follows:
+        """
+        old_system_prompt_additional_example = """
+        -----------------------------------------------
+        <Sample INPUT(PAGE_TITLE)>
+        7 Signs and Symptoms of Magnesium Deficiency
+        <Sample ADS_DATA>
+        Ad 1: 4 Warning Signs Of Dementia - Fight Dementia and Memory Loss. 100% Natural Program To Prevent Cognitive Decline. Developed By Dr. Will Mitchell. Read The Reviews-Get a Special Offer. Doctor Recommended. High Quality Standards. 60-Day Refund.
+        Ad 2: About Hyperkalemia - Learn About The Symptoms. High Potassium Can Be A Serious Condition. Learn More About Hyperkalemia Today.
+        Ad 3: Weak or Paralyzed Muscles? - A Common Symptom of Cataplexy. About 70% of People With Narcolepsy Are Believed to Have Cataplexy Symptoms. Learn More. Download the Doctor Discussion Guide to Have a Informed Conversation About Your Health.
+        <Expected json output>
+        {
+        "reasoning" : "Given the input '7 Signs and Symptoms of Magnesium Deficiency,' it is evident that the user is looking for information specifically about magnesium deficiency. Ads 1, 2, and 3 discuss topics such as dementia, hyperkalemia, weak muscles, which are not related to magnesium deficiency in any way. Therefore, all the ads in the ADS_DATA are not suitable for the user's query and will be discarded.",
+        "question": "No related ads available to form question and options.",
+        "options": []
+        }
+        ------------------------------------------------
+        """
+    def callOpenAiApi(self, messages):
+        while True:
+            try:
+                response = self.client.chat.completions.create(
+                    model=self.qa_model_name,
+                    messages=messages,
                     temperature=0,
                     seed=42,
                     max_tokens=1000,
                     response_format={"type": "json_object"},
                 )
+                tokens_used = response.usage.total_tokens
+                answer = json.loads(response.choices[0].message.content)
+                return answer, tokens_used
+            except Exception as e:
+                print("Error-: ", e.message)
+                print("Trying Again")
+    def getBestQuestionOnTheBasisOfPageInformationAndAdsData(
+        self,
+        page_information,
+        adsData,
+        relationSystemPrompt,
+        questionSystemPrompt,
+        bestRetreivedAdValue,
+    ):
+        if adsData == "":
+            return ({"reasoning": "No ads data present", "classification": 0}, 0), (
+                {"reasoning": "", "question": "", "options": []},
+                0,
+            )
+        relation_answer = {"reasoning": "", "classification": 1}
+        question_answer = {"reasoning": "", "question": "", "options": []}
+        tokens_used_relation = 0
+        tokens_used_question = 0
+        if bestRetreivedAdValue > self.relation_check_best_value_thresh:
+            relation_answer, tokens_used_relation = self.callOpenAiApi(
+                [
+                    {
+                        "role": "system",
+                        "content": relationSystemPrompt + adsData,
+                    }
+                ]
+                + [
+                    {
+                        "role": "user",
+                        "content": page_information + "\nThe JSON response: ",
+                    }
+                ]
+            )
+            tokens_used_question = 0
+        else:
+            relation_answer["reasoning"] = (
+                "First retreived document value less than threshold so no need to check relation"
+            )
+        if relation_answer["classification"] != 0:
+            question_answer, tokens_used_question = self.callOpenAiApi([
+                    {
+                        "role": "system",
+                        "content": questionSystemPrompt + adsData,
+                    }
                 ]
+                + [
+                    {
+                        "role": "user",
+                        "content": page_information + "\nThe JSON response: ",
+                    }
+                ])
+        return (relation_answer, tokens_used_relation), (
+            question_answer,
+            tokens_used_question,
         )
+    def convertDocumentsClustersToStringForApiCall(self, documents_clusters):
+        key_counter = count(1)
+        res = json.dumps([
+            {f"Ad {next(key_counter)}": document[0].page_content for j, document in enumerate(documents_cluster)}
+            for i, documents_cluster in enumerate(documents_clusters)
+        ], indent=4)
+        return res
+    def changeDocumentsToPrintableString(self, documents_clusters):
+        res = ""
+        i = 0
+        for ind, documents_cluster in enumerate(documents_clusters):
+            res += f"Cluster {ind+1}-:\n"
+            for document in documents_cluster:
+                i += 1
+                res += f"[Ad {i}] Content: {document[0].page_content}\nRevenue: {document[0].metadata['revenue']}\nAd Click Count: {document[0].metadata['ad_click_count']}\nValue: {document[1]}\n"
+            res += "\n"
+        return res
+    def changeResponseToPrintableString(self, response, task):
+        if task == "relation":
+            return f"Reasoning: {response['reasoning']}\n\nClassification: {response['classification']}\n"
+        res = f"Reasoning: {response['reasoning']}\n\nQuestion: {response['question']}\n\nOptions: \n"
+        for option in response["options"]:
+            res += f"{option}\n"
+            for ad in response["options"][option]:
+                res += f"{ad}\n"
+            res += "\n"
+        return res
    def logResult(self, curr_relation_prompt, curr_question_prompt, page_information, relation_answer, question_answer):
        """Print one request's prompts, page information and both answers to stdout.

        The two answers are rendered with ``json.dumps(..., indent=4)`` and the
        whole record is framed by dashed separator lines.
        """
        print("----------------------------------------------------------------------------------------------------------------------------------------------------------------\n", curr_relation_prompt, curr_question_prompt, page_information, json.dumps(relation_answer, indent=4), json.dumps(question_answer, indent=4), "\n----------------------------------------------------------------------------------------------------------------------------------------------------------------\n\n")
+    def getRagResponse(
+        self, RelationPrompt, QuestionPrompt, threshold, page_information
+    ):
+        curr_relation_prompt = self.bestRelationSystemPrompt
+        if RelationPrompt != None or len(RelationPrompt):
+            curr_relation_prompt = RelationPrompt
+        curr_question_prompt = self.bestQuestionSystemPrompt
+        if QuestionPrompt != None or len(QuestionPrompt):
+            curr_question_prompt = QuestionPrompt
+        documents_clusters, best_value = self.db.queryVectorDB(page_information, threshold)
+        relation_answer, question_answer = (
+            self.getBestQuestionOnTheBasisOfPageInformationAndAdsData(
+                page_information,
+                self.convertDocumentsClustersToStringForApiCall(documents_clusters),
+                curr_relation_prompt,
+                curr_question_prompt,
+                best_value,
+            )
+        )
+        self.logResult(curr_relation_prompt, curr_relation_prompt, page_information, relation_answer, question_answer)
+        docs_info = self.changeDocumentsToPrintableString(documents_clusters)
+        relation_answer_string = self.changeResponseToPrintableString(
             relation_answer[0], "relation"
         )
+        question_answer_string = self.changeResponseToPrintableString(
             question_answer[0], "question"
         )
+        full_response = f"**ANSWER**: \n Relation answer:\n {relation_answer_string}\n Question answer:\n {question_answer_string}\n\n**RETREIVED DOCUMENTS CLUSTERS**:\n{docs_info}\n\n**TOKENS USED**:\nQuestion api call: {question_answer[1]}\nRelation api call: {relation_answer[1]}"
+        return full_response
 data = pd.read_csv(data_file_path, sep="\t")
 # data.dropna(axis=0, how="any", inplace=True)
 data.drop_duplicates(subset=["ad_title", "ad_desc"], inplace=True)
 ad_title_content = list(data["ad_title"].values)
+rag = ADS_RAG()
 with gr.Blocks() as demo:
     gr.Markdown("# RAG on ads data")
     with gr.Row():
         RelationPrompt = gr.Textbox(
+            rag.bestRelationSystemPrompt,
             lines=1,
             placeholder="Enter the relation system prompt for relation check",
             label="Relation System prompt",
         )
         QuestionPrompt = gr.Textbox(
+            rag.bestQuestionSystemPrompt,
             lines=1,
             placeholder="Enter the question system prompt for question formulation",
             label="Question System prompt",
             lines=1, placeholder="Enter the page information", label="Page Information"
         )
         threshold = gr.Number(
+            value=rag.db.default_threshold, label="Threshold", interactive=True
         )
     output = gr.Textbox(label="Output")
     submit_btn = gr.Button("Submit")
     submit_btn.click(
+        rag.getRagResponse,
         inputs=[RelationPrompt, QuestionPrompt, threshold, page_information],
         outputs=[output],
     )
     page_information.submit(
+        rag.getRagResponse,
         inputs=[RelationPrompt, QuestionPrompt, threshold, page_information],
         outputs=[output],
     )

requirements.txt CHANGED Viewed

@@ -4,4 +4,9 @@ langchain
 langchain-community
 langchain-openai
 faiss-cpu
-sentence-transformers

 langchain-community
 langchain-openai
 faiss-cpu
+sentence-transformers
+scikit-learn
+scipy
+numpy
+pandas
+openai

vectorDbAdsGetterForCSV.ipynb ADDED Viewed

	@@ -0,0 +1,242 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/lazyghost/VirtualEnvironments/langchain-rag-venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n",
+      "/Users/lazyghost/VirtualEnvironments/langchain-rag-venv/lib/python3.12/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
+      "  warnings.warn(\n"
+     ]
+    }
+   ],
+   "source": [
+    "from langchain_community.vectorstores import FAISS\n",
+    "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
+    "import numpy as np\n",
+    "import os\n",
+    "\n",
+    "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
+    "\n",
+    "DB_FAISS_PATH = \"./vectorstore/db_faiss_ads_20May_20Jun_webmd_healthline_Health_dupRemoved0.8\"\n",
+    "embedding_model_hf = \"BAAI/bge-m3\"\n",
+    "embeddings_hf = HuggingFaceEmbeddings(model_name=embedding_model_hf)\n",
+    "db = FAISS.load_local(\n",
+    "    DB_FAISS_PATH, embeddings_hf, allow_dangerous_deserialization=True\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "api_data = \"\"\"\n",
+    "Page Title -: Shingles Symptoms, Causes, and Treatments\n",
+    "Page Content -: Slideshow: A Visual Guide to Shingles. Medically Reviewed by Jabeen Begum, MD on March 05, 2024. What is Shingles? 1 /18. If you've ever had the chickenpox -- and almost all adults have or have at least been exposed to it-- there's a good chance the virus is still at large in your body. The varicella zoster virus can lie dormant for decades without causing any symptoms. In some people, the virus wakes up and travels along nerve fibers to the skin. The result is a distinctive, painful rash called shingles\n",
+    "\"\"\"\n",
+    "api_data = \"\"\"\n",
+    "Page Title -: Best and Worst Snacks\n",
+    "Page Content -: 5 Snacks to Enjoy (and 5 to Avoid). Written by Amy Capetta. We all snack. But some snacks are better than others, especially if you’re managing type 2 diabetes or obesity. An ideal snack gives you protein or fiber -- or both -- to help you feel full, says Gillian Culbertson, RD, certified diabetes educator at the Cleveland Clinic. It should give you plenty of energy without too many calories. Aim for between 100 and 150 calories for women, and about 200 calories for men, with 15 to 20 grams of protein\n",
+    "\"\"\"\n",
+    "api_data = \"\"\"\n",
+    "Page Title -: End-Stage COPD (Stage IV)\n",
+    "Page Content -: End-Stage COPD (Stage IV). Medically Reviewed by Zilpah Sheikh, MD on November 13, 2023. Written by Alyson Powell Key , William Moore. What Is End-Stage COPD? End-stage, or stage IV, COPD is the final stage of chronic obstructive pulmonary disease. Most people reach it after years of living with the disease and the lung damage it causes. As a result, your quality of life is low. You’ll have exacerbations, or flares, often – one of which could be fatal\n",
+    "\"\"\"\n",
+    "api_data = \"\"\"\n",
+    "Page Title -: How to Lower Your A1c Level\n",
+    "Page Content -: English. How to Lower Your A1c Level. Medically Reviewed by Brunilda Nazario, MD on January 18, 2024. Written by Elizabeth Svoboda. When you have diabetes , you probably know you should check your blood sugar regularly. Your doctor will also recommend that you take an A1c blood test a few times a year, with a goal of lowering the results to help protect your health. And there’s a lot you can do to move toward meeting that goal\n",
+    "\"\"\"\n",
+    "ads = db.similarity_search_with_score(api_data, k = 100)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# KMEANS CLUSTERING"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0\n",
+      "8 =========>  HOW TO LOWER A1C LEVELS INSTANTLY - Find HOW TO LOWER A1C LEVELS INSTANTLY 0.5105679\n",
+      "68 =========>  WHAT FOODS LOWER A1C QUICKLY - Find WHAT FOODS LOWER A1C QUICKLY 0.65786356\n",
+      "27 =========>  lower a1c levels naturally - Browse and Discover 0.5767728\n",
+      "37 =========>  Best Snack to Help Lower A1C Levels - See Results For best snack to help lower a1c levels 0.60074186\n",
+      "\n",
+      "1\n",
+      "16 =========>  Lower Blood Sugar Naturally - High Blood Glucose Symptoms 0.55070686\n",
+      "35 =========>  How To Lower Blood Sugar Immediately - Type 2 Diabetes Diet 0.59327304\n",
+      "11 =========>  Blood Sugar & Type 2 Diabetes - Get Helpful Tips & Resources 0.52739334\n",
+      "36 =========>  Bring Down Blood Glucose Fast - Bring down Blood Sugar 0.5967244\n",
+      "17 =========>  Reduce Your Blood Sugar Levels - Lower Blood Sugar At Home 0.55285\n",
+      "\n",
+      "2\n",
+      "6 =========>  Low, Normal, High A1c Ranges - Blood Glucose Chart 0.47047788\n",
+      "1 =========>  Learn More About Managing A1C - Help Reduce Your Risk 0.3698194\n",
+      "0 =========>  A1C Levels - Help Reduce Your Risk 0.36521938\n",
+      "2 =========>  A1C-Related Questions? - Learn More About Managing A1C 0.37610498\n",
+      "10 =========>  Blood Glucose Chart - What is Normal A1C? 0.52313244\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.cluster import KMeans\n",
+    "from scipy.spatial.distance import euclidean\n",
+    "import re\n",
+    "import numpy as np\n",
+    "\n",
+    "\n",
+    "def remove_html_tags(text):\n",
+    "    clean = re.compile('<.*?>')\n",
+    "    return re.sub(clean, '', text)\n",
+    "\n",
+    "no_of_clusters = 3\n",
+    "no_of_points = 5\n",
+    "kmeans = KMeans(n_clusters=no_of_clusters, random_state=42)\n",
+    "embeddings = np.array(embeddings_hf.embed_documents([remove_html_tags(doc[0].page_content) for doc in ads]))\n",
+    "kmeans.fit(embeddings)\n",
+    "cluster_centers = kmeans.cluster_centers_\n",
+    "labels = kmeans.labels_\n",
+    "closest_indices = [[] for _ in range(no_of_clusters)]\n",
+    "for i, embedding in enumerate(embeddings):\n",
+    "    cluster_idx = labels[i]\n",
+    "    center = cluster_centers[cluster_idx]\n",
+    "    dist = euclidean(embedding, center)\n",
+    "    closest_indices[cluster_idx].append((i, dist))\n",
+    "for i in range(no_of_clusters):\n",
+    "    closest_indices[i].sort(key=lambda x: x[1])\n",
+    "selected_indices = [closest_indices[i][:no_of_points] for i in range(no_of_clusters)]\n",
+    "\n",
+    "for ind, cluster in enumerate(selected_indices):\n",
+    "    print(f\"{ind}\")\n",
+    "    for cluster_point in cluster:\n",
+    "        doc_ind = cluster_point[0]\n",
+    "        print(f\"{doc_ind} =========> \", remove_html_tags(ads[doc_ind][0].page_content.split(\".\")[0]), ads[doc_ind][1])\n",
+    "    print()\n",
+    "\n",
+    "# starting_indexes = [[] for _ in range(no_of_clusters)]\n",
+    "# for i, label in enumerate(labels):\n",
+    "#     if len(starting_indexes[label]) < no_of_points:\n",
+    "#         starting_indexes[label].append(i)\n",
+    "#     if all(len(cluster) == no_of_points for cluster in starting_indexes):\n",
+    "#         break\n",
+    "\n",
+    "# for i, cluster in enumerate(starting_indexes):\n",
+    "#     print(i)\n",
+    "#     for id in cluster:\n",
+    "#         print(f\"{id} ====> {remove_html_tags(ads[id][0].page_content.split(\".\")[0])}, {ads[id][1]}\")\n",
+    "#     print()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# SPECTRAL CLUSTERING"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0\n",
+      "4 ====> Lower Blood Sugar Naturally - Get Blood Glucose in Order, 0.4132922291755676\n",
+      "5 ====> Lower Blood Sugar Naturally - 5 Tips to Control Blood Sugar, 0.4696233868598938\n",
+      "9 ====> Lower Blood Sugar Naturally - Cinnamon Tackles Diabetes, 0.5150120854377747\n",
+      "12 ====> What Type II's Must Avoid - Lower Blood Sugar Naturally, 0.5334694385528564\n",
+      "16 ====> Lower Blood Sugar Naturally - High Blood Glucose Symptoms, 0.5507068634033203\n",
+      "\n",
+      "1\n",
+      "1 ====> Learn More About Managing A1C - Help Reduce Your Risk, 0.36981940269470215\n",
+      "3 ====> Learn More About Managing A1C - A Once-Daily Treatment Option, 0.3764190971851349\n",
+      "11 ====> Blood Sugar & Type 2 Diabetes - Get Helpful Tips & Resources, 0.5273933410644531\n",
+      "14 ====> Type 2 Diabetes Treatment - Official Patient Site, 0.5485913753509521\n",
+      "15 ====> What Foods Lower A1c Quickly - Keep Blood Sugar in Check, 0.550499677658081\n",
+      "\n",
+      "2\n",
+      "0 ====> A1C Levels - Help Reduce Your Risk, 0.365219384431839\n",
+      "2 ====> A1C-Related Questions? - Learn More About Managing A1C, 0.3761049807071686\n",
+      "6 ====> Low, Normal, High A1c Ranges - Blood Glucose Chart, 0.4704778790473938\n",
+      "7 ====> HbA1c Blood Test | Check Your Glucose Levels, 0.5101255774497986\n",
+      "8 ====> HOW TO LOWER A1C LEVELS INSTANTLY - Find HOW TO LOWER A1C LEVELS INSTANTLY, 0.5105679035186768\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.cluster import SpectralClustering\n",
+    "no_of_clusters = 3\n",
+    "no_of_points = 5\n",
+    "\n",
+    "spectral_clustering = SpectralClustering(n_clusters=no_of_clusters, affinity='nearest_neighbors', random_state=42)\n",
+    "labels = spectral_clustering.fit_predict(embeddings)\n",
+    "\n",
+    "starting_indexes = [[] for _ in range(no_of_clusters)]\n",
+    "for i, label in enumerate(labels):\n",
+    "    if len(starting_indexes[label]) < no_of_points:\n",
+    "        starting_indexes[label].append(i)\n",
+    "    if all(len(cluster) == no_of_points for cluster in starting_indexes):\n",
+    "        break\n",
+    "\n",
+    "starting_indexes\n",
+    "for i, cluster in enumerate(starting_indexes):\n",
+    "    print(i)\n",
+    "    for id in cluster:\n",
+    "        print(f\"{id} ====> {remove_html_tags(ads[id][0].page_content.split(\".\")[0])}, {ads[id][1]}\")\n",
+    "    print()\n",
+    "# for i in range(len(embeddings)):\n",
+    "#     cluster_idx = labels[i]\n",
+    "#     dist = distances[i, :].sum() / distances.shape[0]\n",
+    "#     closest_indices[cluster_idx].append((i, dist))\n",
+    "\n",
+    "# closest_indices\n",
+    "# # # Sort closest indices based on distance to cluster center\n",
+    "# for i in range(no_of_clusters):\n",
+    "#     closest_indices[i].sort(key=lambda x: x[0])\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "langchain-rag-venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

vectorDbAdsGetterForCSV.py DELETED Viewed

@@ -1,34 +0,0 @@
-from langchain_community.vectorstores import FAISS
-from langchain_community.embeddings import HuggingFaceEmbeddings
-import pandas as pd
-import os
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
-DB_FAISS_PATH = "./vectorstore/db_faiss_ads"
-embedding_model_hf = "BAAI/bge-m3"
-embeddings_hf = HuggingFaceEmbeddings(model_name=embedding_model_hf)
-db = FAISS.load_local(
-    DB_FAISS_PATH, embeddings_hf, allow_dangerous_deserialization=True
-)
-df = pd.read_csv("136_results_webmd_healthline_1000 - results_webmd_healthline_1000.tsv", sep = '\t')
-for index, row in df.iterrows():
-    print(index, end = ',')
-    if row['relation_classification'] == 0:
-        api_data = row['publisher_url']
-        retreived_documents = [
-            doc
-            for doc in db.similarity_search_with_score(api_data, k = 20)
-            if doc[1] < 1.05
-        ]
-        docs_info = "\n\n".join(
-            [
-                f"Revenue: {doc[0].metadata['revenue']}\nAd Click Count: {doc[0].metadata['ad_click_count']}\nContent: {doc[0].page_content}\nValue: {doc[1]}"
-                for doc in retreived_documents
-            ]
-        )
-        df.at[index, 'ads_data'] = docs_info
-df.to_csv("136_results_webmd_healthline_1000.tsv", sep='\t')