Rajat.bans committed on
Commit
91bc51f
1 Parent(s): e29066a

Added the code

.DS_Store ADDED
Binary file (6.15 kB).
 
.env ADDED
@@ -0,0 +1 @@
+ OPENAI_API_KEY=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
.gitattributes CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.tsv filter=lfs diff=lfs merge=lfs -text
+ data/ filter=lfs diff=lfs merge=lfs -text
+ vectorstore/ filter=lfs diff=lfs merge=lfs -text
+ documents/ filter=lfs diff=lfs merge=lfs -text
+ *.faiss filter=lfs diff=lfs merge=lfs -text
+ document/* filter=lfs diff=lfs merge=lfs -text
+ *.pdf filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -10,3 +10,23 @@ pinned: false
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+
+ ## Steps for running RAG:
+ 1. Create a .env file in the root folder and add the following environment variable:
+ ```
+ OPENAI_API_KEY=<YOUR OPENAI KEY>
+ ```
+ 2. Run the following commands:
+ ```
+ pip3 install -r requirements.txt
+ python3 rag.py
+ ```
+
+
+ Neo4j RAG course - https://www.deeplearning.ai/short-courses/knowledge-graphs-rag/
+ 1. LangChain vectorstore retriever documentation - https://python.langchain.com/v0.1/docs/modules/data_connection/retrievers/vectorstore/
+ 2. https://medium.com/@shaktikanungo2019/conversational-ai-unveiling-the-first-rag-chatbot-with-langchain-8b9b04ee4b63
+ 3. https://medium.com/@vikrambhat2/building-a-rag-system-and-conversational-chatbot-with-custom-data-793e9617a865
+ 4. https://abvijaykumar.medium.com/retrieval-augmented-generation-rag-with-llamaindex-1828ef80314c
+ 5. https://medium.com/the-ai-forum/implementing-agentic-rag-using-langchain-b22af7f6a3b5
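
Before launching the full Gradio app, a quick way to confirm that steps 1–2 above are wired up correctly is a minimal smoke test of the OpenAI client with the key loaded from .env. This is a hypothetical check, not part of the commit; it reuses the same `load_dotenv` and `client.chat.completions.create` calls that rag.py makes:

```
# smoke_test.py -- hypothetical helper, not included in this commit.
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv(override=True)  # reads OPENAI_API_KEY from .env
client = OpenAI()           # picks the key up from the environment

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Reply with the single word: ok"}],
    temperature=0,
)
print(response.choices[0].message.content)
```

If this prints a reply, rag.py should be able to reach the OpenAI API with the same credentials.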
data/133_ads_vogon_13May-27May.tsv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:333665767c0d25886b7f33ab98ec0191c216a76e7288deb2cf21f10a9445f65a
+ size 146425733
rag.py ADDED
@@ -0,0 +1,105 @@
+ from dotenv import load_dotenv
+ from langchain_community.vectorstores import FAISS
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ import gradio as gr
+ from openai import OpenAI
+ import random
+ import pandas as pd
+ import os
+
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"  # silence the tokenizers fork warning
+
+ load_dotenv(override=True)  # loads OPENAI_API_KEY from .env
+ client = OpenAI()
+ DB_FAISS_PATH = "./vectorstore/db_faiss_ads"
+ data_file_path = "./data/133_ads_vogon_13May-27May.tsv"
+ embedding_model_hf = "BAAI/bge-m3"
+ qa_model_name = "gpt-3.5-turbo"
+ bestSystemPrompt = "You're an assistant for question-formulation tasks. You need to form a question related to the given INPUT and the mutually exclusive options for leading a user to relevant ads. Your approach must be systematic and meticulous. First, identify CLUES such as keywords, phrases, contextual information, semantic relations, tones, and references that aid in determining the context of the INPUT. Second, construct a concise diagnostic REASONING process (limited to 130 words) based on premises supporting the INPUT relevance within the provided ADS_DATA. Third, utilizing the identified clues, reasoning, and input, furnish the pertinent MUTUALLY EXCLUSIVE options for the ADS_DATA. The ADS_DATA is as follows:"
+
+ embeddings_hf = HuggingFaceEmbeddings(model_name=embedding_model_hf)
+
+
+ def getBestQuestionOnTheBasisOfPageContextAndAdsData(page_information, adsData, systemPrompt):
+     system_message = {"role": "system", "content": systemPrompt + adsData}  # retrieved ads are appended to the system prompt
+
+     response = client.chat.completions.create(
+         model=qa_model_name,
+         messages=[system_message] + [{"role": "user", "content": page_information}],
+         temperature=0,
+     )
+     answer = response.choices[0].message.content
+     return answer
+
+
+ def getRagResponse(QuestionPrompt, threshold, page_information):
+     curr_question_prompt = bestSystemPrompt
+     if QuestionPrompt is not None and len(QuestionPrompt):  # use the user-supplied prompt when one is given
+         curr_question_prompt = QuestionPrompt
+
+     retrieved_documents = [  # top-20 ads, keeping only those whose distance score is below the threshold
+         doc
+         for doc in db.similarity_search_with_score(page_information, k=20)
+         if doc[1] < threshold
+     ]
+     answer = getBestQuestionOnTheBasisOfPageContextAndAdsData(
+         page_information,
+         ". ".join([doc[0].page_content for doc in retrieved_documents]),
+         curr_question_prompt,
+     )
+     docs_info = "\n\n".join(
+         [
+             f"Publisher url: {doc[0].metadata['publisher_url']}\nKeyword Term: {doc[0].metadata['keyword_term']}\nAd Display Url: {doc[0].metadata['ad_display_url']}\nRevenue: {doc[0].metadata['revenue']}\nAd Click Count: {doc[0].metadata['ad_click_count']}\nContent: {doc[0].page_content}\nValue: {doc[1]}"
+             for doc in retrieved_documents
+         ]
+     )
+     full_response = f"Answer: {answer}\n\nRetrieved Documents:\n{docs_info}"
+     return full_response
+
+
+ db = FAISS.load_local(  # prebuilt FAISS index over the ads data
+     DB_FAISS_PATH, embeddings_hf, allow_dangerous_deserialization=True
+ )
+ data = pd.read_csv(data_file_path, sep="\t")
+ data.dropna(axis=0, how="any", inplace=True)
+ data = data.iloc[:5000, :]
+ ad_title_content = list(data["ad_title"].values)
+ with gr.Blocks() as demo:
+     gr.Markdown("# RAG on ads data")
+     with gr.Row():
+         QuestionPrompt = gr.Textbox(
+             bestSystemPrompt,
+             lines=1,
+             placeholder="Enter the system prompt for question formulation",
+             label="Question System prompt",
+         )
+         page_information = gr.Textbox(
+             lines=1, placeholder="Enter the page information", label="Page Information"
+         )
+         threshold = gr.Number(value=1.0, label="Threshold", interactive=True)
+     output = gr.Textbox(label="Output")
+     submit_btn = gr.Button("Submit")
+
+     submit_btn.click(
+         getRagResponse,
+         inputs=[QuestionPrompt, threshold, page_information],
+         outputs=[output],
+     )
+     page_information.submit(
+         getRagResponse,
+         inputs=[QuestionPrompt, threshold, page_information],
+         outputs=[output],
+     )
+     with gr.Accordion("Ad Titles", open=False):
+         ad_titles = gr.Markdown()
+
+     demo.load(  # show a random sample of up to 100 ad titles when the page loads
+         lambda: "<br>".join(
+             random.sample([str(ad_title) for ad_title in ad_title_content], min(100, len(ad_title_content)))
+         ),
+         None,
+         ad_titles,
+     )
+
+ gr.close_all()
+ demo.launch()
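
rag.py loads a prebuilt index from ./vectorstore/db_faiss_ads, but the commit does not include the script that built it. Below is a minimal sketch of how such an index could be created from the TSV with the same BAAI/bge-m3 embeddings; it assumes the TSV has an ad_title column plus the metadata columns that getRagResponse reads (publisher_url, keyword_term, ad_display_url, revenue, ad_click_count), and the actual build script may differ:

```
# build_index.py -- hypothetical, not part of this commit.
import pandas as pd
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

data = pd.read_csv("./data/133_ads_vogon_13May-27May.tsv", sep="\t")
data.dropna(axis=0, how="any", inplace=True)

# Assumed mapping: ad_title is the searchable text, and the columns that
# rag.py reads from doc.metadata become per-document metadata.
docs = [
    Document(
        page_content=str(row["ad_title"]),
        metadata={
            "publisher_url": row["publisher_url"],
            "keyword_term": row["keyword_term"],
            "ad_display_url": row["ad_display_url"],
            "revenue": row["revenue"],
            "ad_click_count": row["ad_click_count"],
        },
    )
    for _, row in data.iterrows()
]

embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-m3")
db = FAISS.from_documents(docs, embeddings)
db.save_local("./vectorstore/db_faiss_ads")
```

With LangChain's default FAISS settings, similarity_search_with_score returns an L2 distance, so lower scores mean closer matches; that is why getRagResponse keeps only documents with doc[1] < threshold.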
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ gradio
+ python-dotenv
+ langchain
+ langchain-community
+ langchain-openai
+ faiss-cpu
+ sentence-transformers
vectorstore/.DS_Store ADDED
Binary file (6.15 kB).
 
vectorstore/db_faiss_ads/index.faiss ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fa26ebd79a0c29b97f5047804c9147819c86aced6d309eeb17e6f4380e88ff4e
+ size 204800045
vectorstore/db_faiss_ads/index.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b756d121fc57d372759c32abbf6dca2d82d35d753f7dc24bf55477269a83123f
+ size 22199403