Rajat.bans committed on
Commit
91bc51f
1 Parent(s): e29066a

Added the code

.DS_Store ADDED
Binary file (6.15 kB).
 
.env ADDED
@@ -0,0 +1 @@
+ OPENAI_API_KEY=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
.gitattributes CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.tsv filter=lfs diff=lfs merge=lfs -text
+ data/ filter=lfs diff=lfs merge=lfs -text
+ vectorstore/ filter=lfs diff=lfs merge=lfs -text
+ documents/ filter=lfs diff=lfs merge=lfs -text
+ *.faiss filter=lfs diff=lfs merge=lfs -text
+ document/* filter=lfs diff=lfs merge=lfs -text
+ *.pdf filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -10,3 +10,23 @@ pinned: false
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+
+ ## Steps for running RAG:
+ 1. Create a .env file in the root folder and add the following environment variable:
+ ```
+ OPENAI_API_KEY=<YOUR OPENAI KEY>
+ ```
+ 2. Run the following commands:
+ ```
+ pip3 install -r requirements.txt
+ python3 rag.py
+ ```
+
+
+ Neo4j RAG course - https://www.deeplearning.ai/short-courses/knowledge-graphs-rag/
+ 1. LangChain vectorstore retriever documentation - https://python.langchain.com/v0.1/docs/modules/data_connection/retrievers/vectorstore/
+ 2. https://medium.com/@shaktikanungo2019/conversational-ai-unveiling-the-first-rag-chatbot-with-langchain-8b9b04ee4b63
+ 3. https://medium.com/@vikrambhat2/building-a-rag-system-and-conversational-chatbot-with-custom-data-793e9617a865
+ 4. https://abvijaykumar.medium.com/retrieval-augmented-generation-rag-with-llamaindex-1828ef80314c
+ 5. https://medium.com/the-ai-forum/implementing-agentic-rag-using-langchain-b22af7f6a3b5
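
Before launching the full Gradio app, a quick way to confirm that steps 1–2 above are wired up correctly is a minimal smoke test of the OpenAI client with the key loaded from .env. This is a hypothetical check, not part of the commit; it reuses the same `load_dotenv` and `client.chat.completions.create` calls that rag.py makes:

```
# smoke_test.py -- hypothetical helper, not included in this commit.
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv(override=True)  # reads OPENAI_API_KEY from .env
client = OpenAI()           # picks the key up from the environment

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Reply with the single word: ok"}],
    temperature=0,
)
print(response.choices[0].message.content)
```

If this prints a reply, rag.py should be able to reach the OpenAI API with the same credentials.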
data/133_ads_vogon_13May-27May.tsv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:333665767c0d25886b7f33ab98ec0191c216a76e7288deb2cf21f10a9445f65a
+ size 146425733
rag.py ADDED
@@ -0,0 +1,105 @@
+ from dotenv import load_dotenv
+ from langchain_community.vectorstores import FAISS
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ import gradio as gr
+ from openai import OpenAI
+ import random
+ import pandas as pd
+ import os
+
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"  # silence the tokenizers fork warning
+
+ load_dotenv(override=True)  # loads OPENAI_API_KEY from .env
+ client = OpenAI()
+ DB_FAISS_PATH = "./vectorstore/db_faiss_ads"
+ data_file_path = "./data/133_ads_vogon_13May-27May.tsv"
+ embedding_model_hf = "BAAI/bge-m3"
+ qa_model_name = "gpt-3.5-turbo"
+ bestSystemPrompt = "You're an assistant for question-formulation tasks. You need to form a question related to the given INPUT and the mutually exclusive options for leading a user to relevant ads. Your approach must be systematic and meticulous. First, identify CLUES such as keywords, phrases, contextual information, semantic relations, tones, and references that aid in determining the context of the INPUT. Second, construct a concise diagnostic REASONING process (limited to 130 words) based on premises supporting the INPUT relevance within the provided ADS_DATA. Third, utilizing the identified clues, reasoning, and input, furnish the pertinent MUTUALLY EXCLUSIVE options for the ADS_DATA. The ADS_DATA is as follows:"
+
+ embeddings_hf = HuggingFaceEmbeddings(model_name=embedding_model_hf)
+
+
+ def getBestQuestionOnTheBasisOfPageContextAndAdsData(page_information, adsData, systemPrompt):
+     system_message = {"role": "system", "content": systemPrompt + adsData}  # retrieved ads are appended to the system prompt
+
+     response = client.chat.completions.create(
+         model=qa_model_name,
+         messages=[system_message] + [{"role": "user", "content": page_information}],
+         temperature=0,
+     )
+     answer = response.choices[0].message.content
+     return answer
+
+
+ def getRagResponse(QuestionPrompt, threshold, page_information):
+     curr_question_prompt = bestSystemPrompt
+     if QuestionPrompt is not None and len(QuestionPrompt):  # use the user-supplied prompt when one is given
+         curr_question_prompt = QuestionPrompt
+
+     retrieved_documents = [  # top-20 ads, keeping only those whose distance score is below the threshold
+         doc
+         for doc in db.similarity_search_with_score(page_information, k=20)
+         if doc[1] < threshold
+     ]
+     answer = getBestQuestionOnTheBasisOfPageContextAndAdsData(
+         page_information,
+         ". ".join([doc[0].page_content for doc in retrieved_documents]),
+         curr_question_prompt,
+     )
+     docs_info = "\n\n".join(
+         [
+             f"Publisher url: {doc[0].metadata['publisher_url']}\nKeyword Term: {doc[0].metadata['keyword_term']}\nAd Display Url: {doc[0].metadata['ad_display_url']}\nRevenue: {doc[0].metadata['revenue']}\nAd Click Count: {doc[0].metadata['ad_click_count']}\nContent: {doc[0].page_content}\nValue: {doc[1]}"
+             for doc in retrieved_documents
+         ]
+     )
+     full_response = f"Answer: {answer}\n\nRetrieved Documents:\n{docs_info}"
+     return full_response
+
+
+ db = FAISS.load_local(  # prebuilt FAISS index over the ads data
+     DB_FAISS_PATH, embeddings_hf, allow_dangerous_deserialization=True
+ )
+ data = pd.read_csv(data_file_path, sep="\t")
+ data.dropna(axis=0, how="any", inplace=True)
+ data = data.iloc[:5000, :]
+ ad_title_content = list(data["ad_title"].values)
+ with gr.Blocks() as demo:
+     gr.Markdown("# RAG on ads data")
+     with gr.Row():
+         QuestionPrompt = gr.Textbox(
+             bestSystemPrompt,
+             lines=1,
+             placeholder="Enter the system prompt for question formulation",
+             label="Question System prompt",
+         )
+         page_information = gr.Textbox(
+             lines=1, placeholder="Enter the page information", label="Page Information"
+         )
+         threshold = gr.Number(value=1.0, label="Threshold", interactive=True)
+     output = gr.Textbox(label="Output")
+     submit_btn = gr.Button("Submit")
+
+     submit_btn.click(
+         getRagResponse,
+         inputs=[QuestionPrompt, threshold, page_information],
+         outputs=[output],
+     )
+     page_information.submit(
+         getRagResponse,
+         inputs=[QuestionPrompt, threshold, page_information],
+         outputs=[output],
+     )
+     with gr.Accordion("Ad Titles", open=False):
+         ad_titles = gr.Markdown()
+
+     demo.load(  # show a random sample of up to 100 ad titles when the page loads
+         lambda: "<br>".join(
+             random.sample([str(ad_title) for ad_title in ad_title_content], min(100, len(ad_title_content)))
+         ),
+         None,
+         ad_titles,
+     )
+
+ gr.close_all()
+ demo.launch()
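
rag.py loads a prebuilt index from ./vectorstore/db_faiss_ads, but the commit does not include the script that built it. Below is a minimal sketch of how such an index could be created from the TSV with the same BAAI/bge-m3 embeddings; it assumes the TSV has an ad_title column plus the metadata columns that getRagResponse reads (publisher_url, keyword_term, ad_display_url, revenue, ad_click_count), and the actual build script may differ:

```
# build_index.py -- hypothetical, not part of this commit.
import pandas as pd
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

data = pd.read_csv("./data/133_ads_vogon_13May-27May.tsv", sep="\t")
data.dropna(axis=0, how="any", inplace=True)

# Assumed mapping: ad_title is the searchable text, and the columns that
# rag.py reads from doc.metadata become per-document metadata.
docs = [
    Document(
        page_content=str(row["ad_title"]),
        metadata={
            "publisher_url": row["publisher_url"],
            "keyword_term": row["keyword_term"],
            "ad_display_url": row["ad_display_url"],
            "revenue": row["revenue"],
            "ad_click_count": row["ad_click_count"],
        },
    )
    for _, row in data.iterrows()
]

embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-m3")
db = FAISS.from_documents(docs, embeddings)
db.save_local("./vectorstore/db_faiss_ads")
```

With LangChain's default FAISS settings, similarity_search_with_score returns an L2 distance, so lower scores mean closer matches; that is why getRagResponse keeps only documents with doc[1] < threshold.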
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ gradio
+ python-dotenv
+ langchain
+ langchain-community
+ langchain-openai
+ faiss-cpu
+ sentence-transformers
vectorstore/.DS_Store ADDED
Binary file (6.15 kB).
 
vectorstore/db_faiss_ads/index.faiss ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fa26ebd79a0c29b97f5047804c9147819c86aced6d309eeb17e6f4380e88ff4e
+ size 204800045
vectorstore/db_faiss_ads/index.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b756d121fc57d372759c32abbf6dca2d82d35d753f7dc24bf55477269a83123f
+ size 22199403