Spaces:
Sleeping
Sleeping
Rajat.bans
committed on
Commit
•
91bc51f
1
Parent(s):
e29066a
Added the code
Browse files- .DS_Store +0 -0
- .env +1 -0
- .gitattributes +7 -0
- README.md +20 -0
- data/133_ads_vogon_13May-27May.tsv +3 -0
- rag.py +105 -0
- requirements.txt +7 -0
- vectorstore/.DS_Store +0 -0
- vectorstore/db_faiss_ads/index.faiss +3 -0
- vectorstore/db_faiss_ads/index.pkl +3 -0
.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
.env
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
OPENAI_API_KEY=&lt;REDACTED&gt;  (SECURITY: a live OpenAI API key was committed here — revoke it immediately and keep .env out of version control)
|
.gitattributes
CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*.tsv filter=lfs diff=lfs merge=lfs -text
|
37 |
+
data/ filter=lfs diff=lfs merge=lfs -text
|
38 |
+
vectorstore/ filter=lfs diff=lfs merge=lfs -text
|
39 |
+
documents/ filter=lfs diff=lfs merge=lfs -text
|
40 |
+
*.faiss filter=lfs diff=lfs merge=lfs -text
|
41 |
+
document/* filter=lfs diff=lfs merge=lfs -text
|
42 |
+
*.pdf filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
@@ -10,3 +10,23 @@ pinned: false
|
|
10 |
---
|
11 |
|
12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
---
|
11 |
|
12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
13 |
+
|
14 |
+
|
15 |
+
## Steps for running rag -:
|
16 |
+
1. Create .env file in root folder and add the following environment variables
|
17 |
+
```
|
18 |
+
OPENAI_API_KEY=<YOUR OPENAI KEY>
|
19 |
+
```
|
20 |
+
2. Run the following commands:
|
21 |
+
```
|
22 |
+
pip3 install -r requirements.txt
|
23 |
+
python3 rag.py
|
24 |
+
```
|
25 |
+
|
26 |
+
|
27 |
+
Neo4j RAG course - https://www.deeplearning.ai/short-courses/knowledge-graphs-rag/
|
28 |
+
1. langchain documentation retriever - https://python.langchain.com/v0.1/docs/modules/data_connection/retrievers/vectorstore/
|
29 |
+
2. https://medium.com/@shaktikanungo2019/conversational-ai-unveiling-the-first-rag-chatbot-with-langchain-8b9b04ee4b63
|
30 |
+
3. https://medium.com/@vikrambhat2/building-a-rag-system-and-conversational-chatbot-with-custom-data-793e9617a865
|
31 |
+
4. https://abvijaykumar.medium.com/retrieval-augmented-generation-rag-with-llamaindex-1828ef80314c
|
32 |
+
5. https://medium.com/the-ai-forum/implementing-agentic-rag-using-langchain-b22af7f6a3b5
|
data/133_ads_vogon_13May-27May.tsv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:333665767c0d25886b7f33ab98ec0191c216a76e7288deb2cf21f10a9445f65a
|
3 |
+
size 146425733
|
rag.py
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dotenv import load_dotenv
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
import gradio as gr
from openai import OpenAI
import random
import pandas as pd
import os

# Silence the HuggingFace tokenizers fork-parallelism warning that otherwise
# fires when Gradio handles requests in worker threads.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Pull OPENAI_API_KEY (and anything else) from the local .env file,
# overriding any value already present in the process environment.
load_dotenv(override=True)
client = OpenAI()  # picks up OPENAI_API_KEY from the environment

# Path to the prebuilt FAISS index of ad documents (loaded below).
DB_FAISS_PATH = "./vectorstore/db_faiss_ads"
# Raw ads export (tab-separated); only used here to show sample ad titles.
data_file_path = "./data/133_ads_vogon_13May-27May.tsv"
# HuggingFace embedding model — must match the model the index was built with.
embedding_model_hf = "BAAI/bge-m3"
# OpenAI chat model used for question formulation.
qa_model_name = "gpt-3.5-turbo"
# Default system prompt for the question-formulation task; the ads data
# retrieved from FAISS is appended to this string at call time.
bestSystemPrompt = "You're an assistant for question-formulation tasks. You need to form question related to given INPUT and the mutually exclusive options for leading an user to relevant ads. Your approach must be systematic and meticulous. First, identify CLUES such as keywords, phrases, contextual information, semantic relations, tones, and references that aid in determining the context of the INPUT. Second, construct a concise diagnostic REASONING process (limiting to 130 words) based on premises supporting the INPUT relevance within the provided ADS_DATA. Third, utilizing the identified clues, reasoning, and input, furnish the pertinent MUTUALLY EXCLUSIVE options for the ADS_DATA. The ADS_DATA is as follows:"

# Embedding function handed to FAISS.load_local for query embedding.
embeddings_hf = HuggingFaceEmbeddings(model_name=embedding_model_hf)
21 |
+
|
22 |
+
|
23 |
+
def getBestQuestionOnTheBasisOfPageContextAndAdsData(page_information, adsData, systemPrompt):
    """Ask the chat model to formulate a question (plus options) for the page.

    The retrieved ads text is appended to the system prompt; the page
    information goes in as the user message. Temperature 0 keeps the
    completion deterministic for a given input.
    """
    conversation = [
        {"role": "system", "content": systemPrompt + adsData},
        {"role": "user", "content": page_information},
    ]
    completion = client.chat.completions.create(
        model=qa_model_name,
        messages=conversation,
        temperature=0,
    )
    return completion.choices[0].message.content
|
33 |
+
|
34 |
+
|
35 |
+
def getRagResponse(QuestionPrompt, threshold, page_information):
    """Retrieve ads similar to *page_information* and ask the LLM for a question.

    Args:
        QuestionPrompt: optional system prompt from the UI; when None or empty
            the module-level ``bestSystemPrompt`` is used instead.
        threshold: maximum FAISS distance for a retrieved document to be kept
            (lower distance = more similar).
        page_information: free-text page context used as the similarity query.

    Returns:
        A display string: the model's answer followed by metadata and content
        of every retrieved document.
    """
    # BUG FIX: the original test was `QuestionPrompt != None or len(QuestionPrompt)`,
    # which raises TypeError (len(None)) when the prompt is None and wrongly
    # prefers an *empty* prompt over the default. A truthiness check does
    # what was intended: use the override only when it is a non-empty string.
    curr_question_prompt = QuestionPrompt if QuestionPrompt else bestSystemPrompt

    # similarity_search_with_score returns (document, distance) pairs;
    # keep only matches closer than the UI-supplied threshold.
    retrieved_documents = [
        doc
        for doc in db.similarity_search_with_score(page_information, k=20)
        if doc[1] < threshold
    ]
    answer = getBestQuestionOnTheBasisOfPageContextAndAdsData(
        page_information,
        ". ".join([doc[0].page_content for doc in retrieved_documents]),
        curr_question_prompt,
    )
    docs_info = "\n\n".join(
        [
            f"Publisher url: {doc[0].metadata['publisher_url']}\nKeyword Term: {doc[0].metadata['keyword_term']}\nAd Display Url: {doc[0].metadata['ad_display_url']}\nRevenue: {doc[0].metadata['revenue']}\nAd Click Count: {doc[0].metadata['ad_click_count']}\nContent: {doc[0].page_content}\nValue: {doc[1]}"
            for doc in retrieved_documents
        ]
    )
    full_response = f"Answer: {answer}\n\nRetrieved Documents:\n{docs_info}"
    return full_response
|
58 |
+
|
59 |
+
|
60 |
+
# Load the prebuilt FAISS index. allow_dangerous_deserialization is required
# because the index pickle is trusted local data; never enable it for
# indexes from untrusted sources.
db = FAISS.load_local(
    DB_FAISS_PATH, embeddings_hf, allow_dangerous_deserialization=True
)
# Load the raw ads TSV just to surface sample ad titles in the UI.
data = pd.read_csv(data_file_path, sep='\t')
data.dropna(axis = 0, how='any', inplace=True)
# Cap at 5000 rows to keep startup memory bounded.
data = data.iloc[:5000, :]
ad_title_content = list(data["ad_title"].values)

# NOTE(review): the nesting below was reconstructed from a formatting-mangled
# copy; widget placement inside/outside the Row is a best guess — confirm
# against the original layout.
with gr.Blocks() as demo:
    gr.Markdown("# RAG on ads data")
    with gr.Row():
        # Editable system prompt, pre-filled with the module default.
        QuestionPrompt = gr.Textbox(
            bestSystemPrompt,
            lines=1,
            placeholder="Enter the system prompt for question formulation",
            label="Question System prompt",
        )
        # Page context used as the FAISS similarity query.
        page_information = gr.Textbox(
            lines=1, placeholder="Enter the page information", label="Page Information"
        )
    # Maximum FAISS distance for a document to count as a match.
    threshold = gr.Number(value = 1.0, label="Threshold", interactive=True)
    output = gr.Textbox(label="Output")
    submit_btn = gr.Button("Submit")

    # Both the button and pressing Enter in the page-information box
    # trigger the same RAG pipeline.
    submit_btn.click(
        getRagResponse,
        inputs= [QuestionPrompt, threshold, page_information],
        outputs=[output],
    )
    page_information.submit(
        getRagResponse,
        inputs=[QuestionPrompt, threshold, page_information],
        outputs=[output],
    )
    # Collapsible sample of ad titles so users know what queries make sense.
    with gr.Accordion("Ad Titles", open=False):
        ad_titles = gr.Markdown()

    # On page load, show up to 100 randomly sampled ad titles.
    demo.load(
        lambda: "<br>".join(
            random.sample([str(ad_title) for ad_title in ad_title_content], min(100, len(ad_title_content)))
        ),
        None,
        ad_titles,
    )

# Close any previously running Gradio servers (useful on Spaces restarts),
# then launch this one.
gr.close_all()
demo.launch()
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio
|
2 |
+
python-dotenv
|
3 |
+
langchain
|
4 |
+
langchain-community
|
5 |
+
langchain-openai
|
6 |
+
faiss-cpu
|
7 |
+
sentence-transformers
|
vectorstore/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
vectorstore/db_faiss_ads/index.faiss
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fa26ebd79a0c29b97f5047804c9147819c86aced6d309eeb17e6f4380e88ff4e
|
3 |
+
size 204800045
|
vectorstore/db_faiss_ads/index.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b756d121fc57d372759c32abbf6dca2d82d35d753f7dc24bf55477269a83123f
|
3 |
+
size 22199403
|