Commit
•
4f7de21
0
Parent(s):
initial commit
Browse files- .DS_Store +0 -0
- .env +1 -0
- .gitattributes +42 -0
- LICENSE +21 -0
- README.md +29 -0
- backupcodes/llamaIndexSingleQuery.py +32 -0
- backupcodes/pdfreader.py +18 -0
- backupcodes/streamlist_LLAMA.py +154 -0
- data/131_webmd_vogon_sample1000_urlsContent_cleaned.tsv +3 -0
- data/132_webmd_vogon_urlsContent_cleaned.tsv +3 -0
- documents/ADD and ADHD (Attention Deficit Hyperactivity Disorder) Health Center.pdf +3 -0
- documents/Media.net — WebMD - Better information. Better health..pdf +3 -0
- documents/WebMD Allergies Health Center - Find allergy information and latest health news.pdf +3 -0
- documents/WebMD Arthritis and Joint Pain Center: Symptoms, Causes, Tests, and Treatments.pdf +3 -0
- documents/WebMD Health News Center - The latest breaking health news and alerts.pdf +3 -0
- rag.py +134 -0
- requirements.txt +6 -0
- vectorstore/.DS_Store +0 -0
- vectorstore/db_faiss_10/index.faiss +3 -0
- vectorstore/db_faiss_10/index.pkl +3 -0
- vectorstore/db_faiss_50k/index.faiss +3 -0
- vectorstore/db_faiss_50k/index.pkl +3 -0
.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
.env
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
OPENAI_API_KEY=<YOUR OPENAI KEY>
|
.gitattributes
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*.tsv filter=lfs diff=lfs merge=lfs -text
|
37 |
+
data/ filter=lfs diff=lfs merge=lfs -text
|
38 |
+
vectorstore/ filter=lfs diff=lfs merge=lfs -text
|
39 |
+
documents/ filter=lfs diff=lfs merge=lfs -text
|
40 |
+
*.faiss filter=lfs diff=lfs merge=lfs -text
|
41 |
+
document/* filter=lfs diff=lfs merge=lfs -text
|
42 |
+
*.pdf filter=lfs diff=lfs merge=lfs -text
|
LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2024 Rajat Bansal
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
README.md
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Publisher Chatbot 50k
|
3 |
+
emoji: 📈
|
4 |
+
colorFrom: purple
|
5 |
+
colorTo: blue
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 4.31.5
|
8 |
+
app_file: rag.py
|
9 |
+
pinned: false
|
10 |
+
---
|
11 |
+
|
12 |
+
## Steps for running RAG
|
13 |
+
1. Create .env file in root folder and add the following environment variables
|
14 |
+
```
|
15 |
+
OPENAI_API_KEY=<YOUR OPENAI KEY>
|
16 |
+
```
|
17 |
+
2. Run the following commands:
|
18 |
+
```
|
19 |
+
pip3 install -r requirements.txt
|
20 |
+
python3 rag.py
|
21 |
+
```
|
22 |
+
|
23 |
+
|
24 |
+
Neo4j RAG course - https://www.deeplearning.ai/short-courses/knowledge-graphs-rag/
|
25 |
+
1. LangChain documentation (vector store retriever) - https://python.langchain.com/v0.1/docs/modules/data_connection/retrievers/vectorstore/
|
26 |
+
2. https://medium.com/@shaktikanungo2019/conversational-ai-unveiling-the-first-rag-chatbot-with-langchain-8b9b04ee4b63
|
27 |
+
3. https://medium.com/@vikrambhat2/building-a-rag-system-and-conversational-chatbot-with-custom-data-793e9617a865
|
28 |
+
4. https://abvijaykumar.medium.com/retrieval-augmented-generation-rag-with-llamaindex-1828ef80314c
|
29 |
+
5. https://medium.com/the-ai-forum/implementing-agentic-rag-using-langchain-b22af7f6a3b5
|
backupcodes/llamaIndexSingleQuery.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from llama_index.core import (
|
2 |
+
VectorStoreIndex,
|
3 |
+
SimpleDirectoryReader,
|
4 |
+
StorageContext,
|
5 |
+
load_index_from_storage,
|
6 |
+
)
|
7 |
+
from llama_index.llms.openai import OpenAI
|
8 |
+
from llama_index.core.settings import Settings
|
9 |
+
import os
|
10 |
+
from dotenv import load_dotenv
|
11 |
+
|
12 |
+
load_dotenv(override=True)
|
13 |
+
Settings.llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)
|
14 |
+
|
15 |
+
storage_path = "../vectorstore"
|
16 |
+
documents_path = "../../documents"
|
17 |
+
|
18 |
+
|
19 |
+
def initialize():
    """Return the vector index, loading it from disk when it was persisted.

    If ``storage_path`` exists, the index is restored from that directory.
    Otherwise the files under ``documents_path`` are read, indexed, and the
    new index is persisted to ``storage_path`` before being returned.
    """
    if os.path.exists(storage_path):
        # A persisted index is available: restore it instead of re-indexing.
        persisted = StorageContext.from_defaults(persist_dir=storage_path)
        return load_index_from_storage(persisted)

    loaded_docs = SimpleDirectoryReader(documents_path).load_data()
    fresh_index = VectorStoreIndex.from_documents(loaded_docs)
    fresh_index.storage_context.persist(persist_dir=storage_path)
    return fresh_index
|
28 |
+
index = initialize()
|
29 |
+
|
30 |
+
chat_engine = index.as_chat_engine(chat_mode="condense_question", verbose=True)
|
31 |
+
response = chat_engine.chat("hi tell me what i can ask you")
|
32 |
+
print(response.response)
|
backupcodes/pdfreader.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os

import PyPDF2

# Extract per-page text and titles from every PDF found in `data_path`.
# NOTE(review): `data_path` is not defined anywhere in this snippet; it must be
# set to the directory holding the PDFs before this code runs.

docs = []

# Read PDF documents from the given path
pdf_docs = [
    os.path.join(data_path, f) for f in os.listdir(data_path) if f.endswith('.pdf')
]
for pdf_path in pdf_docs:
    with open(pdf_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for index, page in enumerate(pdf_reader.pages):
            # One entry per page, titled "<file name> page <1-based number>".
            doc_page = {
                "title": os.path.basename(pdf_path) + " page " + str(index + 1),
                "content": page.extract_text(),
            }
            docs.append(doc_page)

# Parallel lists: content[i] is the text whose title is metadata[i]["title"].
content = [doc["content"] for doc in docs]
metadata = [{"title": doc["title"]} for doc in docs]
print("Content and metadata are extracted from the documents")
|
backupcodes/streamlist_LLAMA.py
ADDED
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import PyPDF2
|
2 |
+
import streamlit as st
|
3 |
+
from dotenv import load_dotenv
|
4 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
5 |
+
from langchain.chains.conversational_retrieval.base import ConversationalRetrievalChain
|
6 |
+
from langchain_community.vectorstores import FAISS
|
7 |
+
from langchain.memory import ConversationBufferMemory
|
8 |
+
from langchain.prompts import PromptTemplate
|
9 |
+
|
10 |
+
# from langchain_community.llms import llamacpp
|
11 |
+
# from langchain_community.embeddings import HuggingFaceEmbeddings
|
12 |
+
from langchain_openai import OpenAIEmbeddings
|
13 |
+
from langchain_openai import ChatOpenAI
|
14 |
+
import os
|
15 |
+
|
16 |
+
llmtemplate = """[INST]
|
17 |
+
As an AI, provide accurate and relevant information based on the provided document. Your responses should adhere to the following guidelines:
|
18 |
+
- Answer the question based on the provided documents.
|
19 |
+
- Be direct and factual, limited to 50 words and 2-3 sentences. Begin your response without using introductory phrases like yes, no etc.
|
20 |
+
- Maintain an ethical and unbiased tone, avoiding harmful or offensive content.
|
21 |
+
- Avoid using confirmatory phrases like "Yes, you are correct" or any similar validation in your responses.
|
22 |
+
- Do not fabricate information or include questions in your responses.
|
23 |
+
- do not prompt to select answers. do not ask me questions
|
24 |
+
{question}
|
25 |
+
|
26 |
+
[/INST]
|
27 |
+
"""
|
28 |
+
prompt_template = """Use the following pieces of context and previous questions and answers to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
|
29 |
+
|
30 |
+
{context}
|
31 |
+
|
32 |
+
Previous Q&A: {previous_qa}
|
33 |
+
|
34 |
+
Question: {question}
|
35 |
+
Helpful Answer:"""
|
36 |
+
|
37 |
+
# PDF_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
|
38 |
+
# LLAMA_MODEL_PATH = "llama-2-7b-chat.Q4_K_M.gguf"
|
39 |
+
DB_FAISS_PATH = "../vectorstore/db_faiss"
|
40 |
+
CHUNK_SIZE = 512
|
41 |
+
CHUNK_OVERLAP = 256
|
42 |
+
SIMILARITY_THRESHOLD = 0.5
|
43 |
+
|
44 |
+
|
45 |
+
def prepare_db(pdf_docs):
    """Build the FAISS index from the given PDFs, or load it if already saved.

    Args:
        pdf_docs: PDF file objects accepted by ``PyPDF2.PdfReader`` that also
            expose a ``name`` attribute. Not read when an index already
            exists at ``DB_FAISS_PATH``.

    Returns:
        The FAISS vector store, either freshly built and saved to
        ``DB_FAISS_PATH`` or loaded from it.
    """
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

    if os.path.exists(DB_FAISS_PATH):
        print(f"Db already exists")
        # Load from the same directory save_local() writes to. The previous
        # hard-coded "./vectorstore/db_faiss" did not match DB_FAISS_PATH, so
        # a saved index was never found on reload.
        # NOTE(review): allow_dangerous_deserialization should only be used
        # with index files this application created itself.
        return FAISS.load_local(
            DB_FAISS_PATH, embeddings, allow_dangerous_deserialization=True
        )

    content = []
    metadata = []
    for pdf in pdf_docs:
        pdf_reader = PyPDF2.PdfReader(pdf)
        for index, page in enumerate(pdf_reader.pages):
            # One passage source per page, titled "<file name> page <n>".
            content.append(page.extract_text())
            metadata.append({"title": pdf.name + " page " + str(index + 1)})
    print("Content and metadata are extracted from the documents")

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )
    split_docs = text_splitter.create_documents(content, metadatas=metadata)
    print(f"Documents are split into {len(split_docs)} passages")

    db = FAISS.from_documents(split_docs, embeddings)
    print(f"Document saved in db")
    db.save_local(DB_FAISS_PATH)
    return db
|
81 |
+
|
82 |
+
|
83 |
+
def get_conversation_chain(vectordb):
    """Create a conversational retrieval chain on top of *vectordb*.

    The chain uses ``gpt-3.5-turbo`` as its LLM, ``llmtemplate`` as the
    condense-question prompt, a buffer memory stored under ``chat_history``,
    and returns source documents with each answer.

    Args:
        vectordb: Vector store providing ``as_retriever()``.

    Returns:
        The configured ``ConversationalRetrievalChain``.
    """
    # A local llama.cpp model was tried here previously; OpenAI chat is used now.
    chat_llm = ChatOpenAI(model="gpt-3.5-turbo")
    doc_retriever = vectordb.as_retriever()
    condense_prompt = PromptTemplate.from_template(llmtemplate)
    chat_memory = ConversationBufferMemory(
        memory_key="chat_history", return_messages=True, output_key="answer"
    )

    chain = ConversationalRetrievalChain.from_llm(
        llm=chat_llm,
        retriever=doc_retriever,
        condense_question_prompt=condense_prompt,
        memory=chat_memory,
        return_source_documents=True,
    )
    print("Conversation chain created")
    return chain
|
104 |
+
|
105 |
+
|
106 |
+
# def validate_answer_against_sources(response_answer, source_documents):
|
107 |
+
# model = SentenceTransformer(PDF_MODEL_NAME)
|
108 |
+
# source_texts = [doc.page_content for doc in source_documents]
|
109 |
+
# answer_embedding = model.encode(response_answer, convert_to_tensor=True)
|
110 |
+
# source_embeddings = model.encode(source_texts, convert_to_tensor=True)
|
111 |
+
# cosine_scores = util.pytorch_cos_sim(answer_embedding, source_embeddings)
|
112 |
+
# if any(score.item() > SIMILARITY_THRESHOLD for score in cosine_scores[0]):
|
113 |
+
# return True
|
114 |
+
# return False
|
115 |
+
|
116 |
+
|
117 |
+
def handle_userinput(user_question):
    """Send the question to the conversation chain and render the chat history.

    Messages at even positions are drawn in blue and those at odd positions
    in green, each prefixed with its index.

    Args:
        user_question: Text entered by the user.
    """
    import html  # local import keeps this block self-contained

    conversation = st.session_state.conversation
    if conversation is None:
        # main() initialises the chain to None until "Process" is clicked;
        # calling it in that state would raise TypeError.
        st.warning("Please upload and process your documents first.")
        return

    response = conversation({"question": user_question})
    st.session_state.chat_history = response["chat_history"]

    for i, message in enumerate(st.session_state.chat_history):
        colour = "green" if i % 2 != 0 else "blue"
        # The markup is rendered with unsafe_allow_html, so escape the message
        # text to stop user or model output from injecting HTML.
        safe_text = html.escape(str(i) + ': ' + message.content)
        st.write(
            f"<div style='color: {colour};'>{safe_text}</div>",
            unsafe_allow_html=True,
        )
|
129 |
+
|
130 |
+
|
131 |
+
def main():
    """Run the Streamlit page: question box plus a sidebar for PDF upload.

    Loads environment variables, seeds ``conversation`` and ``chat_history``
    in the session state, answers any entered question, and rebuilds the
    conversation chain when the "Process" button is pressed.
    """
    load_dotenv(override=True)
    st.set_page_config(page_title="Chat with your PDFs", page_icon=":books:")

    # Seed session state only for keys that are not present yet.
    for state_key, initial_value in (("conversation", None), ("chat_history", [])):
        if state_key not in st.session_state:
            st.session_state[state_key] = initial_value

    st.header("Chat with multiple PDFs :books:")
    user_question = st.text_input("Ask a question about your documents:")
    if user_question:
        handle_userinput(user_question)

    with st.sidebar:
        st.subheader("Your documents")
        uploaded_pdfs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True
        )
        if not st.button("Process"):
            return
        with st.spinner("Processing"):
            vector_db = prepare_db(uploaded_pdfs)
            st.session_state.conversation = get_conversation_chain(vector_db)
|
152 |
+
|
153 |
+
if __name__ == "__main__":
|
154 |
+
main()
|
data/131_webmd_vogon_sample1000_urlsContent_cleaned.tsv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ece612a60d11e19ca95d1ae6af58e6c968d98b014459c8afbd298b57afae4cf4
|
3 |
+
size 157522
|
data/132_webmd_vogon_urlsContent_cleaned.tsv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bb0b5985d1bc8610d28603f556c91194c2f755939dc235bdf5d3b77524f10b05
|
3 |
+
size 197658055
|
documents/ADD and ADHD (Attention Deficit Hyperactivity Disorder) Health Center.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:872fa6d12d05e40655b03de827150868f2233828843e5363e89fcfdb2dd66c57
|
3 |
+
size 2362135
|
documents/Media.net — WebMD - Better information. Better health..pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bec7051b94443c051b75f1aade0904b3d413c7ca11420787b93d152512dd7487
|
3 |
+
size 5966263
|
documents/WebMD Allergies Health Center - Find allergy information and latest health news.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5d1591c0ddd6b184c3cfc4514d5613017a7a9054dbc7021ee4777a8a9eca0608
|
3 |
+
size 1367936
|
documents/WebMD Arthritis and Joint Pain Center: Symptoms, Causes, Tests, and Treatments.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9a9c402c5f7c8d8ae0c02dc6c36426d1cf2ed538c5d8ae3e41203e1fed4d095c
|
3 |
+
size 672808
|
documents/WebMD Health News Center - The latest breaking health news and alerts.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:119e2f7dd9de99275ac13d7ba06f7984dc43c7b0e44006360ef784e3b44076e7
|
3 |
+
size 1921177
|
rag.py
ADDED
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dotenv import load_dotenv
|
2 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
3 |
+
from langchain_community.vectorstores import FAISS
|
4 |
+
# from langchain_openai import OpenAIEmbeddings
|
5 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
6 |
+
import os
|
7 |
+
import pandas as pd
|
8 |
+
|
9 |
+
import gradio as gr
|
10 |
+
from openai import OpenAI
|
11 |
+
|
12 |
+
load_dotenv(override=True)
|
13 |
+
client = OpenAI()
|
14 |
+
DB_FAISS_PATH = "./vectorstore/db_faiss_50k"
|
15 |
+
data_file_path = "./data/132_webmd_vogon_urlsContent_cleaned.tsv"
|
16 |
+
|
17 |
+
# DB_FAISS_PATH = "./vectorstore/db_faiss_10"
|
18 |
+
# data_file_path = "./data/131_webmd_vogon_sample1000_urlsContent_cleaned.tsv"
|
19 |
+
|
20 |
+
CHUNK_SIZE = 512
|
21 |
+
CHUNK_OVERLAP = 128
|
22 |
+
# embedding_model_oa = "text-embedding-3-small"
|
23 |
+
embedding_model_hf = "BAAI/bge-m3"
|
24 |
+
# embedding_model_hf = "sentence-transformers/all-mpnet-base-v2"
|
25 |
+
qa_model_name = "gpt-3.5-turbo"
|
26 |
+
bestReformulationPrompt = "Given a chat history and the latest user question, which may reference context from the chat history, you must formulate a standalone question that can be understood without the chat history. You are strictly forbidden from using any outside knowledge. Do not, under any circumstances, answer the question. Reformulate it if necessary; otherwise, return it as is."
|
27 |
+
bestSystemPrompt = "You're an assistant for question-answering tasks. Under absolutely no circumstances should you use external knowledge or go beyond the provided preknowledge. Your approach must be systematic and meticulous. First, identify CLUES such as keywords, phrases, contextual information, semantic relations, tones, and references that aid in determining the context of the input. Second, construct a concise diagnostic REASONING process (limiting to 130 words) based on premises supporting the INPUT relevance within the provided context. Third, utilizing the identified clues, reasoning, and input, furnish the pertinent answer for the question. Remember, you are required to use ONLY the provided context to answer the questions. If the question does not align with the preknowledge or if the preknowledge is absent, state that you don't know the answer. External knowledge is strictly prohibited. Failure to adhere will result in incorrect answers. The preknowledge is as follows:"
|
28 |
+
|
29 |
+
# embeddings_oa = OpenAIEmbeddings(model=embedding_model_oa)
|
30 |
+
embeddings_hf = HuggingFaceEmbeddings(model_name = embedding_model_hf, show_progress = True)
|
31 |
+
|
32 |
+
def setupDb(data_path):
    """Load the FAISS index for the corpus, building and saving it if absent.

    Args:
        data_path: Path to a tab-separated file with ``url``, ``url_title``
            and ``url_content`` columns.

    Returns:
        A ``(db, relevant_content)`` tuple: the FAISS vector store and the
        array of values from the ``url`` column.
    """
    df = pd.read_csv(data_path, sep="\t")
    relevant_content = df["url"].values

    if os.path.exists(DB_FAISS_PATH):
        print(f"Db already exists")
        # NOTE(review): allow_dangerous_deserialization should only be used
        # with index files this application created itself.
        db = FAISS.load_local(
            DB_FAISS_PATH, embeddings_hf, allow_dangerous_deserialization=True
        )
        return db, relevant_content

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )
    split_docs = text_splitter.create_documents(
        df["url_content"].tolist(),
        # zip over the two columns avoids building a Series per row (iterrows).
        metadatas=[
            {"title": title, "url": url}
            for title, url in zip(df["url_title"], df["url"])
        ],
    )
    print(f"Documents are split into {len(split_docs)} passages")

    db = FAISS.from_documents(split_docs, embeddings_hf)
    # Save to the directory that is checked and loaded above. The previous
    # DB_FAISS_PATH + "/index_1" target made the next run find the directory
    # but fail to load an index from it.
    db.save_local(DB_FAISS_PATH)
    print(f"Document saved in db")
    return db, relevant_content
|
59 |
+
|
60 |
+
def reformulate_question(chat_history, latest_question, reformulationPrompt):
    """Ask the chat model to rewrite the latest question as a standalone one.

    Args:
        chat_history: Sequence of past turns; item ``[0]`` of each turn is
            sent as the user message and item ``[1]`` as the assistant reply.
        latest_question: The newest user question.
        reformulationPrompt: System prompt instructing the model how to
            reformulate.

    Returns:
        The content of the model's first completion choice.
    """
    messages = [{"role": "system", "content": reformulationPrompt}]
    for turn in chat_history:
        messages.extend(
            (
                {"role": "user", "content": turn[0]},
                {"role": "assistant", "content": turn[1]},
            )
        )
    messages.append({"role": "user", "content": latest_question})

    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0,
    )
    return completion.choices[0].message.content
|
81 |
+
|
82 |
+
def getQuestionAnswerOnTheBasisOfContext(question, context, systemPrompt):
    """Answer *question* with the chat model, given context in the system prompt.

    Args:
        question: The user question, sent as the user message.
        context: Text appended directly to ``systemPrompt``.
        systemPrompt: Instructions that precede the context.

    Returns:
        The content of the model's first completion choice.
    """
    conversation = [
        {"role": "system", "content": systemPrompt + context},
        {"role": "user", "content": question},
    ]
    completion = client.chat.completions.create(
        model=qa_model_name,
        messages=conversation,
        temperature=0,
    )
    return completion.choices[0].message.content
|
95 |
+
|
96 |
+
|
97 |
+
def chatWithRag(reformulationPrompt, QAPrompt, question):
    """Answer *question* from retrieved passages and report the evidence used.

    Args:
        reformulationPrompt: System prompt for query reformulation. Currently
            unused because the reformulation call below is disabled.
        QAPrompt: System prompt for question answering; ``bestSystemPrompt``
            is used when this is ``None`` or empty.
        question: The user's question.

    Returns:
        A text report with the answer, the query that was searched, and the
        title, URL, content and score of each retrieved passage.
    """
    global curr_question_no, chat_history

    # The previous test `QAPrompt != None or len(QAPrompt)` raised TypeError
    # for None and never fell back to the default for an empty prompt.
    curr_question_prompt = QAPrompt if QAPrompt else bestSystemPrompt

    # reformulated_query = reformulate_question(chat_history, question, reformulationPrompt)
    reformulated_query = question

    # Keep only hits scoring below 1.3.
    # NOTE(review): presumably the score is a distance (lower = closer) - confirm.
    retrieved_documents = [
        (doc, score)
        for doc, score in db.similarity_search_with_score(reformulated_query)
        if score < 1.3
    ]
    context = '. '.join(doc.page_content for doc, _ in retrieved_documents)
    answer = getQuestionAnswerOnTheBasisOfContext(
        reformulated_query, context, curr_question_prompt
    )

    chat_history.append((question, answer))
    curr_question_no += 1

    docs_info = "\n\n".join(
        f"Title: {doc.metadata['title']}\nUrl: {doc.metadata['url']}\nContent: {doc.page_content}\nValue: {score}"
        for doc, score in retrieved_documents
    )
    full_response = f"Answer: {answer}\n\nReformulated question: {reformulated_query}\nRetrieved Documents:\n{docs_info}"
    # print(question, full_response)
    return full_response
|
115 |
+
|
116 |
+
db, relevant_content = setupDb(data_file_path)
|
117 |
+
chat_history = []
|
118 |
+
curr_question_no = 1
|
119 |
+
|
120 |
+
with gr.Blocks() as demo:
|
121 |
+
gr.Markdown("# RAG on webmd")
|
122 |
+
with gr.Row():
|
123 |
+
reformulationPrompt = gr.Textbox(bestReformulationPrompt, lines=1, placeholder="Enter the system prompt for reformulation of query", label="Reformulation System prompt")
|
124 |
+
QAPrompt = gr.Textbox(bestSystemPrompt, lines=1, placeholder="Enter the system prompt for QA.", label="QA System prompt")
|
125 |
+
question = gr.Textbox(lines=1, placeholder="Enter the question asked", label="Question")
|
126 |
+
output = gr.Textbox(label="Output")
|
127 |
+
submit_btn = gr.Button("Submit")
|
128 |
+
submit_btn.click(chatWithRag, inputs=[reformulationPrompt, QAPrompt, question], outputs=output)
|
129 |
+
question.submit(chatWithRag, [reformulationPrompt, QAPrompt, question], [output])
|
130 |
+
with gr.Accordion("Urls", open=False):
|
131 |
+
gr.Markdown(', '.join(relevant_content))
|
132 |
+
|
133 |
+
gr.close_all()
|
134 |
+
demo.launch()
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio
|
2 |
+
python-dotenv
|
3 |
+
langchain
|
4 |
+
langchain_community
|
5 |
+
langchain_openai
|
6 |
+
faiss-cpu
|
vectorstore/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
vectorstore/db_faiss_10/index.faiss
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0e15f7b6feed6be9100fe75f074fa861e6d80fec0b10ac60f902c6b0980aa280
|
3 |
+
size 3704877
|
vectorstore/db_faiss_10/index.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6f4c9e2ecd619a27680b0ddede61f3745ee764284fab71ad0f95a83487f699f6
|
3 |
+
size 372279
|
vectorstore/db_faiss_50k/index.faiss
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c7454ff939aed934f1a9741bda8d9d4ad0962c1d6c22737c5bd9d98f3a91b25e
|
3 |
+
size 2059517997
|
vectorstore/db_faiss_50k/index.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:62959ed49b05c3081c612a3b951c427238098bafc7b7ad74da67e61da6af1167
|
3 |
+
size 321147051
|