# llm-website-qa / knowledgebase.py
import requests
from bs4 import BeautifulSoup
from langchain.callbacks import get_openai_callback
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceHubEmbeddings
from langchain.llms import OpenAIChat, HuggingFaceHub
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from streamlit.logger import get_logger
from utils.constants import (
    KNOWLEDGEBASE_DIR,
    AssistantType,
    BS_HTML_PARSER,
    TEXT_TAG,
    SOURCE_TAG,
    ANSWER_TAG,
    QUESTION_TAG,
    HF_TEXT_GENERATION_REPO_ID,
    EmbeddingType,
    TOTAL_TOKENS_TAG,
    PROMPT_TOKENS_TAG,
    COMPLETION_TOKENS_TAG,
    TOTAL_COST_TAG,
)

logger = get_logger(__name__)


def extract_text_from(url_: str) -> str:
    """Fetch a URL and return its visible text, one non-empty line per line."""
    # timeout added so an unresponsive host cannot hang the app indefinitely
    html = requests.get(url_, timeout=30).text
    soup = BeautifulSoup(html, features=BS_HTML_PARSER)
    text = soup.get_text()
    lines = (line.strip() for line in text.splitlines())
    return "\n".join(line for line in lines if line)


def create_knowledgebase(
    urls: list[str],
    assistant_type: AssistantType,
    embedding_type: EmbeddingType,
    embedding_api_key: str,
    knowledgebase_name: str,
):
    """Scrape the given URLs, embed the text chunks, and persist a FAISS index."""
    pages: list[dict] = []
    for url in urls:
        pages.append({TEXT_TAG: extract_text_from(url_=url), SOURCE_TAG: url})

    chunk_size = 500
    chunk_overlap = 30

    if assistant_type == AssistantType.OPENAI:
        # # override the default chunk configs
        # chunk_size = 1500
        # chunk_overlap = 200
        if embedding_type == EmbeddingType.HUGGINGFACE:
            embeddings = HuggingFaceHubEmbeddings(
                huggingfacehub_api_token=embedding_api_key
            )
            logger.info("Using `hf` embeddings")
        else:
            embeddings = OpenAIEmbeddings(openai_api_key=embedding_api_key)
            logger.info("Using `openai` embeddings")
    else:
        embeddings = HuggingFaceHubEmbeddings(
            huggingfacehub_api_token=embedding_api_key
        )
        logger.info(
            "Since the assistant type is set to `hf`, `hf` embeddings are used by default."
        )

    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, separator="\n"
    )
    docs, metadata = [], []
    for page in pages:
        splits = text_splitter.split_text(page[TEXT_TAG])
        docs.extend(splits)
        metadata.extend([{SOURCE_TAG: page[SOURCE_TAG]}] * len(splits))
        logger.info(f"Split {page[SOURCE_TAG]} into {len(splits)} chunks")

    vectorstore = FAISS.from_texts(texts=docs, embedding=embeddings, metadatas=metadata)
    vectorstore.save_local(folder_path=KNOWLEDGEBASE_DIR, index_name=knowledgebase_name)
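
# Usage sketch (hypothetical values): the URLs, key, and index name below are
# placeholders, not part of the original app.
#
#   create_knowledgebase(
#       urls=["https://example.com/docs", "https://example.com/faq"],
#       assistant_type=AssistantType.OPENAI,
#       embedding_type=EmbeddingType.OPENAI,
#       embedding_api_key="sk-...",
#       knowledgebase_name="example-kb",
#   )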


def load_vectorstore(
    embedding_type: EmbeddingType,
    embedding_api_key: str,
    knowledgebase_name: str,
):
    """Load a previously saved FAISS index with the matching embedding backend."""
    if embedding_type == EmbeddingType.OPENAI:
        embeddings = OpenAIEmbeddings(openai_api_key=embedding_api_key)
        logger.info("Using `openai` embeddings")
    else:
        embeddings = HuggingFaceHubEmbeddings(
            huggingfacehub_api_token=embedding_api_key
        )
        logger.info("Using `hf` embeddings")
    store = FAISS.load_local(
        folder_path=KNOWLEDGEBASE_DIR,
        embeddings=embeddings,
        index_name=knowledgebase_name,
    )
    return store


def construct_query_response(result: dict) -> dict:
    """Wrap a raw chain result in an answer payload."""
    return {ANSWER_TAG: result}


class Knowledgebase:
    """A queryable wrapper around a persisted FAISS knowledgebase."""

    def __init__(
        self,
        assistant_type: AssistantType,
        embedding_type: EmbeddingType,
        assistant_api_key: str,
        embedding_api_key: str,
        knowledgebase_name: str,
    ):
        self.assistant_type = assistant_type
        self.embedding_type = embedding_type
        self.assistant_api_key = assistant_api_key
        self.embedding_api_key = embedding_api_key
        self.knowledgebase = load_vectorstore(
            embedding_type=embedding_type,
            embedding_api_key=embedding_api_key,
            knowledgebase_name=knowledgebase_name,
        )

    def query_knowledgebase(self, query: str) -> tuple[dict, dict]:
        """Answer a question against the knowledgebase; returns (result, usage metadata)."""
        try:
            logger.info(
                f"The assistant API key for the current session: ***{self.assistant_api_key[-4:]}"
            )
            logger.info(
                f"The embedding API key for the current session: ***{self.embedding_api_key[-4:]}"
            )
            query = query.strip()
            if not query:
                return {
                    ANSWER_TAG: "Oh snap! Did you hit send accidentally? I can't see a question 🤔",
                }, {}

            if self.assistant_type == AssistantType.OPENAI:
                llm = OpenAIChat(
                    model_name=OPENAI_CHAT_COMPLETIONS_MODEL,
                    temperature=0,
                    verbose=True,
                    openai_api_key=self.assistant_api_key,
                )
                # # this is deprecated
                # chain = VectorDBQAWithSourcesChain.from_llm(
                #     llm=llm,
                #     vectorstore=self.knowledgebase,
                #     max_tokens_limit=2048,
                #     k=2,
                #     reduce_k_below_max_tokens=True,
                # )
                chain = RetrievalQAWithSourcesChain.from_chain_type(
                    llm=llm,
                    chain_type="stuff",
                    retriever=self.knowledgebase.as_retriever(),
                    reduce_k_below_max_tokens=True,
                    chain_type_kwargs={"verbose": True},
                )
            else:
                llm = HuggingFaceHub(
                    repo_id=HF_TEXT_GENERATION_REPO_ID,
                    model_kwargs={"temperature": 0.5, "max_length": 64},
                    huggingfacehub_api_token=self.assistant_api_key,
                    verbose=True,
                )
                chain = RetrievalQAWithSourcesChain.from_chain_type(
                    llm=llm,
                    chain_type="refine",
                    retriever=self.knowledgebase.as_retriever(),
                    max_tokens_limit=1024,
                    reduce_k_below_max_tokens=True,
                    chain_type_kwargs={"verbose": True},
                )

            # NOTE: the callback only meters OpenAI calls, so the counters stay
            # at zero when the HuggingFace assistant is selected.
            with get_openai_callback() as cb:
                result = chain({QUESTION_TAG: query})
                logger.info(f"Total Tokens: {cb.total_tokens}")
                logger.info(f"Prompt Tokens: {cb.prompt_tokens}")
                logger.info(f"Completion Tokens: {cb.completion_tokens}")
                logger.info(f"Total Cost (USD): ${cb.total_cost}")
                metadata = {
                    TOTAL_TOKENS_TAG: cb.total_tokens,
                    PROMPT_TOKENS_TAG: cb.prompt_tokens,
                    COMPLETION_TOKENS_TAG: cb.completion_tokens,
                    TOTAL_COST_TAG: cb.total_cost,
                }
            return result, metadata
        except Exception as e:
            logger.error(f"{e.__class__.__name__}: {e}")
            return {ANSWER_TAG: f"{e.__class__.__name__}: {e}"}, {}
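

# Minimal end-to-end sketch, not part of the original app: it assumes an index
# named "example-kb" already exists under KNOWLEDGEBASE_DIR (e.g. built via
# create_knowledgebase above) and uses placeholder API keys.
if __name__ == "__main__":
    kb = Knowledgebase(
        assistant_type=AssistantType.OPENAI,
        embedding_type=EmbeddingType.OPENAI,
        assistant_api_key="sk-...",  # placeholder, supply a real OpenAI key
        embedding_api_key="sk-...",  # placeholder, supply a real OpenAI key
        knowledgebase_name="example-kb",
    )
    answer, usage = kb.query_knowledgebase("What does this website offer?")
    print(answer, usage)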