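# app.py - Streamlit RAG demo ("chat with your PDFs").
# Overall flow, as read from the code below: PDFs uploaded in the sidebar are split
# into character chunks, embedded (OpenAI, Instructor-xl or Cohere), indexed in a
# local FAISS store, and queried through a LangChain ConversationalRetrievalChain
# with a selectable LLM back-end.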
import streamlit as st
import os
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings, CohereEmbeddings
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.chat_models import ChatCohere
from langchain_community.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from htmlTemplates import css, bot_template, user_template
from langchain_community.llms import HuggingFaceHub, HuggingFaceTextGenInference
#Llama2
import torch
import transformers
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoTokenizer
from torch import cuda, bfloat16
import langchain
langchain.verbose = False
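

# Extract the raw text of every page of every uploaded PDF into a single string.
# Note: PyPDF2's extract_text() may return an empty string for image-only pages,
# so scanned PDFs without an OCR layer contribute no text.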
def get_pdf_text(pdf_docs):
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text
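

# Split the concatenated text into overlapping chunks for embedding.
# CharacterTextSplitter only breaks on the given separator ("\n"), so a single
# block of text longer than chunk_size is kept intact and may exceed 500 characters.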
def get_text_chunks(text):
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=500,       # the character length of each chunk
        chunk_overlap=100,    # the character overlap between consecutive chunks
        length_function=len   # the length function - here character length (the Python len() builtin)
    )
    chunks = text_splitter.split_text(text)
    return chunks
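

# Build a FAISS index from the chunks with the chosen embedding model and persist
# it to ./faiss_index so it can be re-used via the "Load" button. The OpenAI and
# Cohere options expect OPENAI_API_KEY / COHERE_API_KEY in the environment
# (loaded from .env by load_dotenv()).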
def get_vectorstore(text_chunks, selected_embedding):
    print('Selected Embedding: ' + selected_embedding)
    if selected_embedding == 'OpenAI':
        embeddings = OpenAIEmbeddings()
    elif selected_embedding == 'Instructor-xl':
        embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    elif selected_embedding == 'Cohere-multilingual-v3.0':
        embeddings = CohereEmbeddings(model="embed-multilingual-v3.0")
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    vectorstore.save_local("faiss_index")
    return vectorstore
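

# Re-open the persisted FAISS index with the same embedding model it was built with
# (mixing embedding models between save and load silently breaks retrieval).
# Depending on the installed langchain-community version, FAISS.load_local may also
# require allow_dangerous_deserialization=True.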
def load_vectorstore(selected_embedding):
    print('Selected Embedding: ' + selected_embedding)
    if selected_embedding == 'OpenAI':
        embeddings = OpenAIEmbeddings()
    elif selected_embedding == 'Instructor-xl':
        embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    elif selected_embedding == 'Cohere-multilingual-v3.0':
        embeddings = CohereEmbeddings(model="embed-multilingual-v3.0")
    vectorstore = FAISS.load_local("faiss_index", embeddings)
    return vectorstore
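

# Wire the retriever, a conversation memory and the selected LLM into a
# ConversationalRetrievalChain. Three back-ends are supported: OpenAI's
# gpt-3.5-turbo, a locally loaded (4-bit quantised on GPU) Llama-2-7b-chat,
# or a remote text-generation-inference server given by INFERENCE_URL.
# Loading meta-llama/Llama-2-7b-chat-hf requires a Hugging Face token
# (HUGGINGFACEHUB_API_TOKEN) with access to the gated model.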
def get_conversation_chain(vectorstore, selected_llm, selected_temperature):
    print('Selected LLM: ' + selected_llm)
    print('Selected Temperature: ' + str(selected_temperature))
    if selected_llm == 'GPT 3.5':
        #openai_model = "gpt-4-turbo-preview"
        openai_model = "gpt-3.5-turbo"
        llm = ChatOpenAI(model=openai_model, temperature=selected_temperature)
    elif selected_llm == 'Llama2 local':
        model_id = 'meta-llama/Llama-2-7b-chat-hf'
        hf_auth = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
        model_config = transformers.AutoConfig.from_pretrained(
            model_id,
            token=hf_auth
        )
        device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
        if 'cuda' in device:
            # set quantization configuration to load large model with less GPU memory
            # this requires the `bitsandbytes` library
            bnb_config = transformers.BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type='nf4',
                bnb_4bit_use_double_quant=True,
                bnb_4bit_compute_dtype=bfloat16
            )
            model = transformers.AutoModelForCausalLM.from_pretrained(
                model_id,
                trust_remote_code=True,
                config=model_config,
                quantization_config=bnb_config,
                device_map='auto',
                token=hf_auth
            )
        else:
            model = transformers.AutoModelForCausalLM.from_pretrained(
                model_id,
                trust_remote_code=True,
                config=model_config,
                device_map='auto',
                token=hf_auth
            )
        # enable evaluation mode to allow model inference
        model.eval()
        print(f"Model loaded on {device}")
        tokenizer = transformers.AutoTokenizer.from_pretrained(
            model_id,
            token=hf_auth
        )
        pipeline = transformers.pipeline(
            torch_dtype=torch.float32,
            model=model,
            tokenizer=tokenizer,
            return_full_text=True,  # langchain expects the full text
            task='text-generation',
            temperature=selected_temperature,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
            max_new_tokens=512,  # max number of tokens to generate in the output
            repetition_penalty=1.1  # without this output begins repeating
        )
        llm = HuggingFacePipeline(pipeline=pipeline)
    elif selected_llm == 'Llama2 inference':
        llm = HuggingFaceTextGenInference(
            inference_server_url=os.environ.get("INFERENCE_URL"),
            max_new_tokens=50,
            timeout=1200,
            temperature=selected_temperature
        )
    # Generic LLM
    memory = ConversationBufferMemory(
        memory_key='chat_history', return_messages=True, output_key='answer')
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
        return_source_documents=True,
        verbose=True,
    )
    #print(conversation_chain)
    return conversation_chain
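

# Render one question/answer turn: invoke the chain, show the retrieved source
# documents in an expander, and replay the accumulated chat history with the
# user/bot HTML templates (even indices are user messages, odd ones bot replies).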
def handle_userinput(user_question):
    #print('Question: ' + user_question)
    response = st.session_state.conversation.invoke({'question': user_question})
    answer = response.get("answer")
    sources = response.get("source_documents", [])
    #print('Answer: ' + answer)
    #print('Sources: ' + str(sources))
    with st.expander("Sources"):
        st.write(str(sources))
    st.session_state.chat_history = response['chat_history']
    for i, message in enumerate(st.session_state.chat_history):
        if i % 2 == 0:
            st.write(user_template.replace(
                "{{MSG}}", message.content), unsafe_allow_html=True)
        else:
            st.write(bot_template.replace(
                "{{MSG}}", message.content), unsafe_allow_html=True)
def main():
    load_dotenv()
    st.set_page_config(page_title="VerAi",
                       page_icon=":books:")
    st.write(css, unsafe_allow_html=True)
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None
    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader(
            "Upload your new PDFs here and click on 'Process' or load the last upload by clicking on 'Load'", accept_multiple_files=True)
        selected_embedding = st.radio("Which Embedding?", ["Cohere-multilingual-v3.0", "OpenAI", "Instructor-xl"])
        selected_llm = st.radio("Which LLM?", ["GPT 3.5", "Llama2 local", "Llama2 inference"])
        selected_temperature = st.slider('Temperature?', 0.0, 1.0, 0.1)
        if st.button("Process"):
            with st.spinner("Processing"):
                # get pdf text
                raw_text = get_pdf_text(pdf_docs)
                # get the text chunks
                text_chunks = get_text_chunks(raw_text)
                # create vector store
                vectorstore = get_vectorstore(text_chunks, selected_embedding)
                # create conversation chain
                st.session_state.conversation = get_conversation_chain(
                    vectorstore, selected_llm, selected_temperature)
        if st.button("Load"):
            with st.spinner("Processing"):
                # load the previously saved vector store for the selected embedding
                vectorstore = load_vectorstore(selected_embedding)
                # create conversation chain
                st.session_state.conversation = get_conversation_chain(
                    vectorstore, selected_llm, selected_temperature)
    if st.session_state.conversation:
        st.header("VerAi :books:")
        user_question = st.text_input("Stel een vraag hieronder")
        # Example questions (in Dutch):
        #   Vertel me iets over Wettelijke uren
        #   wat zijn Overige verloftypes bij kpn
        if st.session_state.conversation and user_question:
            handle_userinput(user_question)


if __name__ == '__main__':
    main()