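# app.py - Streamlit RAG demo ("chat with your PDFs").
# Overall flow, as read from the code below: PDFs uploaded in the sidebar are split
# into character chunks, embedded (OpenAI, Instructor-xl or Cohere), indexed in a
# local FAISS store, and queried through a LangChain ConversationalRetrievalChain
# with a selectable LLM back-end.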
import streamlit as st
import os
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings, CohereEmbeddings
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.chat_models import ChatCohere
from langchain_community.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from htmlTemplates import css, bot_template, user_template
from langchain_community.llms import HuggingFaceHub, HuggingFaceTextGenInference
#Llama2
import torch
import transformers
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoTokenizer
from torch import cuda, bfloat16
import langchain
langchain.verbose = False
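

# Extract the raw text of every page of every uploaded PDF into a single string.
# Note: PyPDF2's extract_text() may return an empty string for image-only pages,
# so scanned PDFs without an OCR layer contribute no text.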
def get_pdf_text(pdf_docs):
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text
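

# Split the concatenated text into overlapping chunks for embedding.
# CharacterTextSplitter only breaks on the given separator ("\n"), so a single
# block of text longer than chunk_size is kept intact and may exceed 500 characters.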
def get_text_chunks(text):
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=500,       # the character length of each chunk
        chunk_overlap=100,    # the character overlap between consecutive chunks
        length_function=len   # the length function - here character length (the Python len() builtin)
    )
    chunks = text_splitter.split_text(text)
    return chunks
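

# Build a FAISS index from the chunks with the chosen embedding model and persist
# it to ./faiss_index so it can be re-used via the "Load" button. The OpenAI and
# Cohere options expect OPENAI_API_KEY / COHERE_API_KEY in the environment
# (loaded from .env by load_dotenv()).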
def get_vectorstore(text_chunks, selected_embedding):
    print('Selected Embedding: ' + selected_embedding)
    if selected_embedding == 'OpenAI':
        embeddings = OpenAIEmbeddings()
    elif selected_embedding == 'Instructor-xl':
        embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    elif selected_embedding == 'Cohere-multilingual-v3.0':
        embeddings = CohereEmbeddings(model="embed-multilingual-v3.0")
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    vectorstore.save_local("faiss_index")
    return vectorstore
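

# Re-open the persisted FAISS index with the same embedding model it was built with
# (mixing embedding models between save and load silently breaks retrieval).
# Depending on the installed langchain-community version, FAISS.load_local may also
# require allow_dangerous_deserialization=True.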
def load_vectorstore(selected_embedding):
    print('Selected Embedding: ' + selected_embedding)
    if selected_embedding == 'OpenAI':
        embeddings = OpenAIEmbeddings()
    elif selected_embedding == 'Instructor-xl':
        embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    elif selected_embedding == 'Cohere-multilingual-v3.0':
        embeddings = CohereEmbeddings(model="embed-multilingual-v3.0")
    vectorstore = FAISS.load_local("faiss_index", embeddings)
    return vectorstore
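

# Wire the retriever, a conversation memory and the selected LLM into a
# ConversationalRetrievalChain. Three back-ends are supported: OpenAI's
# gpt-3.5-turbo, a locally loaded (4-bit quantised on GPU) Llama-2-7b-chat,
# or a remote text-generation-inference server given by INFERENCE_URL.
# Loading meta-llama/Llama-2-7b-chat-hf requires a Hugging Face token
# (HUGGINGFACEHUB_API_TOKEN) with access to the gated model.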
def get_conversation_chain(vectorstore, selected_llm, selected_temperature):
    print('Selected LLM: ' + selected_llm)
    print('Selected Temperature: ' + str(selected_temperature))
    if selected_llm == 'GPT 3.5':
        #openai_model = "gpt-4-turbo-preview"
        openai_model = "gpt-3.5-turbo"
        llm = ChatOpenAI(model=openai_model, temperature=selected_temperature)
    elif selected_llm == 'Llama2 local':
        model_id = 'meta-llama/Llama-2-7b-chat-hf'
        hf_auth = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
        model_config = transformers.AutoConfig.from_pretrained(
            model_id,
            token=hf_auth
        )
        device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
        if 'cuda' in device:
            # set quantization configuration to load large model with less GPU memory
            # this requires the `bitsandbytes` library
            bnb_config = transformers.BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type='nf4',
                bnb_4bit_use_double_quant=True,
                bnb_4bit_compute_dtype=bfloat16
            )
            model = transformers.AutoModelForCausalLM.from_pretrained(
                model_id,
                trust_remote_code=True,
                config=model_config,
                quantization_config=bnb_config,
                device_map='auto',
                token=hf_auth
            )
        else:
            model = transformers.AutoModelForCausalLM.from_pretrained(
                model_id,
                trust_remote_code=True,
                config=model_config,
                device_map='auto',
                token=hf_auth
            )
        # enable evaluation mode to allow model inference
        model.eval()
        print(f"Model loaded on {device}")
        tokenizer = transformers.AutoTokenizer.from_pretrained(
            model_id,
            token=hf_auth
        )
        pipeline = transformers.pipeline(
            torch_dtype=torch.float32,
            model=model,
            tokenizer=tokenizer,
            return_full_text=True,  # langchain expects the full text
            task='text-generation',
            temperature=selected_temperature,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
            max_new_tokens=512,  # max number of tokens to generate in the output
            repetition_penalty=1.1  # without this output begins repeating
        )
        llm = HuggingFacePipeline(pipeline=pipeline)
    elif selected_llm == 'Llama2 inference':
        llm = HuggingFaceTextGenInference(
            inference_server_url=os.environ.get("INFERENCE_URL"),
            max_new_tokens=50,
            timeout=1200,
            temperature=selected_temperature
        )
    # Generic LLM
    memory = ConversationBufferMemory(
        memory_key='chat_history', return_messages=True, output_key='answer')
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
        return_source_documents=True,
        verbose=True,
    )
    #print(conversation_chain)
    return conversation_chain
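

# Render one question/answer turn: invoke the chain, show the retrieved source
# documents in an expander, and replay the accumulated chat history with the
# user/bot HTML templates (even indices are user messages, odd ones bot replies).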
def handle_userinput(user_question):
    #print('Question: ' + user_question)
    response = st.session_state.conversation.invoke({'question': user_question})
    answer = response.get("answer")
    sources = response.get("source_documents", [])
    #print('Answer: ' + answer)
    #print('Sources: ' + str(sources))
    with st.expander("Sources"):
        st.write(str(sources))
    st.session_state.chat_history = response['chat_history']
    for i, message in enumerate(st.session_state.chat_history):
        if i % 2 == 0:
            st.write(user_template.replace(
                "{{MSG}}", message.content), unsafe_allow_html=True)
        else:
            st.write(bot_template.replace(
                "{{MSG}}", message.content), unsafe_allow_html=True)
def main():
    load_dotenv()
    st.set_page_config(page_title="VerAi",
                       page_icon=":books:")
    st.write(css, unsafe_allow_html=True)
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None
    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader(
            "Upload your new PDFs here and click on 'Process' or load the last upload by clicking on 'Load'", accept_multiple_files=True)
        selected_embedding = st.radio("Which Embedding?", ["Cohere-multilingual-v3.0", "OpenAI", "Instructor-xl"])
        selected_llm = st.radio("Which LLM?", ["GPT 3.5", "Llama2 local", "Llama2 inference"])
        selected_temperature = st.slider('Temperature?', 0.0, 1.0, 0.1)
        if st.button("Process"):
            with st.spinner("Processing"):
                # get pdf text
                raw_text = get_pdf_text(pdf_docs)
                # get the text chunks
                text_chunks = get_text_chunks(raw_text)
                # create vector store
                vectorstore = get_vectorstore(text_chunks, selected_embedding)
                # create conversation chain
                st.session_state.conversation = get_conversation_chain(
                    vectorstore, selected_llm, selected_temperature)
        if st.button("Load"):
            with st.spinner("Processing"):
                # load the previously saved vector store for the selected embedding
                vectorstore = load_vectorstore(selected_embedding)
                # create conversation chain
                st.session_state.conversation = get_conversation_chain(
                    vectorstore, selected_llm, selected_temperature)
    if st.session_state.conversation:
        st.header("VerAi :books:")
        user_question = st.text_input("Stel een vraag hieronder")
        # Example questions (in Dutch):
        #   Vertel me iets over Wettelijke uren
        #   wat zijn Overige verloftypes bij kpn
        if st.session_state.conversation and user_question:
            handle_userinput(user_question)


if __name__ == '__main__':
    main()