Spaces:

MedTiouti
/

SandHillRoadPodcast

Runtime error

App Files Files Community

SandHillRoadPodcast / app.py

Med Tiouti

Test8326832

5491a72 8 months ago

raw

history blame

No virus

3.94 kB

	import gradio as gr
	# retrievers
	from langchain.chains import RetrievalQA

	import textwrap
	import time

	import torch
	import transformers
	from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

	# models
	from langchain.llms import HuggingFacePipeline
	from InstructorEmbedding import INSTRUCTOR
	from langchain.embeddings import HuggingFaceInstructEmbeddings

	# prompts
	from langchain import PromptTemplate, LLMChain

	# vector stores
	from langchain.vectorstores import FAISS


	def get_model(model_name):
	    """Load the chat tokenizer and 4-bit quantized causal LM.

	    NOTE(review): ``model_name`` is accepted but never read; the
	    checkpoint below is loaded regardless of what the caller passes.

	    Returns:
	        A ``(tokenizer, model, max_len)`` tuple, where ``max_len`` is the
	        fixed value 2048.
	    """
	    checkpoint = 'daryl149/llama-2-7b-chat-hf'

	    # Loading options for the causal LM, gathered in one place.
	    load_options = dict(
	        load_in_4bit=True,
	        device_map='auto',
	        torch_dtype=torch.float16,
	        low_cpu_mem_usage=True,
	        trust_remote_code=True,
	    )

	    chat_tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True)
	    chat_model = AutoModelForCausalLM.from_pretrained(checkpoint, **load_options)

	    return chat_tokenizer, chat_model, 2048


	# Load the tokenizer and model once, at import time.
	# NOTE(review): get_model() does not read its argument, so the
	# "llama2-13b" label does not select the checkpoint - confirm intent.
	tokenizer, model, max_len = get_model("llama2-13b")

	# Generation settings. These must be plain scalars: a trailing comma
	# (``temperature = 0,``) binds a one-element tuple, which would then be
	# handed to the pipeline in place of a number.
	temperature = 0
	top_p = 0.95
	repetition_penalty = 1.15

	# Text-generation pipeline wrapping the loaded model; the EOS token is
	# reused as the padding token.
	pipe = pipeline(
	    task="text-generation",
	    model=model,
	    tokenizer=tokenizer,
	    pad_token_id=tokenizer.eos_token_id,
	    max_length=max_len,
	    temperature=temperature,
	    top_p=top_p,
	    repetition_penalty=repetition_penalty,
	)

	# LangChain adapter around the Hugging Face pipeline.
	llm = HuggingFacePipeline(pipeline=pipe)




	# similar passages
	k = 3

	# Location of the saved FAISS index and the embedding model used to
	# query it.
	embeddings_shl_path = "/content/faiss_index_shl"
	embeddings_model_repo = 'sentence-transformers/all-MiniLM-L6-v2'

	### download embeddings model
	# Use the GPU when one is visible to torch, otherwise fall back to CPU
	# instead of failing on a hard-coded "cuda" device.
	embeddings = HuggingFaceInstructEmbeddings(
	    model_name=embeddings_model_repo,
	    model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"},
	)

	### load vector DB embeddings
	vectordb = FAISS.load_local(
	    embeddings_shl_path,
	    embeddings,
	)


	# Prompt given to the LLM: the retrieved passages are substituted for
	# {context} and the user's message for {question}.
	prompt_template = """
	Don't try to make up an answer, if you don't know just say that you don't know.
	Answer in the same language the question was asked.
	Don't mention in the answer the speaker just give the answer directly.
	Use only the following pieces of context to answer the question at the end.

	{context}

	Question: {question}
	Answer:"""


	PROMPT = PromptTemplate(
	    template=prompt_template,
	    input_variables=["context", "question"],
	)

	# ``search_type`` is an argument of as_retriever() itself, not one of the
	# per-search keyword arguments, so it is passed at the top level. The
	# number of passages comes from the module-level ``k`` rather than a
	# duplicated literal.
	retriever = vectordb.as_retriever(
	    search_type="similarity",
	    search_kwargs={"k": k},
	)

	# Retrieval QA chain: fetch passages, fill PROMPT, and return both the
	# answer and the source documents it was built from.
	qa_chain = RetrievalQA.from_chain_type(
	    llm=llm,
	    chain_type="stuff",  # map_reduce, map_rerank, stuff, refine
	    retriever=retriever,
	    chain_type_kwargs={"prompt": PROMPT},
	    return_source_documents=True,
	    verbose=False,
	)

	def wrap_text_preserve_newlines(text, width=700):
	    """Wrap ``text`` to ``width`` columns while keeping its line breaks.

	    The text is split on newline characters, each piece is wrapped on its
	    own with ``textwrap.fill``, and the pieces are joined back together
	    with newlines.
	    """
	    return '\n'.join(
	        textwrap.fill(segment, width=width) for segment in text.split('\n')
	    )

	def process_llm_response(llm_response):
	    """Format a QA-chain response as answer text plus a source list.

	    Args:
	        llm_response: Mapping with a ``'result'`` string and a
	            ``'source_documents'`` iterable whose items expose
	            ``metadata['source']`` as a '/'-separated path.

	    Returns:
	        ``(ans, sources_used)``: ``ans`` is the wrapped answer followed by
	        the source list; ``sources_used`` is the source list alone, one
	        bold entry per source document.

	    Raises:
	        KeyError: if ``'result'``, ``'source_documents'`` or a document's
	            ``'source'`` metadata entry is missing.
	    """
	    # Local import keeps this block self-contained; the file's import
	    # section does not bring in ``os``.
	    from os.path import splitext

	    ans = wrap_text_preserve_newlines(llm_response['result'])

	    # Keep only the file name and drop its extension, whatever its length.
	    # Slicing off a fixed four characters mangles names that have no
	    # extension or one that is not three letters long.
	    episode_names = (
	        splitext(source.metadata['source'].split('/')[-1])[0]
	        for source in llm_response['source_documents']
	    )
	    sources_used = ' \n'.join(
	        "<b> - " + name + "</b>" for name in episode_names
	    )

	    ans += "\n Sand Hill Road podcast episodes based on your question : \n" + sources_used
	    return ans, sources_used

	def llm_ans(query):
	    """Run ``query`` through the QA chain and time the whole round trip.

	    Returns a 3-tuple: the formatted answer, the formatted source list,
	    and a short "Time elapsed" string with the duration rounded to whole
	    seconds. The timing covers both the chain call and the formatting.
	    """
	    started_at = time.time()
	    answer, sources = process_llm_response(qa_chain(query))
	    finished_at = time.time()

	    elapsed_seconds = int(round(finished_at - started_at, 0))
	    return answer, sources, f'\n\nTime elapsed: {elapsed_seconds} s'


	def predict(message, history):
	    """Chat callback: answer ``message`` and return it with HTML line breaks.

	    ``history`` is part of the callback signature but is not read here.
	    Only the first element returned by ``llm_ans`` (the answer text) is
	    used; every newline in it is replaced with ``<br/>``.
	    """
	    answer_text = llm_ans(message)[0]
	    return str(answer_text).replace("\n", "<br/>")

	# Build the chat UI around ``predict``, enable request queuing, and start
	# the app in debug mode.
	demo = gr.ChatInterface(fn=predict, title=' Sand Hill Road Podcast Chatbot')

	demo.queue()
	demo.launch(debug=True)