File size: 3,937 Bytes
5491a72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import gradio as gr
# retrievers
from langchain.chains import RetrievalQA

import textwrap
import time

import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# models
from langchain.llms import HuggingFacePipeline
from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

# prompts
from langchain import PromptTemplate, LLMChain

# vector stores
from langchain.vectorstores import FAISS


def get_model(model_name):
    """Load the chat model and its tokenizer with ``from_pretrained``.

    NOTE(review): ``model_name`` is accepted but never read; the repository
    id below is used whatever the caller passes. Confirm that is intended.

    Args:
        model_name: Label supplied by the caller (currently ignored).

    Returns:
        A ``(tokenizer, model, max_len)`` tuple; ``max_len`` is the fixed
        value 2048.
    """
    repo_id = 'daryl149/llama-2-7b-chat-hf'
    max_len = 2048

    # Tokenizer is loaded before the model weights.
    tokenizer = AutoTokenizer.from_pretrained(repo_id, use_fast=True)

    # Loading options: 4-bit loading, automatic device placement, fp16 dtype.
    load_options = dict(
        load_in_4bit=True,
        device_map='auto',
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    )
    model = AutoModelForCausalLM.from_pretrained(repo_id, **load_options)

    return tokenizer, model, max_len


# NOTE(review): the label passed here is not read by get_model, which always
# loads its hard-coded repository.
tokenizer, model, max_len = get_model("llama2-13b")


# Generation settings. Keep these as plain scalars: a trailing comma after the
# value would silently turn it into a one-element tuple, e.g. `(0,)`.
temperature = 0
top_p = 0.95
repetition_penalty = 1.15

# Text-generation pipeline around the loaded model. The EOS token id doubles
# as the padding id, and `max_length` is the value returned by get_model.
pipe = pipeline(
    task = "text-generation",
    model = model,
    tokenizer = tokenizer,
    pad_token_id = tokenizer.eos_token_id,
    max_length = max_len,
    temperature = temperature,
    top_p = top_p,
    repetition_penalty = repetition_penalty
)

# LangChain wrapper so the pipeline can be used as the chain's LLM.
llm = HuggingFacePipeline(pipeline = pipe)




# Number of similar passages
k = 3

# Location of the saved FAISS index and id of the embedding checkpoint
embeddings_shl_path = "/content/faiss_index_shl"
embeddings_model_repo = 'sentence-transformers/all-MiniLM-L6-v2'

# Embedding model, placed on the "cuda" device.
# NOTE(review): the checkpoint id is a sentence-transformers model while the
# wrapper class is the Instructor one - confirm this matches how the index on
# disk was built.
embeddings = HuggingFaceInstructEmbeddings(
    model_kwargs={"device": "cuda"},
    model_name=embeddings_model_repo,
)

# Load the vector store from disk, using the embedding model above
vectordb = FAISS.load_local(embeddings_shl_path, embeddings)


# Instruction text for the model. `{context}` and `{question}` are the two
# placeholders declared in `input_variables` below.
prompt_template = """
Don't try to make up an answer, if you don't know just say that you don't know.
Answer in the same language the question was asked.
Don't mention in the answer the speaker just give the answer directly.
Use only the following pieces of context to answer the question at the end.

{context}

Question: {question}
Answer:"""


# Prompt object handed to the QA chain
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Similarity retriever over the FAISS store. `search_type` is a keyword
# argument of `as_retriever` itself; `search_kwargs` only carries options
# forwarded to the search call, so the passage count is its only entry.
retriever = vectordb.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

# Retrieval QA chain using the "stuff" chain type (other chain types:
# map_reduce, map_rerank, refine). Source documents are returned with the
# answer so they can be listed to the user.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True,
    verbose=False,
)

def wrap_text_preserve_newlines(text, width=700):
    """Wrap *text* to *width* columns while keeping its existing line breaks.

    The text is split on ``'\\n'``, every piece is wrapped on its own with
    ``textwrap.fill``, and the pieces are joined back with ``'\\n'``.

    Args:
        text: The string to wrap.
        width: Maximum line width passed to ``textwrap.fill``.

    Returns:
        The wrapped string.
    """
    return '\n'.join(
        textwrap.fill(segment, width=width) for segment in text.split('\n')
    )

def process_llm_response(llm_response):
    """Format a chain response as answer text followed by its source list.

    Args:
        llm_response: Mapping with a 'result' string and a
            'source_documents' iterable whose items expose
            ``metadata['source']``.

    Returns:
        ``(ans, sources_used)``: the wrapped answer with the source list
        appended, and the source list on its own.
    """
    ans = wrap_text_preserve_newlines(llm_response['result'])

    source_lines = []
    for document in llm_response['source_documents']:
        # Keep only the last path component of the source.
        file_name = document.metadata['source'].split('/')[-1]
        # Drop the last four characters of the file name.
        # NOTE(review): presumably a 3-letter extension such as ".txt" - confirm.
        source_lines.append("<b> - " + file_name[:-4] + "</b>")
    sources_used = ' \n'.join(source_lines)

    ans += "\n Sand Hill Road podcast episodes based on your question : \n" + sources_used
    return ans, sources_used

def llm_ans(query):
    """Answer *query* with the QA chain and report how long it took.

    The timed span covers both the chain call and the formatting of its
    response.

    Args:
        query: The question passed to ``qa_chain``.

    Returns:
        ``(ans, sources_used, time_elapsed_str)``: the formatted answer, the
        source list, and a string with the elapsed whole seconds.
    """
    started_at = time.time()
    ans, sources_used = process_llm_response(qa_chain(query))
    finished_at = time.time()

    time_elapsed = int(round(finished_at - started_at, 0))
    return ans, sources_used, f'\n\nTime elapsed: {time_elapsed} s'


def predict(message, history):
    """Chat callback: return the answer for *message* as an HTML-ready string.

    Args:
        message: The user's latest message.
        history: Earlier turns supplied by the chat UI (not used here).

    Returns:
        The answer text with every newline replaced by ``<br/>``.
    """
    # Only the first element (the answer text) of llm_ans' result is shown.
    answer = llm_ans(message)[0]
    return str(answer).replace("\n", "<br/>")

# Chat UI that routes each user message through `predict`
demo = gr.ChatInterface(fn=predict, title=' Sand Hill Road Podcast Chatbot')

# Enable the request queue, then start the app in debug mode
demo.queue()
demo.launch(debug=True)