dinhquangson committed
Commit 7dbf133
1 Parent(s): 6fb7c86

Update app.py

Files changed (1)
  1. app.py +21 -2
app.py CHANGED
@@ -17,7 +17,9 @@ from langchain.chat_models import ChatOpenAI
 from langchain.memory import ConversationBufferMemory
 from langchain.chains import ConversationalRetrievalChain
 from htmlTemplates import css, bot_template, user_template
-from langchain.llms import HuggingFaceHub
+# from langchain.llms import HuggingFaceHub
+from llama_index.llms import LlamaCPP
+
 
 
 def get_pdf_pages(pdf_docs):
@@ -108,11 +110,28 @@ def get_conversation_chain(vectorstore):
     ConversationalRetrievalChain
         A conversational retrieval chain for generating responses.
 
-    """
+
     llm = HuggingFaceHub(
         repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
         model_kwargs={"temperature": 0.5, "max_new_tokens": 1024, "max_length": 1048, "top_k": 3, "trust_remote_code": True, "torch_dtype": "auto"},
     )
+    """
+    llm = LlamaCPP(
+        model_url=None,  # we'll load locally from model_path
+        # trying a small quantized version of an already small model
+        model_path='phi-2.Q4_K_M.gguf',
+        temperature=0.1,
+        max_new_tokens=512,
+        context_window=2048,  # Phi-2's 2K context window; retrieved content must fit here, which can be limiting for RAG
+        generate_kwargs={},
+        # set to at least 1 to use the GPU
+        # this is a small model, so 32 layers offloads effectively all of it
+        model_kwargs={"n_gpu_layers": 32},
+        messages_to_prompt=messages_to_prompt,
+        completion_to_prompt=completion_to_prompt,
+        verbose=True
+    )
+
     # llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")
 
     memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
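
The new code passes `messages_to_prompt` and `completion_to_prompt` to `LlamaCPP`, but neither helper is defined or imported in the visible hunks; llama_index's `LlamaCPP` accepts them as callables that turn a list of chat messages, or a bare completion request, into the model's expected prompt string. A minimal sketch of what they might look like, assuming Phi-2's common "Instruct:/Output:" prompt convention (both function bodies are illustrative, not taken from this commit):

def completion_to_prompt(completion: str) -> str:
    # Hypothetical: wrap a plain completion request in Phi-2's instruct format.
    return f"Instruct: {completion}\nOutput:"

def messages_to_prompt(messages) -> str:
    # Hypothetical: flatten chat messages (objects with .role and .content)
    # into a single Phi-2-style prompt string.
    turns = "\n".join(f"{m.role}: {m.content}" for m in messages)
    return f"Instruct: {turns}\nOutput:"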
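
`model_path='phi-2.Q4_K_M.gguf'` also assumes the quantized weights already sit next to app.py. One way to fetch them, assuming the community TheBloke/phi-2-GGUF repo hosts this quantization (repo id and filename are assumptions, not part of the commit):

from huggingface_hub import hf_hub_download

# Download the GGUF file once; hf_hub_download returns the cached local path.
model_path = hf_hub_download(
    repo_id="TheBloke/phi-2-GGUF",     # assumed community GGUF build of microsoft/phi-2
    filename="phi-2.Q4_K_M.gguf",
)
# Pass this path to LlamaCPP(model_path=model_path, ...).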
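
One design note: `LlamaCPP` here comes from llama_index, while the chain built below is LangChain's `ConversationalRetrievalChain`, which expects a LangChain LLM, so the two interfaces don't plug together directly. If the goal is a local llama.cpp model inside this LangChain chain, LangChain's own wrapper is the closer fit. A sketch using `langchain.llms.LlamaCpp` — a different wrapper than the one this commit imports, with parameter names following llama-cpp-python:

from langchain.llms import LlamaCpp

llm = LlamaCpp(
    model_path="phi-2.Q4_K_M.gguf",
    temperature=0.1,
    max_tokens=512,    # counterpart of max_new_tokens above
    n_ctx=2048,        # context window
    n_gpu_layers=32,   # layers to offload to the GPU
    verbose=True,
)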