Alex5666 committed on
Commit
db3b463
1 Parent(s): 80d0cdb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -48
app.py CHANGED
@@ -1,95 +1,101 @@
1
- # Import required modules from 'langchain' for document processing, embeddings, Q&A, etc.
2
- from langchain.document_loaders import PyPDFLoader
3
- from langchain.text_splitter import RecursiveCharacterTextSplitter
4
- from langchain.vectorstores import Chroma
5
- from langchain.embeddings import OpenAIEmbeddings
6
- from langchain.chat_models import ChatOpenAI
7
- from langchain.chains import RetrievalQA
 
 
 
 
 
 
 
8
 
9
- # Importing Streamlit for creating the web app, and other necessary modules for file handling.
10
- import streamlit as st
11
- import tempfile
12
- import os
13
 
14
  # Import a handler for streaming outputs.
15
- from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
16
 
17
  # Set the title of the Streamlit web application.
18
  st.title("ChatPDF")
19
- # Create a horizontal line for better visual separation in the app.
20
  st.write("---")
21
 
22
- # Provide an input box for users to enter their OpenAI API key.
23
- openai_key = st.text_input('Enter OPEN_AI_API_KEY', type="password")
24
-
25
- # Provide a file upload widget to let users upload their PDF files.
26
  uploaded_file = st.file_uploader("Upload your PDF file!", type=['pdf'])
27
- # Another visual separation after the file uploader.
28
  st.write("---")
29
 
30
- # Define a function that converts the uploaded PDF into a document format.
31
  def pdf_to_document(uploaded_file):
32
- # Create a temporary directory to store the uploaded PDF file temporarily.
33
  temp_dir = tempfile.TemporaryDirectory()
34
- # Join the directory path with the uploaded file name to get the complete path.
35
  temp_filepath = os.path.join(temp_dir.name, uploaded_file.name)
36
 
37
- # Write the content of the uploaded file into the temporary file path.
38
  with open(temp_filepath, "wb") as f:
39
  f.write(uploaded_file.getvalue())
40
 
41
- # Use PyPDFLoader to read and split the PDF into individual pages.
42
  loader = PyPDFLoader(temp_filepath)
43
  pages = loader.load_and_split()
44
  return pages
45
 
46
- # Check if a file has been uploaded by the user.
47
  if uploaded_file is not None:
48
  # Convert the uploaded PDF into a document format.
49
  pages = pdf_to_document(uploaded_file)
50
 
51
- # Initialize a text splitter to break the document into smaller chunks.
52
  text_splitter = RecursiveCharacterTextSplitter(
53
- # Define parameters for the splitter: chunk size, overlap, etc.
54
- chunk_size = 300,
55
- chunk_overlap = 20,
56
- length_function = len
57
  )
58
- # Split the document pages into chunks.
59
  texts = text_splitter.split_documents(pages)
60
 
61
- # Initialize the OpenAIEmbeddings model for creating embeddings from texts using the provided API key.
62
- embeddings_model = OpenAIEmbeddings(openai_api_key=openai_key)
 
 
 
63
 
64
- # Load the textual chunks into Chroma after creating embeddings.
65
- db = Chroma.from_documents(texts, embeddings_model)
66
 
67
- # Define a custom handler to stream outputs to the Streamlit app.
68
  from langchain.callbacks.base import BaseCallbackHandler
69
  class StreamHandler(BaseCallbackHandler):
70
  def __init__(self, container, initial_text=""):
71
- self.container = container
72
  self.text=initial_text
73
  def on_llm_new_token(self, token: str, **kwargs) -> None:
74
- self.text+=token
75
- self.container.markdown(self.text)
76
 
77
- # Display a header for the question section of the web app.
78
  st.header("Ask the PDF a question!")
79
- # Provide an input box for users to type in their questions.
80
  question = st.text_input('Type your question')
81
 
82
- # Check if the user has clicked on the 'Ask' button.
83
  if st.button('Ask'):
84
- # Show a spinner animation while processing the user's question.
85
  with st.spinner('Processing...'):
86
- # Create a space to display the answer.
87
  chat_box = st.empty()
88
- # Initialize a handler to stream outputs.
89
  stream_hander = StreamHandler(chat_box)
90
- # Initialize the ChatOpenAI model for Q&A tasks with streaming enabled.
91
- llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, openai_api_key=openai_key, streaming=True, callbacks=[stream_hander])
92
- # Create a RetrievalQA chain that uses the ChatOpenAI model and Chroma retriever to answer the question.
93
  qa_chain = RetrievalQA.from_chain_type(llm, retriever=db.as_retriever())
94
- # Fetch the answer to the user's question.
95
  qa_chain({"query": question})
 
1
+ # Import necessary modules for processing documents, embeddings, Q&A, etc. from 'langchain' library.
2
+ from dotenv import load_dotenv
3
+ load_dotenv() # Load environment variables from a .env file.
4
+ from langchain.document_loaders import PyPDFLoader # For loading and reading PDF documents.
5
+ from langchain.text_splitter import RecursiveCharacterTextSplitter # For splitting large texts into smaller chunks.
6
+ from langchain.vectorstores import Chroma # Vector storage system for embeddings.
7
+ from langchain.llms import CTransformers # LLM wrapper that loads local GGML model files via the ctransformers library.
8
+ from InstructorEmbedding import INSTRUCTOR # INSTRUCTOR model class from the InstructorEmbedding package (instruction-tuned text embeddings).
9
+ from langchain.embeddings import HuggingFaceInstructEmbeddings # Embeddings from HuggingFace models with instructions.
10
+ from langchain.embeddings import HuggingFaceEmbeddings # General embeddings from HuggingFace models.
11
+ from langchain.embeddings import LlamaCppEmbeddings # Embeddings using the Llama model.
12
+ from langchain.chains import RetrievalQA # Q&A retrieval system.
13
+ from langchain.embeddings import OpenAIEmbeddings # Embeddings from OpenAI models.
14
+ from langchain.vectorstores import FAISS # Another vector storage system for embeddings.
15
 
16
+ # Import Streamlit for creating a web application and other necessary modules for file handling.
17
+ import streamlit as st # Main library for creating the web application.
18
+ import tempfile # For creating temporary directories and files.
19
+ import os # For handling file and directory paths.
20
 
21
  # Import a handler for streaming outputs.
22
+ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler # Streams LLM tokens to stdout; the Streamlit display uses the custom StreamHandler defined below.
23
 
24
  # Set the title of the Streamlit web application.
25
  st.title("ChatPDF")
26
+ # Create a visual separator in the app.
27
  st.write("---")
28
 
29
+ # Add a file uploader widget for users to upload their PDF files.
 
 
 
30
  uploaded_file = st.file_uploader("Upload your PDF file!", type=['pdf'])
31
+ # Another visual separator after the file uploader.
32
  st.write("---")
33
 
34
+ # Function to convert the uploaded PDF into a readable document format.
35
  def pdf_to_document(uploaded_file):
36
+ # Create a temporary directory for storing the uploaded PDF.
37
  temp_dir = tempfile.TemporaryDirectory()
38
+ # Get the path where the uploaded PDF will be stored temporarily.
39
  temp_filepath = os.path.join(temp_dir.name, uploaded_file.name)
40
 
41
+ # Save the uploaded PDF to the temporary path.
42
  with open(temp_filepath, "wb") as f:
43
  f.write(uploaded_file.getvalue())
44
 
45
+ # Load the PDF and split it into individual pages.
46
  loader = PyPDFLoader(temp_filepath)
47
  pages = loader.load_and_split()
48
  return pages
49
 
50
+ # Check if a user has uploaded a file.
51
  if uploaded_file is not None:
52
  # Convert the uploaded PDF into a document format.
53
  pages = pdf_to_document(uploaded_file)
54
 
55
+ # Initialize a tool to split the document into smaller textual chunks.
56
  text_splitter = RecursiveCharacterTextSplitter(
57
+ chunk_size = 300, # Define the size of each chunk.
58
+ chunk_overlap = 20, # Number of characters shared between consecutive chunks.
59
+ length_function = len # Function to determine the length of texts.
 
60
  )
61
+ # Split the document into chunks.
62
  texts = text_splitter.split_documents(pages)
63
 
64
+ ## Several embedding backends are imported above; only the HuggingFace sentence-transformers model is used here.
65
+
66
+ # Load the desired embeddings model.
67
+ embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2',
68
+ model_kwargs={'device': 'cpu'})
69
 
70
+ # Load the textual chunks into the Chroma vector store.
71
+ db = Chroma.from_documents(texts, embeddings)
72
 
73
+ # Custom handler to stream outputs live to the Streamlit application.
74
  from langchain.callbacks.base import BaseCallbackHandler
75
  class StreamHandler(BaseCallbackHandler):
76
  def __init__(self, container, initial_text=""):
77
+ self.container = container # Streamlit container to display text.
78
  self.text=initial_text
79
  def on_llm_new_token(self, token: str, **kwargs) -> None:
80
+ self.text+=token # Add new tokens to the text.
81
+ self.container.markdown(self.text) # Display the text.
82
 
83
+ # Header for the Q&A section of the web app.
84
  st.header("Ask the PDF a question!")
85
+ # Input box for users to type their questions.
86
  question = st.text_input('Type your question')
87
 
88
+ # Check if the user has pressed the 'Ask' button.
89
  if st.button('Ask'):
90
+ # Display a spinner while processing the question.
91
  with st.spinner('Processing...'):
92
+ # Space to display the answer.
93
  chat_box = st.empty()
94
+ # Initialize the handler to stream outputs.
95
  stream_hander = StreamHandler(chat_box)
96
+
97
+ # Initialize the Q&A model and chain.
98
+ llm = CTransformers(model="llama-2-7b-chat.ggmlv3.q2_K.bin", model_type="llama", callbacks=[stream_hander])
99
  qa_chain = RetrievalQA.from_chain_type(llm, retriever=db.as_retriever())
100
+ # Get the answer to the user's question.
101
  qa_chain({"query": question})