yiyii committed
Commit f1076bf
1 Parent(s): 1fad5db
Files changed (1)
  1. app.py +5 -5
app.py CHANGED
@@ -78,16 +78,15 @@ def generate(image, pdfs_path, temperature=0.9, max_new_tokens=1500, top_p=0.95,
 
     # load uploaded pdf file
     loaders = [PyPDFLoader(file) for file in pdfs_path]
-    pages = []
-    for loader in loaders:
-        pages.extend(loader.load())
+    documents = [loader.load() for loader in loaders]
+
 
     # split the content into chunks
     text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
     # TokenTextSplitter() can ensure the integrity of words
     # each chunk overlaps the previous chunk by 20 tokens
     #texts = text_splitter.split_text(state_of_the_union)
-    texts = text_splitter.split_documents(pages)
+    texts = text_splitter.split_documents(documents)
     print("...........................................")
     # print the first chunk
     print("text[0]: ", texts[0])
@@ -98,7 +97,8 @@ def generate(image, pdfs_path, temperature=0.9, max_new_tokens=1500, top_p=0.95,
     # hnsw is used for organizing the data into an efficient structure that supports rapid retrieval operations (speeds up the search).
     # cosine similarity is used for telling the hnsw algorithm how to measure the distance between vectors.
     # by setting the space to cosine, the index will operate using cosine similarity to measure the vectors' similarity.
-    vector_store = Chroma.from_texts(texts, embeddings, collection_metadata = {"hnsw:space":"cosine"}, persist_directory="stores/story_cosine" )
+    #vector_store = Chroma.from_texts(texts, embeddings, collection_metadata = {"hnsw:space":"cosine"}, persist_directory="stores/story_cosine" )
+    vector_store = Chroma.from_documents(texts, embeddings, collection_metadata = {"hnsw:space":"cosine"}, persist_directory="stores/story_cosine" )
     print("vector store created........................")
 
     load_vector_store = Chroma(persist_directory="stores/story_cosine", embedding_function=embeddings)
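
For context, a minimal standalone sketch of the ingestion path this commit touches: loading the uploaded PDFs and token-splitting them. The import paths, the sample pdfs_path value, and the chunk sizes are assumptions for illustration; app.py passes pdfs_path, chunk_size, and chunk_overlap into generate() itself. Note that PyPDFLoader.load() returns a list of Document objects (one per page), so the sketch flattens the per-file lists before calling split_documents().

# Sketch only; assumes langchain-community, pypdf, and tiktoken are installed.
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import TokenTextSplitter

pdfs_path = ["example.pdf"]          # hypothetical input; generate() receives this as an argument
chunk_size, chunk_overlap = 500, 20  # illustrative values; app.py supplies its own

loaders = [PyPDFLoader(file) for file in pdfs_path]
# Each loader.load() returns a list of Document objects (one per PDF page),
# so flatten the per-file lists into a single list of pages before splitting.
documents = [page for loader in loaders for page in loader.load()]

# TokenTextSplitter splits on token boundaries (keeping words intact) and
# makes consecutive chunks overlap by chunk_overlap tokens.
text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
texts = text_splitter.split_documents(documents)
print("text[0]:", texts[0])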
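
Similarly, a hedged sketch of the vector-store step: Chroma.from_documents() accepts Document chunks directly (where from_texts() expects plain strings), and the collection_metadata entry "hnsw:space": "cosine" tells the underlying HNSW index to measure distance between vectors with cosine similarity. The embedding model below is a placeholder; app.py defines its own embeddings object elsewhere, and `texts` is the list of Document chunks produced by the splitter above.

# Sketch only; assumes chromadb and sentence-transformers are installed.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")  # placeholder model

# from_documents keeps each chunk's metadata (e.g. source page) alongside its text;
# the hnsw:space setting makes the index rank results by cosine similarity.
vector_store = Chroma.from_documents(
    texts,
    embeddings,
    collection_metadata={"hnsw:space": "cosine"},
    persist_directory="stores/story_cosine",
)
print("vector store created")

# The persisted index can be reopened later with the same embedding function.
load_vector_store = Chroma(persist_directory="stores/story_cosine", embedding_function=embeddings)
retriever = load_vector_store.as_retriever(search_kwargs={"k": 2})  # illustrative retriever over the reopened store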