yiyii committed
Commit f1076bf
1 Parent(s): 1fad5db
Files changed (1)
  1. app.py +5 -5
app.py CHANGED
@@ -78,16 +78,15 @@ def generate(image, pdfs_path, temperature=0.9, max_new_tokens=1500, top_p=0.95,
 
     # load uploaded pdf file
     loaders = [PyPDFLoader(file) for file in pdfs_path]
-    pages = []
-    for loader in loaders:
-        pages.extend(loader.load())
+    documents = [loader.load() for loader in loaders]
+
 
     # split the content into chunks
     text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
     # TokenTextSplitter() can ensure the integrity of words
     # each chunk overlaps the previous chunk by 20 tokens
     #texts = text_splitter.split_text(state_of_the_union)
-    texts = text_splitter.split_documents(pages)
+    texts = text_splitter.split_documents(documents)
     print("...........................................")
     # print the first chunk
     print("text[0]: ", texts[0])
@@ -98,7 +97,8 @@ def generate(image, pdfs_path, temperature=0.9, max_new_tokens=1500, top_p=0.95,
     # hnsw is used for organizing the data into an efficient structure that supports rapid retrieval operations (speeds up the search).
     # cosine similarity is used for telling the hnsw algorithm how to measure the distance between vectors.
     # by setting the space to cosine, the index will operate using cosine similarity to measure the vectors' similarity.
-    vector_store = Chroma.from_texts(texts, embeddings, collection_metadata = {"hnsw:space":"cosine"}, persist_directory="stores/story_cosine" )
+    #vector_store = Chroma.from_texts(texts, embeddings, collection_metadata = {"hnsw:space":"cosine"}, persist_directory="stores/story_cosine" )
+    vector_store = Chroma.from_documents(texts, embeddings, collection_metadata = {"hnsw:space":"cosine"}, persist_directory="stores/story_cosine" )
     print("vector store created........................")
 
     load_vector_store = Chroma(persist_directory="stores/story_cosine", embedding_function=embeddings)
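
For context, a minimal standalone sketch of the ingestion path this commit touches: loading the uploaded PDFs and token-splitting them. The import paths, the sample pdfs_path value, and the chunk sizes are assumptions for illustration; app.py passes pdfs_path, chunk_size, and chunk_overlap into generate() itself. Note that PyPDFLoader.load() returns a list of Document objects (one per page), so the sketch flattens the per-file lists before calling split_documents().

# Sketch only; assumes langchain-community, pypdf, and tiktoken are installed.
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import TokenTextSplitter

pdfs_path = ["example.pdf"]          # hypothetical input; generate() receives this as an argument
chunk_size, chunk_overlap = 500, 20  # illustrative values; app.py supplies its own

loaders = [PyPDFLoader(file) for file in pdfs_path]
# Each loader.load() returns a list of Document objects (one per PDF page),
# so flatten the per-file lists into a single list of pages before splitting.
documents = [page for loader in loaders for page in loader.load()]

# TokenTextSplitter splits on token boundaries (keeping words intact) and
# makes consecutive chunks overlap by chunk_overlap tokens.
text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
texts = text_splitter.split_documents(documents)
print("text[0]:", texts[0])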
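
Similarly, a hedged sketch of the vector-store step: Chroma.from_documents() accepts Document chunks directly (where from_texts() expects plain strings), and the collection_metadata entry "hnsw:space": "cosine" tells the underlying HNSW index to measure distance between vectors with cosine similarity. The embedding model below is a placeholder; app.py defines its own embeddings object elsewhere, and `texts` is the list of Document chunks produced by the splitter above.

# Sketch only; assumes chromadb and sentence-transformers are installed.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")  # placeholder model

# from_documents keeps each chunk's metadata (e.g. source page) alongside its text;
# the hnsw:space setting makes the index rank results by cosine similarity.
vector_store = Chroma.from_documents(
    texts,
    embeddings,
    collection_metadata={"hnsw:space": "cosine"},
    persist_directory="stores/story_cosine",
)
print("vector store created")

# The persisted index can be reopened later with the same embedding function.
load_vector_store = Chroma(persist_directory="stores/story_cosine", embedding_function=embeddings)
retriever = load_vector_store.as_retriever(search_kwargs={"k": 2})  # illustrative retriever over the reopened store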