twelve
Browse files
app.py
CHANGED
@@ -78,16 +78,15 @@ def generate(image, pdfs_path, temperature=0.9, max_new_tokens=1500, top_p=0.95,
|
|
78 |
|
79 |
# load uploaded pdf file
|
80 |
loaders = [PyPDFLoader(file) for file in pdfs_path]
|
81 |
-
|
82 |
-
|
83 |
-
pages.extend(loader.load())
|
84 |
|
85 |
# split the content into chunks
|
86 |
text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
|
87 |
# TokenTextSplitter() can ensure the integrity of words
|
88 |
# each chunk overlaps the previous chunk by chunk_overlap tokens (e.g. 20)
|
89 |
#texts = text_splitter.split_text(state_of_the_union)
|
90 |
-
texts = text_splitter.split_documents(
|
91 |
print("...........................................")
|
92 |
# print the first chunk
|
93 |
print("text[0]: ", texts[0])
|
@@ -98,7 +97,8 @@ def generate(image, pdfs_path, temperature=0.9, max_new_tokens=1500, top_p=0.95,
|
|
98 |
# hnsw organizes the data into an efficient structure that supports rapid retrieval operations (i.e. it speeds up the search).
|
99 |
# cosine similarity is used for telling the hnsw algorithm how to measure the distance between vectors.
|
100 |
# by setting space to cosine, the index will use cosine similarity to measure the similarity between vectors.
|
101 |
-
vector_store = Chroma.from_texts(texts, embeddings, collection_metadata = {"hnsw:space":"cosine"}, persist_directory="stores/story_cosine" )
|
|
|
102 |
print("vector store created........................")
|
103 |
|
104 |
load_vector_store = Chroma(persist_directory="stores/story_cosine", embedding_function=embeddings)
|
|
|
78 |
|
79 |
# load uploaded pdf file
|
80 |
loaders = [PyPDFLoader(file) for file in pdfs_path]
|
81 |
+
documents = [loader.load() for loader in loaders]
|
82 |
+
|
|
|
83 |
|
84 |
# split the content into chunks
|
85 |
text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
|
86 |
# TokenTextSplitter() can ensure the integrity of words
|
87 |
# each chunk overlaps the previous chunk by chunk_overlap tokens (e.g. 20)
|
88 |
#texts = text_splitter.split_text(state_of_the_union)
|
89 |
+
texts = text_splitter.split_documents(documents)
|
90 |
print("...........................................")
|
91 |
# print the first chunk
|
92 |
print("text[0]: ", texts[0])
|
|
|
97 |
# hnsw organizes the data into an efficient structure that supports rapid retrieval operations (i.e. it speeds up the search).
|
98 |
# cosine similarity is used for telling the hnsw algorithm how to measure the distance between vectors.
|
99 |
# by setting space to cosine, the index will use cosine similarity to measure the similarity between vectors.
|
100 |
+
#vector_store = Chroma.from_texts(texts, embeddings, collection_metadata = {"hnsw:space":"cosine"}, persist_directory="stores/story_cosine" )
|
101 |
+
vector_store = Chroma.from_documents(texts, embeddings, collection_metadata = {"hnsw:space":"cosine"}, persist_directory="stores/story_cosine" )
|
102 |
print("vector store created........................")
|
103 |
|
104 |
load_vector_store = Chroma(persist_directory="stores/story_cosine", embedding_function=embeddings)
|