ppsingh committed on
Commit
5ded842
1 Parent(s): 857b56b

Update auditqa/doc_process.py

Browse files
Files changed (1) hide show
  1. auditqa/doc_process.py +23 -0
auditqa/doc_process.py CHANGED
@@ -50,4 +50,27 @@ def process_pdf():
50
  doc.metadata["year"] = file[-4:]
51
 
52
  all_documents[category].append(doc_processed)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
 
50
  doc.metadata["year"] = file[-4:]
51
 
52
  all_documents[category].append(doc_processed)
53
+
54
+ for key, docs_processed in all_documents.items():
55
+ docs_processed = [item for sublist in docs_processed for item in sublist]
56
+ all_documents[key] = docs_processed
57
+
58
+ embeddings = HuggingFaceEmbeddings(
59
+ model_kwargs = {'device': device},
60
+ encode_kwargs = {'normalize_embeddings': True},
61
+ model_name="BAAI/bge-small-en-v1.5"
62
+ )
63
+
64
+ qdrant_collections = {}
65
+
66
+ for file,value in all_documents.items():
67
+ print("emebddings for:",file)
68
+ qdrant_collections[file] = Qdrant.from_documents(
69
+ value,
70
+ embeddings,
71
+ location=":memory:",
72
+ collection_name=file,
73
+ )
74
+ print("done")
75
+ return qdrant_collections
76