# Hosting-page metadata captured with this file (not part of the program):
# BNS-IPC_CRPC / Ingest.py
# nik-one's picture
# Upload 4 files
# 3370ffa verified
# raw
# history blame contribute delete
# No virus
# 2.25 kB
import logging

import ray
from faiss import IndexFlatL2  # Assuming using L2 distance for simplicity
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# Initialize Ray
ray.init()

# Set up basic configuration for logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load the documents matched by the glob inside the 'data' directory.
logging.info("Loading documents...")
loader = DirectoryLoader('data', glob="./*.txt")
documents = loader.load()

# Split every loaded document into overlapping chunks small enough to embed.
logging.info("Extracting and splitting texts from documents...")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
texts = []
for document in documents:
    # LangChain Document objects expose their text as `page_content`.
    # The previous `get_text` probe never matched, so every document was
    # treated as empty and `texts` ended up with no chunks at all.
    text_content = getattr(document, "page_content", "") or ""
    texts.extend(text_splitter.split_text(text_content))
logging.info("Prepared %d text chunks from %d documents.", len(texts), len(documents))
# Shared embedding model, created on first use. Constructing the model is
# expensive, so it must not be rebuilt for every text that gets embedded.
_EMBEDDINGS_MODEL = None


def embedding_function(text):
    """Return the embedding of ``text`` computed by the InLegalBERT model.

    The ``HuggingFaceEmbeddings`` wrapper for "law-ai/InLegalBERT" is built
    the first time this function is called and reused afterwards, instead of
    being re-instantiated on every call.

    Args:
        text: The string to embed.

    Returns:
        Whatever ``HuggingFaceEmbeddings.embed_query`` returns for ``text``
        (per the LangChain docs, a list of floats).
    """
    global _EMBEDDINGS_MODEL
    if _EMBEDDINGS_MODEL is None:
        _EMBEDDINGS_MODEL = HuggingFaceEmbeddings(model_name="law-ai/InLegalBERT")
    return _EMBEDDINGS_MODEL.embed_query(text)
# Create the FAISS index that will hold the embedding vectors.
# The dimension must equal the length of the vectors produced by
# embedding_function (768 is assumed for InLegalBERT -- adjust if the model changes).
index = IndexFlatL2(768)

# Start from an empty vector store. The FAISS wrapper needs a docstore object
# (a plain dict has no add/search methods), and the id mapping must start empty
# because the index holds no vectors yet; add_texts keeps all three in sync.
faiss_db = FAISS(
    embedding_function=embedding_function,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

# Embed every chunk and store both the vector and its source text.
logging.info("Storing embeddings in FAISS...")
if texts:
    # add_texts embeds the strings itself; add_documents expects Document
    # objects, so passing it raw embedding vectors (as before) fails.
    faiss_db.add_texts(texts)
else:
    logging.warning("No text chunks to embed; the exported database will be empty.")

# Exporting the vector embeddings database with logging
logging.info("Exporting the vector embeddings database...")
faiss_db.save_local("ipc_embed_db")

# Log a message to indicate the completion of the process
logging.info("Process completed successfully.")

# Shutdown Ray after the process
ray.shutdown()