"""Upload course documents to a Deep Lake vector store.

Reads a CSV of pre-chunked documents and batch-uploads them (computing
embeddings along the way) with buster's DeepLakeDocumentsManager.
"""

import os

import numpy as np  # used by the commented-out checks below
import pandas as pd
from buster.documents_manager import DeepLakeDocumentsManager
from deeplake.core.vectorstore import VectorStore  # used by the repair block below
from langchain.embeddings.openai import OpenAIEmbeddings

# from openai import OpenAI  # only needed for the re-embedding repair below

DEEPLAKE_DATASET = os.getenv("DEEPLAKE_DATASET", "ai-tutor-dataset")
DEEPLAKE_ORG = os.getenv("DEEPLAKE_ORG", "towards_ai")

# Other sources previously ingested with this script:
# df1 = pd.read_csv("./data/jobs.csv", encoding="ISO-8859-1")  # or "latin1" / "cp1252"
# df2 = pd.read_csv("./data/hf_transformers.csv")
# df3 = pd.read_csv("./data/langchain_course.csv")
# df4 = pd.read_csv("./data/filtered_tai_v2.csv")
# df5 = pd.read_csv("./data/wiki.csv")  # , encoding="ISO-8859-1")
# df6 = pd.read_csv("./data/openai.csv")
df1 = pd.read_csv("./advanced_rag_course.csv")

# print(len(df1), len(df2), len(df3), len(df4), len(df5), len(df6))
print(len(df1))

dataset_path = f"hub://{DEEPLAKE_ORG}/{DEEPLAKE_DATASET}"
# dataset_path = DEEPLAKE_DATASET  # local fallback, used while the hub name was wrong

# Some sources keep the text in a different column; normalize and sanity-check it:
# df1["content"] = df1["cleaned_description"]
# print(np.sum(df1.content.isna()), len(df1))

dm = DeepLakeDocumentsManager(
    vector_store_path=dataset_path,
    overwrite=False,  # append to the existing dataset instead of recreating it
    required_columns=["url", "content", "source", "title"],
)

dm.batch_add(
    df=df1,
    batch_size=3000,
    min_time_interval=60,  # seconds between batches, to stay under rate limits
    num_workers=32,
    csv_overwrite=False,
)

# The other sources were ingested with the same call, one dataframe at a time:
# for df in (df2, df3, df4, df5, df6):
#     dm.batch_add(
#         df=df,
#         batch_size=3000,
#         min_time_interval=60,
#         num_workers=32,
#         csv_embeddings_filename="embeddings.csv",  # checkpoint embeddings to disk
#         csv_errors_filename="tmp.csv",
#         csv_overwrite=False,
#     )

# One-off repair: re-embed rows whose embedding came back empty.
# client = OpenAI()
# openai_embeddings = OpenAIEmbeddings()

# def get_embedding(text, model="text-embedding-ada-002"):
#     """Embed `text` with OpenAI and return the vector as a plain list."""
#     response = client.embeddings.create(input=[text], model=model)
#     embedding = response.data[0].embedding
#     if isinstance(embedding, np.ndarray):
#         embedding = embedding.tolist()
#     return embedding

# vs = VectorStore(
#     dataset_path,
#     runtime="compute_engine",
#     token=os.environ["ACTIVELOOP_TOKEN"],
# )

# Rows whose embedding tensor is empty:
# data = vs.search(query="select * where shape(embedding)[0] == 0")

# vs.update_embedding(
#     embedding_source_tensor="text",
#     query="select * where shape(embedding)[0] == 0",
#     exec_option="compute_engine",
#     embedding_function=get_embedding,
# )

# Verify the repair; this should now return no rows:
# data2 = vs.search(query="select * where shape(embedding)[0] == 0")
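
# Optional post-upload check -- a minimal sketch, not part of the original
# pipeline. It re-opens the dataset and runs one similarity search to confirm
# that rows and embeddings landed. This assumes deeplake's VectorStore.search()
# accepts an embedding_function, that the store uses the default "text" tensor
# (as the repair block above does), that OPENAI_API_KEY is set for
# OpenAIEmbeddings, and a hypothetical VERIFY_UPLOAD env flag to opt in.
if os.getenv("VERIFY_UPLOAD"):
    vs_check = VectorStore(dataset_path)
    vs_check.summary()  # per-tensor row counts; embeddings should cover every row
    results = vs_check.search(
        embedding_data="What does this course cover?",  # placeholder query
        embedding_function=OpenAIEmbeddings().embed_query,
        k=3,
    )
    print(results["text"])  # top-3 matching chunks; empty output means something went wrong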