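"""Upload course documents from a CSV into an Activeloop Deep Lake vector
store using Buster's DeepLakeDocumentsManager.

The commented-out sections preserve earlier data sources and a one-off
repair pass for rows whose embeddings failed to compute.
"""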
import os

import pandas as pd
from buster.documents_manager import DeepLakeDocumentsManager

# These imports are only needed by the optional, commented-out
# embedding-repair block at the bottom of the script.
# import numpy as np
# from deeplake.core.vectorstore import VectorStore
# from openai import OpenAI

# Target dataset, configurable via environment variables
# (e.g. DEEPLAKE_ORG=towards_ai DEEPLAKE_DATASET=ai-tutor-dataset).
DEEPLAKE_DATASET = os.getenv("DEEPLAKE_DATASET", "ai-tutor-dataset")
DEEPLAKE_ORG = os.getenv("DEEPLAKE_ORG", "towards_ai")
# Earlier data sources, kept for reference:
# df1 = pd.read_csv("./data/jobs.csv", encoding="ISO-8859-1")  # or "latin1" / "cp1252"
# df2 = pd.read_csv("./data/hf_transformers.csv")
# df3 = pd.read_csv("./data/langchain_course.csv")
# df4 = pd.read_csv("./data/filtered_tai_v2.csv")
# df5 = pd.read_csv("./data/wiki.csv")  # , encoding="ISO-8859-1")
# df6 = pd.read_csv("./data/openai.csv")

df1 = pd.read_csv("./advanced_rag_course.csv")
print(len(df1))
# Path to the hosted Deep Lake dataset; use the bare dataset name instead
# for a local dataset.
dataset_path = f"hub://{DEEPLAKE_ORG}/{DEEPLAKE_DATASET}"
# dataset_path = f"{DEEPLAKE_DATASET}"

# The jobs dataset stores its text under 'cleaned_description' rather than
# 'content'; uncomment to rename the column before uploading.
# df1["content"] = df1["cleaned_description"]
# print(np.sum(df1.content.isna()), len(df1))
# Open (or create) the vector store; overwrite=False appends to any existing
# rows, and required_columns lists the dataframe columns that must be present.
dm = DeepLakeDocumentsManager(
    vector_store_path=dataset_path,
    overwrite=False,
    required_columns=["url", "content", "source", "title"],
)

# Embed and upload the documents in batches. min_time_interval spaces batches
# at least 60 seconds apart, which helps stay under embedding-API rate limits.
dm.batch_add(
    df=df1,
    batch_size=3000,
    min_time_interval=60,
    num_workers=32,
    csv_overwrite=False,
)
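# Hedged sanity check (assumes the standard Deep Lake VectorStore API):
# re-open the dataset read-only and print its tensor summary to confirm the
# new rows landed. Uncomment to run after the upload completes.
# from deeplake.core.vectorstore import VectorStore
# vs_check = VectorStore(dataset_path, read_only=True)
# vs_check.summary()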
# The remaining dataframes (df2 ... df7) were uploaded the same way, each
# with its embeddings and errors logged to CSV, e.g.:
# dm.batch_add(
#     df=df2,
#     batch_size=3000,
#     min_time_interval=60,
#     num_workers=32,
#     csv_embeddings_filename="embeddings.csv",
#     csv_errors_filename="tmp.csv",
#     csv_overwrite=False,
# )
# --- Optional repair pass (commented out): re-embed rows whose embedding
# --- failed and was stored with shape 0.
# client = OpenAI()
#
# def get_embedding(text, model="text-embedding-ada-002"):
#     """Embed a single text with OpenAI and return it as a plain list."""
#     response = client.embeddings.create(input=[text], model=model)
#     embedding = response.data[0].embedding
#     if isinstance(embedding, np.ndarray):
#         embedding = embedding.tolist()
#     return embedding
#
# vs = VectorStore(
#     dataset_path,
#     runtime="compute_engine",
#     token=os.environ["ACTIVELOOP_TOKEN"],
# )
#
# # Find rows with an empty embedding, re-embed them from the "text" tensor,
# # then re-run the query to verify none remain.
# data = vs.search(query="select * where shape(embedding)[0] == 0")
# vs.update_embedding(
#     embedding_source_tensor="text",
#     query="select * where shape(embedding)[0] == 0",
#     exec_option="compute_engine",
#     embedding_function=get_embedding,
# )
# data2 = vs.search(query="select * where shape(embedding)[0] == 0")
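# Hedged sketch: a quick semantic-search smoke test against the uploaded
# dataset, reusing the get_embedding helper and vs handle from the repair
# block above (the query string is illustrative). Uncomment to run.
# results = vs.search(
#     embedding_data="What is retrieval-augmented generation?",
#     embedding_function=get_embedding,
#     k=3,
# )
# print(results["text"])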