Spaces:
Running
Running
File size: 3,367 Bytes
0b9f9a6 a5371c1 0b9f9a6 a5371c1 0b9f9a6 a5371c1 0b9f9a6 a5371c1 0b9f9a6 a5371c1 0b9f9a6 a5371c1 0b9f9a6 a5371c1 0b9f9a6 a5371c1 0b9f9a6 a5371c1 0b9f9a6 a5371c1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
import pandas as pd
import time
import os
from buster.documents_manager import DeepLakeDocumentsManager
from deeplake.core.vectorstore import VectorStore
from langchain.embeddings.openai import OpenAIEmbeddings
# from openai import OpenAI
# Upload script: loads the per-source CSV files and pushes them, one source at
# a time, into a Deep Lake vector store through buster's documents manager.

# Target dataset coordinates; each can be overridden via an environment
# variable of the same name, otherwise the defaults below are used.
DEEPLAKE_DATASET = os.getenv("DEEPLAKE_DATASET", "ai-tutor-dataset")
DEEPLAKE_ORG = os.getenv("DEEPLAKE_ORG", "towards_ai")

# One CSV per knowledge source, read relative to the current working directory.
df1 = pd.read_csv("./data/llm_course.csv")
df2 = pd.read_csv("./data/hf_transformers.csv")
df3 = pd.read_csv("./data/langchain_course.csv")
df4 = pd.read_csv("./data/filtered_tai_v2.csv")
# NOTE: an explicit encoding="ISO-8859-1" argument was left disabled for this
# file; the pandas default encoding is used instead.
df5 = pd.read_csv("./data/wiki.csv")
df6 = pd.read_csv("./data/openai.csv")
df7 = pd.read_csv("./data/activeloop.csv")

# Upload order matters only in that it mirrors the load order above.
source_frames = (df1, df2, df3, df4, df5, df6, df7)

# Row count of every source, space-separated, as a sanity check before upload.
print(*(len(frame) for frame in source_frames))

dataset_path = f"hub://{DEEPLAKE_ORG}/{DEEPLAKE_DATASET}"

# NOTE(review): overwrite=True is passed, so an existing dataset at
# dataset_path is presumably replaced on every run -- confirm before pointing
# this at a dataset you want to keep.
dm = DeepLakeDocumentsManager(
    vector_store_path=dataset_path,
    overwrite=True,
    required_columns=["url", "content", "source", "title"],
)

# Options shared by every upload. Keeping them in one place guarantees all
# sources are ingested with identical settings.
# NOTE(review): with csv_overwrite=False the same "embeddings.csv" / "tmp.csv"
# files are reused across all sources -- presumably appended to; verify
# against the buster documentation.
batch_add_options = {
    "batch_size": 3000,
    "min_time_interval": 60,
    "num_workers": 32,
    "csv_embeddings_filename": "embeddings.csv",
    "csv_errors_filename": "tmp.csv",
    "csv_overwrite": False,
}

# Push each source into the vector store, one batch_add call per DataFrame.
for frame in source_frames:
    dm.batch_add(df=frame, **batch_add_options)
# client = OpenAI()
# openai_embeddings = OpenAIEmbeddings()
# def get_embedding(text, model="text-embedding-ada-002"):
# # Call to OpenAI's API to create the embedding
# response = client.embeddings.create(input=[text], model=model)
# # Extract the embedding data from the response
# embedding = response.data[0].embedding
# # Convert the ndarray to a list
# if isinstance(embedding, np.ndarray):
# embedding = embedding.tolist()
# return embedding
# vs = VectorStore(
# dataset_path,
# runtime='compute_engine',
# token=os.environ['ACTIVELOOP_TOKEN']
# )
# data = vs.search(query = "select * where shape(embedding)[0] == 0")
# vs.update_embedding(embedding_source_tensor = "text",
# query = "select * where shape(embedding)[0] == 0",
# exec_option = "compute_engine",
# embedding_function=get_embedding)
# data2 = vs.search(query = "select * where shape(embedding)[0] == 0")
|