import logging
import os

import pandas as pd

from buster.documents_manager import DeepLakeDocumentsManager
from buster.llm_utils import get_openai_embedding_constructor

# Raise the `httpx` log level to WARNING to suppress its per-request INFO logs
logging.getLogger("httpx").setLevel(logging.WARNING)

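# Embedding function with client-side retries: transient OpenAI errors
# (rate limits, timeouts) are retried up to 10 times before failing.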
openai_embedding_fn = get_openai_embedding_constructor(
    client_kwargs={"max_retries": 10}
)

# Target Deep Lake dataset; override via environment variables.
DEEPLAKE_DATASET = os.getenv("DEEPLAKE_DATASET", "ai-tutor-dataset")
DEEPLAKE_ORG = os.getenv("DEEPLAKE_ORG", "towards_ai")

# Source CSVs, one per documentation source. If a file fails to decode as
# UTF-8, retry with encoding="latin1", "cp1252", or "ISO-8859-1".
csv_paths = [
    "./data/langchain.csv",
    "./data/hf_transformers.csv",
    "./data/langchain_course.csv",
    "./data/filtered_tai_v2.csv",
    "./data/wiki.csv",
    "./data/openai.csv",
    "./data/activeloop.csv",
    "./data/llm_course.csv",
]
dfs = [pd.read_csv(path) for path in csv_paths]

print("Number of samples:", ", ".join(str(len(df)) for df in dfs))

dataset_path = f"hub://{DEEPLAKE_ORG}/{DEEPLAKE_DATASET}"

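# overwrite=True drops any existing dataset at this path and rebuilds it.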
dm = DeepLakeDocumentsManager(
    vector_store_path=dataset_path,
    overwrite=True,
)

# Upload every dataframe with the same settings. Each call also writes its
# computed embeddings to the shared embeddings.csv (csv_overwrite=False keeps
# earlier rows), so an interrupted run does not have to re-embed everything.
for df in dfs:
    dm.batch_add(
        df=df,
        batch_size=3000,
        min_time_interval=60,
        num_workers=32,
        embedding_fn=openai_embedding_fn,
        csv_filename="embeddings.csv",
        csv_overwrite=False,
    )
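
# Optional sanity check once the upload finishes: open the dataset read-only
# with Deep Lake's VectorStore and run a similarity search against it. This is
# a minimal sketch, not part of the ingestion pipeline -- the query string is
# arbitrary, and the exact keys in the result dict ("text", "score", ...)
# depend on the installed deeplake version and the dataset's tensor layout.
from deeplake.core.vectorstore import VectorStore

vs = VectorStore(path=dataset_path, read_only=True)
results = vs.search(
    embedding=openai_embedding_fn("What is a LangChain retriever?"),
    k=4,
)
print(results["text"])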