process new dataset (#9)

- cfg.py +8 -5
- embed_documents.py +7 -3
cfg.py CHANGED

@@ -21,18 +21,21 @@ PASSWORD = os.getenv("BUSTER_PASSWORD")
 
 HUB_TOKEN = os.getenv("HUB_TOKEN")
 REPO_ID = os.getenv("HF_DATASET")
-HUB_DB_FILE = "deeplake_store.zip"
+# HUB_DB_FILE = "deeplake_store.zip"
 
-logger.info(f"Downloading {HUB_DB_FILE} from hub...")
+DEEPLAKE_DATASET = os.getenv("DEEPLAKE_DATASET", "wiki_tai_langchain")
+ZIP_FILE = DEEPLAKE_DATASET + ".zip"
+
+logger.info(f"Downloading {ZIP_FILE} from hub...")
 hf_hub_download(
     repo_id=REPO_ID,
     repo_type="dataset",
-    filename=HUB_DB_FILE,
+    filename=ZIP_FILE,
     token=HUB_TOKEN,
     local_dir=".",
 )
 
-extract_zip(zip_file_path=HUB_DB_FILE)
+extract_zip(zip_file_path=ZIP_FILE, output_path=DEEPLAKE_DATASET)
 
 example_questions = [
     "What is the LLama model?",
@@ -71,7 +74,7 @@ A user will now submit a question. Respond 'true' if it is valid, respond 'false
     },
 },
 retriever_cfg={
-    "path": "./deeplake_store",
+    "path": f"./{DEEPLAKE_DATASET}",
     "top_k": 3,
     "thresh": 0.7,
     "max_tokens": 2000,
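The net effect of the cfg.py change is that everything is keyed off a single DEEPLAKE_DATASET environment variable: the archive fetched from the Hub is <dataset>.zip and the retriever reads from ./<dataset>, so switching datasets requires no code edit. Below is a minimal, self-contained sketch of that download-and-extract flow, assuming only what the diff shows; buster's extract_zip helper is stubbed with the stdlib zipfile module, since its real implementation is not part of this change, and the repo id value is illustrative.

import os
import zipfile

from huggingface_hub import hf_hub_download

# Same convention as the diff: one env var drives both the zip name and the store path.
DEEPLAKE_DATASET = os.getenv("DEEPLAKE_DATASET", "wiki_tai_langchain")
ZIP_FILE = DEEPLAKE_DATASET + ".zip"
REPO_ID = os.getenv("HF_DATASET")  # dataset repo id, e.g. "user/my-dataset" (illustrative)
HUB_TOKEN = os.getenv("HUB_TOKEN")

def extract_zip(zip_file_path: str, output_path: str) -> None:
    # Stand-in for buster's helper: unpack the archive into output_path.
    with zipfile.ZipFile(zip_file_path) as zf:
        zf.extractall(output_path)

# Fetch <dataset>.zip from the dataset repo into the working directory,
hf_hub_download(
    repo_id=REPO_ID,
    repo_type="dataset",
    filename=ZIP_FILE,
    token=HUB_TOKEN,
    local_dir=".",
)

# then unpack it so retriever_cfg["path"] (f"./{DEEPLAKE_DATASET}") resolves.
extract_zip(zip_file_path=ZIP_FILE, output_path=DEEPLAKE_DATASET)

Deriving ZIP_FILE and the retriever path from the same variable is what keeps the downloaded archive, the extracted folder, and retriever_cfg["path"] in sync.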
embed_documents.py CHANGED

@@ -2,13 +2,17 @@ import pandas as pd
 from buster.documents_manager import DeepLakeDocumentsManager
 
 if __name__ == "__main__":
-    vector_store_path = "
-    chunk_file = "data/
+    vector_store_path = "wiki_tai_langchain"
+    chunk_file = "./data/wiki_tai_langchain.csv"
     overwrite = True
 
     df = pd.read_csv(chunk_file)
 
-    dm = DeepLakeDocumentsManager(vector_store_path, overwrite=overwrite)
+    print(f"before drop: {len(df)}")
+    df = df.dropna()
+    print(f"after drop: {len(df)}")
+
+    dm = DeepLakeDocumentsManager(vector_store_path, overwrite=overwrite, required_columns=["url", "source", "content", "title"])
     dm.batch_add(df)
     zipped_file_path = dm.to_zip()
     print(f"Contents zipped to: {zipped_file_path}")
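embed_documents.py now drops every row containing a missing value before indexing, since rows with NaNs would otherwise flow into the vector store. A slightly stricter variant is sketched below: it limits dropna to the columns the manager declares as required, so a NaN in an unrelated optional column does not discard a usable chunk. The DeepLakeDocumentsManager calls mirror the diff; the CSV path and printed counts are illustrative only.

import pandas as pd

from buster.documents_manager import DeepLakeDocumentsManager

REQUIRED = ["url", "source", "content", "title"]

if __name__ == "__main__":
    df = pd.read_csv("./data/wiki_tai_langchain.csv")

    # Drop rows only when a *required* column is missing, instead of a blanket dropna().
    before = len(df)
    df = df.dropna(subset=REQUIRED)
    print(f"dropped {before - len(df)} incomplete rows, {len(df)} remain")

    dm = DeepLakeDocumentsManager(
        "wiki_tai_langchain",
        overwrite=True,
        required_columns=REQUIRED,
    )
    dm.batch_add(df)
    print(f"Contents zipped to: {dm.to_zip()}")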