jerpint committed
Commit fc1544a
1 Parent(s): 7710388

process new dataset (#9)

Files changed (2):
  1. cfg.py +8 -5
  2. embed_documents.py +7 -3
cfg.py CHANGED
@@ -21,18 +21,21 @@ PASSWORD = os.getenv("BUSTER_PASSWORD")
 
 HUB_TOKEN = os.getenv("HUB_TOKEN")
 REPO_ID = os.getenv("HF_DATASET")
-HUB_DB_FILE = "deeplake_store.zip"
+# HUB_DB_FILE = "deeplake_store.zip"
 
-logger.info(f"Downloading {HUB_DB_FILE} from hub...")
+DEEPLAKE_DATASET = os.getenv("DEEPLAKE_DATASET", "wiki_tai_langchain")
+ZIP_FILE = DEEPLAKE_DATASET + ".zip"
+
+logger.info(f"Downloading {ZIP_FILE} from hub...")
 hf_hub_download(
     repo_id=REPO_ID,
     repo_type="dataset",
-    filename=HUB_DB_FILE,
+    filename=ZIP_FILE,
     token=HUB_TOKEN,
     local_dir=".",
 )
 
-extract_zip(zip_file_path=HUB_DB_FILE, output_path="deeplake_store")
+extract_zip(zip_file_path=ZIP_FILE, output_path=DEEPLAKE_DATASET)
 
 example_questions = [
     "What is the LLama model?",
@@ -71,7 +74,7 @@ A user will now submit a question. Respond 'true' if it is valid, respond 'false
         },
     },
     retriever_cfg={
-        "path": "./deeplake_store",
+        "path": f"./{DEEPLAKE_DATASET}",
         "top_k": 3,
         "thresh": 0.7,
         "max_tokens": 2000,
embed_documents.py CHANGED
@@ -2,13 +2,17 @@ import pandas as pd
 from buster.documents_manager import DeepLakeDocumentsManager
 
 if __name__ == "__main__":
-    vector_store_path = "deeplake_store"
-    chunk_file = "data/wiki_and_tai.csv"
+    vector_store_path = "wiki_tai_langchain"
+    chunk_file = "./data/wiki_tai_langchain.csv"
     overwrite = True
 
     df = pd.read_csv(chunk_file)
 
-    dm = DeepLakeDocumentsManager(vector_store_path, overwrite=overwrite)
+    print(f"before drop: {len(df)}")
+    df = df.dropna()
+    print(f"after drop: {len(df)}")
+
+    dm = DeepLakeDocumentsManager(vector_store_path, overwrite=overwrite, required_columns=["url", "source", "content", "title"])
     dm.batch_add(df)
     zipped_file_path = dm.to_zip()
     print(f"Contents zipped to: {zipped_file_path}")