jerpint committed
Commit
a3a378d
1 Parent(s): 51727c4

Update dataset link


* point to dataset space to download data;

* update sample questions

Files changed (3)
  1. cfg.py +15 -12
  2. embed_documents.py +9 -11
  3. gradio_app.py +1 -5
cfg.py CHANGED
@@ -20,23 +20,26 @@ USERNAME = os.getenv("BUSTER_USERNAME")
 PASSWORD = os.getenv("BUSTER_PASSWORD")
 
 HUB_TOKEN = os.getenv("HUB_TOKEN")
-REPO_ID = "jerpint/towardsai-buster-data"
+REPO_ID = os.getenv("HF_DATASET")
 HUB_DB_FILE = "deeplake_store.zip"
 
-if os.path.exists(HUB_DB_FILE):
-    logger.info(f"Using local {HUB_DB_FILE}...")
-else:
-    logger.info(f"Downloading {HUB_DB_FILE} from hub...")
-    hf_hub_download(
-        repo_id=REPO_ID,
-        repo_type="dataset",
-        filename=HUB_DB_FILE,
-        token=HUB_TOKEN,
-        local_dir=".",
-    )
+logger.info(f"Downloading {HUB_DB_FILE} from hub...")
+hf_hub_download(
+    repo_id=REPO_ID,
+    repo_type="dataset",
+    filename=HUB_DB_FILE,
+    token=HUB_TOKEN,
+    local_dir=".",
+)
 
 extract_zip(zip_file_path=HUB_DB_FILE, output_path="deeplake_store")
 
+example_questions = [
+    "What's the best way to get a job in AI?",
+    "What is prompt engineering?",
+    "What is generative AI?",
+]
+
 
 buster_cfg = BusterConfig(
     validator_cfg={
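With this change, cfg.py no longer hard-codes the dataset repo or falls back to a local copy: it reads the repo id from the HF_DATASET environment variable and always pulls deeplake_store.zip from the Hub before extracting it. Below is a minimal, self-contained sketch of that download step, not the app's exact code (the real cfg.py uses the repo's own extract_zip helper from utils); it assumes HF_DATASET names a dataset repo containing deeplake_store.zip and HUB_TOKEN holds a token with read access.

# Illustrative sketch of the new download path in cfg.py.
import os
import zipfile

from huggingface_hub import hf_hub_download

REPO_ID = os.getenv("HF_DATASET")
HUB_TOKEN = os.getenv("HUB_TOKEN")
HUB_DB_FILE = "deeplake_store.zip"

if REPO_ID is None:
    raise ValueError("Set HF_DATASET to the dataset repo id hosting deeplake_store.zip")

local_path = hf_hub_download(
    repo_id=REPO_ID,
    repo_type="dataset",
    filename=HUB_DB_FILE,
    token=HUB_TOKEN,
    local_dir=".",
)

# Stand-in for the repo's extract_zip() helper: unpack the store next to the app.
with zipfile.ZipFile(local_path) as zf:
    zf.extractall("deeplake_store")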
embed_documents.py CHANGED
@@ -1,23 +1,21 @@
 import openai
 import pandas as pd
-from buster.documents import DeepLakeDocumentsManager
+from buster.documents_manager import DeepLakeDocumentsManager
 
 from utils import zip_contents
 
 
-def read_csv(filename: str):
-    """Assumes a pre-chunked csv file is provided with expected columns."""
-    df = pd.read_csv(filename)
-    for col in ["url", "source", "title", "content"]:
-        assert col in df.columns
-    return df
-
-
 if __name__ == "__main__":
     vector_store_path = "deeplake_store"
-    chunk_file = "data/outputs.csv"
+    chunk_file = "data/output.csv"
     overwrite = True
-    df = read_csv(chunk_file)
+
+    df = pd.read_csv(chunk_file)
+
+    # some pre-processing based on the latest file provided
+    df["url"] = df["source"]
+    df["source"] = "towardsai_blog"
+    df = df.dropna()
 
     dm = DeepLakeDocumentsManager(vector_store_path, overwrite=overwrite)
     dm.add(df)
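The removed read_csv() helper asserted that the chunk file had url, source, title, and content columns; the new embed_documents.py loads the CSV directly and patches it in place (url copied from source, source set to "towardsai_blog", NaN rows dropped) before handing it to DeepLakeDocumentsManager. A lightweight column re-check along the lines of the old assertion could still be run after the preprocessing; the sketch below is an illustration of that idea only, using the column names from the removed helper, and is not part of this commit.

# Illustrative only: re-check the columns the old read_csv() asserted on.
import pandas as pd

chunk_file = "data/output.csv"
df = pd.read_csv(chunk_file)

# Same preprocessing as the new embed_documents.py
df["url"] = df["source"]
df["source"] = "towardsai_blog"
df = df.dropna()

expected = ["url", "source", "title", "content"]
missing = [col for col in expected if col not in df.columns]
if missing:
    raise ValueError(f"{chunk_file} is missing expected columns: {missing}")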
gradio_app.py CHANGED
@@ -90,11 +90,7 @@ with block:
     submit = gr.Button(value="Send", variant="secondary")
 
     examples = gr.Examples(
-        examples=[
-            "What's a genetic algorithm?",
-            "What's PCA? What is it used for?",
-            "How do I deal with noisy data?",
-        ],
+        examples=cfg.example_questions,
         inputs=question,
     )
 
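gradio_app.py now takes its sample questions from cfg.example_questions instead of a hard-coded list, so the dataset and the example prompts are configured in one place. A minimal standalone sketch of how gr.Examples wires such a list of strings to a textbox is shown below; the variable names follow the snippet above, but the surrounding layout is illustrative rather than the app's full UI.

# Illustrative sketch: wiring cfg.example_questions into a Gradio textbox.
import gradio as gr

import cfg  # exposes example_questions after this commit

with gr.Blocks() as block:
    question = gr.Textbox(label="Ask a question")
    submit = gr.Button(value="Send", variant="secondary")

    # Clicking an example fills the textbox with that question.
    examples = gr.Examples(
        examples=cfg.example_questions,
        inputs=question,
    )

block.launch()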