Bazsalanszky committed on
Commit 5c5f47c
1 Parent(s): b89cf4a

Test commit

app.py CHANGED
@@ -1,7 +1,6 @@
 import gradio as gr
-from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
-import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
+from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
 from huggingface_hub import snapshot_download

 from src.about import (
@@ -20,9 +19,9 @@ from src.display.utils import (
     EVAL_TYPES,
     AutoEvalColumn,
     ModelType,
-    fields,
+    Precision,
     WeightType,
-    Precision
+    fields,
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
@@ -32,18 +31,29 @@ from src.submission.submit import add_new_eval
 def restart_space():
     API.restart_space(repo_id=REPO_ID)

+
 ### Space initialisation
 try:
     print(EVAL_REQUESTS_PATH)
     snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        repo_id=QUEUE_REPO,
+        local_dir=EVAL_REQUESTS_PATH,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        token=TOKEN,
     )
 except Exception:
     restart_space()
 try:
     print(EVAL_RESULTS_PATH)
     snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        repo_id=RESULTS_REPO,
+        local_dir=EVAL_RESULTS_PATH,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        token=TOKEN,
     )
 except Exception:
     restart_space()
@@ -57,6 +67,7 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)

+
 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
@@ -80,9 +91,7 @@ def init_leaderboard(dataframe):
                 max=150,
                 label="Select the number of parameters (B)",
             ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
+            ColumnFilter(AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True),
         ],
         bool_checkboxgroup_label="Hide models",
         interactive=False,
@@ -201,4 +210,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch()
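
Both download blocks follow the same pattern: mirror a Hub dataset repo into a local working directory and, if anything fails, restart the Space so it retries from a clean state. Below is a minimal standalone sketch of that call shape; the repo id and target directory are placeholders (the app itself takes QUEUE_REPO / EVAL_REQUESTS_PATH and RESULTS_REPO / EVAL_RESULTS_PATH from src.envs), and the error handling is simplified to a re-raise instead of API.restart_space.

import os

from huggingface_hub import snapshot_download


def sync_dataset(repo_id: str, local_dir: str, token: str | None = None) -> None:
    # Same keyword arguments as the two calls in app.py.
    snapshot_download(
        repo_id=repo_id,
        local_dir=local_dir,
        repo_type="dataset",
        tqdm_class=None,
        etag_timeout=30,
        token=token,
    )


if __name__ == "__main__":
    try:
        # Placeholder names for illustration only.
        sync_dataset("my-org/requests", "./eval-queue", token=os.environ.get("HF_TOKEN"))
    except Exception:
        # app.py calls API.restart_space(repo_id=REPO_ID) here instead of re-raising.
        raise
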
src/about.py CHANGED
@@ -1,6 +1,7 @@
 from dataclasses import dataclass
 from enum import Enum

+
 @dataclass
 class Task:
     benchmark: str
@@ -11,7 +12,7 @@ class Task:
 # Select your tasks here
 # ---------------------------------------------------
 class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     task0 = Task("category_mean", "history", "History")
     task1 = Task("category_mean", "grammar", "Grammar")
     task2 = Task("category_mean", "logic", "Logic")
@@ -19,9 +20,9 @@ class Tasks(Enum):
     task4 = Task("category_mean", "spelling", "Spelling")
     task5 = Task("category_mean", "Vocabulary", "Vocabulary")

-NUM_FEWSHOT = 0 # Change with your few shot
-# ---------------------------------------------------

+NUM_FEWSHOT = 0 # Change with your few shot
+# ---------------------------------------------------


 # Your leaderboard name
@@ -33,7 +34,7 @@ This leaderboard evaluates the performance of models on the HunBench benchmark.
 """

 # Which evaluations are you running? how can people reproduce what you have?
-LLM_BENCHMARKS_TEXT = f"""
+LLM_BENCHMARKS_TEXT = """
 ## How it works
 TODO
 ## Reproducibility
src/display/utils.py CHANGED
@@ -27,7 +27,7 @@ auto_eval_column_dict = []
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 # Scores
-auto_eval_column_dict.append(["mean_score", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
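
The dictionary key here is not just a label: it becomes an attribute of the generated AutoEvalColumn class, so renaming "mean_score" to "average" has to be mirrored wherever that attribute is read (see the matching change to the sort key in src/populate.py below). A minimal sketch of the mechanism, assuming the usual template pattern of building the class with dataclasses.make_dataclass and a simplified stand-in for ColumnContent (the project's real helper and field names may differ):

from dataclasses import dataclass, field, make_dataclass


@dataclass(frozen=True)
class ColumnContent:
    # Simplified stand-in for the project's ColumnContent; the real field names may differ.
    name: str
    type: str
    displayed_by_default: bool
    never_hidden: bool = False


auto_eval_column_dict = []
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])

# Each [attribute_name, type, default_instance] triple becomes a field of the generated class.
AutoEvalColumn = make_dataclass(
    "AutoEvalColumn",
    [(attr, typ, field(default=default)) for attr, typ, default in auto_eval_column_dict],
    frozen=True,
)

# Downstream code (e.g. the sort in src/populate.py) reads the display name through the attribute,
# which is why the dict key and the attribute access have to change together.
print(AutoEvalColumn.average.name)  # -> "Average ⬆️"
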
src/leaderboard/read_evals.py CHANGED
@@ -1,6 +1,5 @@
 import glob
 import json
-import math
 import os
 from dataclasses import dataclass

@@ -8,28 +7,28 @@ import dateutil
 import numpy as np

 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
+from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType
 from src.submission.check_validity import is_model_on_hub


 @dataclass
 class EvalResult:
-    """Represents one full evaluation. Built from a combination of the result and request file for a given run.
-    """
-    eval_name: str # org_model_precision (uid)
-    full_model: str # org/model (path on hub)
-    org: str
+    """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
+
+    eval_name: str  # org_model_precision (uid)
+    full_model: str  # org/model (path on hub)
+    org: str
     model: str
-    revision: str # commit hash, "" if main
+    revision: str  # commit hash, "" if main
     results: dict
     precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
-    weight_type: WeightType = WeightType.Original # Original or Adapter
-    architecture: str = "Unknown"
+    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
+    weight_type: WeightType = WeightType.Original  # Original or Adapter
+    architecture: str = "Unknown"
     license: str = "?"
     likes: int = 0
     num_params: int = 0
-    date: str = "" # submission date of request file
+    date: str = ""  # submission date of request file
     still_on_hub: bool = False

     @classmethod
@@ -85,10 +84,10 @@ class EvalResult:
             org=org,
             model=model,
             results=results,
-            precision=precision,
-            revision= config.get("model_sha", ""),
+            precision=precision,
+            revision=config.get("model_sha", ""),
             still_on_hub=still_on_hub,
-            architecture=architecture
+            architecture=architecture,
         )

     def update_with_request_file(self, requests_path):
@@ -105,7 +104,9 @@ class EvalResult:
             self.num_params = request.get("params", 0)
             self.date = request.get("submitted_time", "")
         except Exception:
-            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
+            print(
+                f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}"
+            )

     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
@@ -146,10 +147,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
     for tmp_request_file in request_files:
         with open(tmp_request_file, "r") as f:
             req_content = json.load(f)
-            if (
-                req_content["status"] in ["FINISHED"]
-                and req_content["precision"] == precision.split(".")[-1]
-            ):
+            if req_content["status"] in ["FINISHED"] and req_content["precision"] == precision.split(".")[-1]:
                 request_file = tmp_request_file
     return request_file

@@ -188,7 +186,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
     results = []
     for v in eval_results.values():
         try:
-            v.to_dict() # we test if the dict version is complete
+            v.to_dict()  # we test if the dict version is complete
             results.append(v)
         except KeyError:  # not all eval values present
             continue
src/populate.py CHANGED
@@ -14,7 +14,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     all_data_json = [v.to_dict() for v in raw_data]

     df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.mean_score.name], ascending=False)
+    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)

     # filter out if any of the benchmarks have not been produced
src/submission/check_validity.py CHANGED
@@ -1,8 +1,6 @@
 import json
 import os
-import re
 from collections import defaultdict
-from datetime import datetime, timedelta, timezone

 import huggingface_hub
 from huggingface_hub import ModelCard
@@ -10,6 +8,7 @@ from huggingface_hub.hf_api import ModelInfo
 from transformers import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer

+
 def check_model_card(repo_id: str) -> tuple[bool, str]:
     """Checks if the model card and license exist and have been filled"""
     try:
@@ -31,31 +30,38 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:

     return True, ""

-def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
+
+def is_model_on_hub(
+    model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False
+) -> tuple[bool, str]:
     """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
     try:
-        config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
+        config = AutoConfig.from_pretrained(
+            model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
+        )
         if test_tokenizer:
             try:
-                tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
+                tk = AutoTokenizer.from_pretrained(
+                    model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
+                )
             except ValueError as e:
+                return (False, f"uses a tokenizer which is not in a transformers release: {e}", None)
+            except Exception:
                 return (
                     False,
-                    f"uses a tokenizer which is not in a transformers release: {e}",
-                    None
+                    "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?",
+                    None,
                 )
-            except Exception as e:
-                return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
         return True, None, config

     except ValueError:
         return (
             False,
             "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
-            None
+            None,
         )

-    except Exception as e:
+    except Exception:
         return False, "was not found on hub!", None


@@ -70,10 +76,12 @@ def get_model_size(model_info: ModelInfo, precision: str):
         model_size = size_factor * model_size
     return model_size

+
 def get_model_arch(model_info: ModelInfo):
     """Gets the model architecture from the configuration"""
     return model_info.config.get("architectures", "Unknown")

+
 def already_submitted_models(requested_models_dir: str) -> set[str]:
     """Gather a list of already submitted models to avoid duplicates"""
     depth = 1
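
As the reshaped handlers make explicit, is_model_on_hub always returns a three-element tuple (ok, message, config), with None in the config slot on failure; callers such as add_new_eval in src/submission/submit.py unpack all three values. A small usage sketch, assuming the repository root is on the Python path and using a public model id purely as an example:

import os

from src.submission.check_validity import is_model_on_hub

# "gpt2" is only an example; any public Hub model id behaves the same way.
ok, message, config = is_model_on_hub(
    model_name="gpt2",
    revision="main",
    token=os.environ.get("HF_TOKEN"),
    trust_remote_code=False,
    test_tokenizer=True,
)

if not ok:
    # `message` is a fragment meant to be appended after the model name in an error string.
    print(f"gpt2 {message}")
else:
    # On success the third element is the loaded AutoConfig instance.
    print(type(config).__name__)
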
src/submission/submit.py CHANGED
@@ -3,17 +3,13 @@ import os
 from datetime import datetime, timezone

 from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
-from src.submission.check_validity import (
-    already_submitted_models,
-    check_model_card,
-    get_model_size,
-    is_model_on_hub,
-)
+from src.envs import API, EVAL_REQUESTS_PATH, QUEUE_REPO, TOKEN
+from src.submission.check_validity import already_submitted_models, check_model_card, get_model_size, is_model_on_hub

 REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None

+
 def add_new_eval(
     model: str,
     base_model: str,
@@ -45,7 +41,9 @@ def add_new_eval(

     # Is the model on the hub?
     if weight_type in ["Delta", "Adapter"]:
-        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
+        base_model_on_hub, error, _ = is_model_on_hub(
+            model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True
+        )
         if not base_model_on_hub:
             return styled_error(f'Base model "{base_model}" {error}')