yichao committed on
Commit 9c9b7f5 • 1 Parent(s): d04ff35

update mj-bench

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. app.py +11 -2
  2. evals/.gitattributes +55 -0
  3. evals/README.md +6 -0
  4. evals/{mjbench → mjbench-results}/detailed-results/AestheticsPredictor.json +0 -0
  5. evals/{mjbench → mjbench-results}/detailed-results/BLIP-v2.json +0 -0
  6. evals/{mjbench → mjbench-results}/detailed-results/CLIP-v2.json +0 -0
  7. evals/{mjbench → mjbench-results}/detailed-results/Claude 3 Opus.json +0 -0
  8. evals/{mjbench → mjbench-results}/detailed-results/GPT-4-vision.json +0 -0
  9. evals/{mjbench → mjbench-results}/detailed-results/GPT-4o.json +0 -0
  10. evals/{mjbench → mjbench-results}/detailed-results/Gemini Ultra.json +0 -0
  11. evals/{mjbench → mjbench-results}/detailed-results/HPS-v2.1.json +0 -0
  12. evals/{mjbench → mjbench-results}/detailed-results/Idefics2-8b.json +0 -0
  13. evals/{mjbench → mjbench-results}/detailed-results/ImageReward.json +0 -0
  14. evals/{mjbench → mjbench-results}/detailed-results/Instructblip-7b.json +0 -0
  15. evals/{mjbench → mjbench-results}/detailed-results/InternVL-Chat-V1-5.json +0 -0
  16. evals/{mjbench → mjbench-results}/detailed-results/LLaVA-1.5-13b.json +0 -0
  17. evals/{mjbench → mjbench-results}/detailed-results/LLaVA-1.5-7b.json +0 -0
  18. evals/{mjbench → mjbench-results}/detailed-results/LLaVA-NeXT-mistral-7b.json +0 -0
  19. evals/{mjbench → mjbench-results}/detailed-results/LLaVA-NeXT-vicuna-13b.json +0 -0
  20. evals/{mjbench → mjbench-results}/detailed-results/MiniGPT4-v2.json +0 -0
  21. evals/{mjbench → mjbench-results}/detailed-results/PickScore-v1.json +0 -0
  22. evals/{mjbench → mjbench-results}/detailed-results/Prometheus-Vision-13b.json +0 -0
  23. evals/{mjbench → mjbench-results}/detailed-results/Prometheus-Vision-7b.json +0 -0
  24. evals/{mjbench → mjbench-results}/detailed-results/Qwen-VL-Chat.json +0 -0
  25. evals/{mjbench → mjbench-results}/overall-results/AestheticsPredictor.json +0 -0
  26. evals/{mjbench → mjbench-results}/overall-results/BLIP-v2.json +0 -0
  27. evals/{mjbench → mjbench-results}/overall-results/CLIP-v2.json +0 -0
  28. evals/{mjbench → mjbench-results}/overall-results/Claude 3 Opus.json +0 -0
  29. evals/{mjbench → mjbench-results}/overall-results/GPT-4-vision.json +0 -0
  30. evals/{mjbench → mjbench-results}/overall-results/GPT-4o.json +0 -0
  31. evals/{mjbench → mjbench-results}/overall-results/Gemini Ultra.json +0 -0
  32. evals/{mjbench → mjbench-results}/overall-results/HPS-v2.1.json +0 -0
  33. evals/{mjbench → mjbench-results}/overall-results/Idefics2-8b.json +0 -0
  34. evals/{mjbench → mjbench-results}/overall-results/ImageReward.json +0 -0
  35. evals/{mjbench → mjbench-results}/overall-results/Instructblip-7b.json +0 -0
  36. evals/{mjbench → mjbench-results}/overall-results/InternVL-Chat-V1-5.json +0 -0
  37. evals/{mjbench → mjbench-results}/overall-results/LLaVA-1.5-13b.json +0 -0
  38. evals/{mjbench → mjbench-results}/overall-results/LLaVA-1.5-7b.json +0 -0
  39. evals/{mjbench → mjbench-results}/overall-results/LLaVA-NeXT-mistral-7b.json +0 -0
  40. evals/{mjbench → mjbench-results}/overall-results/LLaVA-NeXT-vicuna-13b.json +0 -0
  41. evals/{mjbench → mjbench-results}/overall-results/MiniGPT4-v2.json +0 -0
  42. evals/{mjbench → mjbench-results}/overall-results/PickScore-v1.json +0 -0
  43. evals/{mjbench → mjbench-results}/overall-results/Prometheus-Vision-13b.json +0 -0
  44. evals/{mjbench → mjbench-results}/overall-results/Prometheus-Vision-7b.json +0 -0
  45. evals/{mjbench → mjbench-results}/overall-results/Qwen-VL-Chat.json +0 -0
  46. evals/mjbench/latex_reults/alignment_narrative.tex +0 -37
  47. evals/mjbench/latex_reults/alignment_number_10.tex +0 -29
  48. evals/mjbench/latex_reults/alignment_number_5.tex +0 -35
  49. evals/mjbench/latex_reults/artifact_narrative.tex +0 -29
  50. evals/mjbench/latex_reults/artifact_number_10.tex +0 -38
app.py CHANGED
@@ -35,6 +35,14 @@ from src.display.utils import (
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 
+try:
+    print(EVAL_RESULTS_PATH)
+    snapshot_download(
+        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+    )
+except Exception:
+    restart_space()
+
 SUBSET_COUNTS = {
     "Alignment-Object": 250,
     "Alignment-Attribute": 229,
@@ -71,6 +79,7 @@ PERSPECTIVE_COUNTS= {
 META_DATA = ['Model', 'Model Type', 'Input Type', 'Organization']
 
 
+
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
@@ -192,12 +201,12 @@ def avg_all_perspective(orig_df: pd.DataFrame, columns_name: list, meta_data=MET
     return new_df
 
 
-results_path = Path("./evals/mjbench/eval-results")
+results_path = Path(f"{EVAL_RESULTS_PATH}/mjbench-results/detailed-results")
 orig_df = get_leaderboard_results(results_path)
 colmuns_name = list(SUBSET_COUNTS.keys())
 detailed_df = avg_all_subset(orig_df, colmuns_name).round(2)
 
-results_path = Path("./evals/mjbench/overall-results")
+results_path = Path(f"{EVAL_RESULTS_PATH}/mjbench-results/overall-results")
 orig_df = get_leaderboard_results(results_path)
 colmuns_name = list(PERSPECTIVE_COUNTS.keys())
 perspective_df = avg_all_perspective(orig_df, colmuns_name).round(2)
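
Editor's note: the substance of this change is that app.py no longer reads results committed alongside the Space; it syncs the results dataset from the Hub at startup and restarts the Space if the sync fails. A minimal standalone sketch of that flow, with illustrative stand-ins for the real values imported from src.envs:

from pathlib import Path

from huggingface_hub import HfApi, snapshot_download

RESULTS_REPO = "org/mjbench-results"       # assumption: illustrative dataset id
EVAL_RESULTS_PATH = "eval-results"         # assumption: local sync directory
REPO_ID = "org/mjbench-leaderboard"        # assumption: the Space itself
TOKEN = None                               # HF token, if the dataset is gated

API = HfApi()

def restart_space():
    # Restarting the Space retries the whole startup, download included.
    API.restart_space(repo_id=REPO_ID)

try:
    # Mirror the results dataset into the local directory.
    snapshot_download(
        repo_id=RESULTS_REPO,
        local_dir=EVAL_RESULTS_PATH,
        repo_type="dataset",
        tqdm_class=None,
        etag_timeout=30,
        token=TOKEN,
    )
except Exception:
    restart_space()

# The leaderboard then reads from the synced copy rather than ./evals:
results_path = Path(f"{EVAL_RESULTS_PATH}/mjbench-results/detailed-results")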
evals/.gitattributes ADDED
@@ -0,0 +1,55 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.lz4 filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+# Audio files - uncompressed
+*.pcm filter=lfs diff=lfs merge=lfs -text
+*.sam filter=lfs diff=lfs merge=lfs -text
+*.raw filter=lfs diff=lfs merge=lfs -text
+# Audio files - compressed
+*.aac filter=lfs diff=lfs merge=lfs -text
+*.flac filter=lfs diff=lfs merge=lfs -text
+*.mp3 filter=lfs diff=lfs merge=lfs -text
+*.ogg filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+# Image files - uncompressed
+*.bmp filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.tiff filter=lfs diff=lfs merge=lfs -text
+# Image files - compressed
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.webp filter=lfs diff=lfs merge=lfs -text
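
Editor's note: each attribute line above routes matching files through Git LFS (a pointer stays in Git, the payload goes to LFS storage). As a quick sanity check of which paths a pattern set captures, a small Python sketch using fnmatch-style matching (an approximation of gitattributes semantics, not an exact reimplementation):

from fnmatch import fnmatch
from pathlib import Path

# Excerpt of the patterns above; extend as needed.
LFS_PATTERNS = ["*.7z", "*.parquet", "*.png", "*.safetensors", "saved_model/**/*"]

def is_lfs_tracked(path: str, patterns=LFS_PATTERNS) -> bool:
    name = Path(path).name
    # gitattributes matches bare patterns against the basename and
    # patterns containing "/" against the full path; fnmatch's "*"
    # also crosses "/" boundaries, hence "approximation".
    return any(fnmatch(path, p) if "/" in p else fnmatch(name, p)
               for p in patterns)

print(is_lfs_tracked("evals/figs/overview.png"))  # True
print(is_lfs_tracked("evals/README.md"))          # False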
evals/README.md ADDED
@@ -0,0 +1,6 @@
+---
+# For reference on dataset card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/datasetcard.md?plain=1
+# Doc / guide: https://huggingface.co/docs/hub/datasets-cards
+{}
+---
+# Coming Soon
evals/{mjbench → mjbench-results}/detailed-results/AestheticsPredictor.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/BLIP-v2.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/CLIP-v2.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/Claude 3 Opus.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/GPT-4-vision.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/GPT-4o.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/Gemini Ultra.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/HPS-v2.1.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/Idefics2-8b.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/ImageReward.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/Instructblip-7b.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/InternVL-Chat-V1-5.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/LLaVA-1.5-13b.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/LLaVA-1.5-7b.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/LLaVA-NeXT-mistral-7b.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/LLaVA-NeXT-vicuna-13b.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/MiniGPT4-v2.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/PickScore-v1.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/Prometheus-Vision-13b.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/Prometheus-Vision-7b.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/Qwen-VL-Chat.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/AestheticsPredictor.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/BLIP-v2.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/CLIP-v2.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/Claude 3 Opus.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/GPT-4-vision.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/GPT-4o.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/Gemini Ultra.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/HPS-v2.1.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/Idefics2-8b.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/ImageReward.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/Instructblip-7b.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/InternVL-Chat-V1-5.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/LLaVA-1.5-13b.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/LLaVA-1.5-7b.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/LLaVA-NeXT-mistral-7b.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/LLaVA-NeXT-vicuna-13b.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/MiniGPT4-v2.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/PickScore-v1.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/Prometheus-Vision-13b.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/Prometheus-Vision-7b.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/Qwen-VL-Chat.json RENAMED
File without changes
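
Editor's note: the renamed per-model JSON files above are what the updated results_path in app.py points at. The Space's actual get_leaderboard_results is not shown in this diff; a hypothetical minimal reader, assuming each file holds one flat record of scores per model:

import json
from pathlib import Path

import pandas as pd

def load_results(results_dir: str) -> pd.DataFrame:
    rows = []
    for f in sorted(Path(results_dir).glob("*.json")):
        record = json.loads(f.read_text())
        # The filename (e.g. "GPT-4o.json") doubles as the model name.
        record.setdefault("Model", f.stem)
        rows.append(record)
    return pd.DataFrame(rows)

df = load_results("evals/mjbench-results/overall-results")
print(df.head())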
evals/mjbench/latex_reults/alignment_narrative.tex DELETED
@@ -1,37 +0,0 @@
-\begin{table}[h]
-\centering
-\caption{The detailed evaluation result of all multimodal judges on \textbf{alignment} perspective. The feedback are provided in the following Likert scale: [\textit{Extremely Poor}, \textit{Poor}, \textit{Average}, \textit{Good}, \textit{Outstanding}]. Specifically, we study their individual performance over five alignment objectives: object (existence), attribute, action, location, and count. The best performance across all models is bolded.}
-\resizebox{0.9\linewidth}{!}{%
-\begin{tabular}{c|cccccc}
-\toprule
- & Object & Attribute & Action & Location & Count & \cellcolor{skyblue}Avg \\
-\midrule
-% CLIP-v1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% BLIP-v2$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% PickScore-v1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% HPS-v2.1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% ImageReward$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% Aesthetics$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% \midrule
-LLaVA-1.5-7b$^\heartsuit$ & $19.1$ & $17.8$ & $20.5$ & $16.9$ & $25.0$ & \cellcolor{skyblue} $19.2$ \\
-LLaVA-1.5-13b$^\heartsuit$ & $22.7$ & $21.3$ & $22.2$ & $15.6$ & $17.9$ & \cellcolor{skyblue} $21.1$ \\
-LLaVA-NeXT-mistral-7b$^\heartsuit$ & $19.1$ & $17.8$ & $16.2$ & $10.4$ & $12.5$ & \cellcolor{skyblue} $16.8$ \\
-LLaVA-NeXT-vicuna-13b$^\heartsuit$ & $22.7$ & $21.3$ & $17.1$ & $20.8$ & $16.1$ & \cellcolor{skyblue} $20.7$ \\
-Instructblip-7b$^\heartsuit$ & $22.3$ & $20.9$ & $17.1$ & $15.6$ & $7.10$ & \cellcolor{skyblue} $19.2$ \\
-MiniGPT4-v2$^\heartsuit$ & $21.1$ & $27.0$ & $22.2$ & $23.4$ & $23.2$ & \cellcolor{skyblue} $23.5$ \\
-Prometheus-Vision-7b$^\heartsuit$ & $21.9$ & $17.4$ & $21.4$ & $18.2$ & $5.40$ & \cellcolor{skyblue} $18.7$ \\
-Prometheus-Vision-13b$^\heartsuit$ & $15.1$ & $13.9$ & $12.8$ & $11.5$ & $5.40$ & \cellcolor{skyblue} $13.3$ \\
-Qwen-VL-Chat$^\spadesuit$ & $22.7$ & $22.6$ & $22.2$ & $20.8$ & $26.8$ & \cellcolor{skyblue} $22.7$ \\
-Internvl-chat-v1-5$^\spadesuit$ & $19.9$ & $17.8$ & $20.5$ & $20.8$ & $26.8$ & \cellcolor{skyblue} $20.0$ \\
-Idefics2-8b$^\spadesuit$ & $27.9$ & $24.8$ & $26.5$ & $27.3$ & $28.6$ & \cellcolor{skyblue} $26.7$ \\
-\midrule
-GPT-4-vision$^\clubsuit$ & $46.3$ & $\bf 49.7$ & $39.7$ & $48.6$ & $\bf 50.7$ & \cellcolor{skyblue} $43.1$ \\
-GPT-4o$^\clubsuit$ & $\bf 46.6$ & $45.5$ & $\bf 41.9$ & $\bf 53.0$ & $50.0$ & \cellcolor{skyblue} $\bf 47.2$ \\
-Gemini Ultra$^\clubsuit$ & $27.9$ & $29.4$ & $20.2$ & $35.7$ & $29.5$ & \cellcolor{skyblue} $31.9$ \\
-Claude 3 Opus$^\clubsuit$ & $28.8$ & $26.3$ & $22.6$ & $35.7$ & $33.0$ & \cellcolor{skyblue} $29.8$ \\
-\bottomrule
-\end{tabular}}
-\label{exp:alignment_narrative_5}
-\end{table}
evals/mjbench/latex_reults/alignment_number_10.tex DELETED
@@ -1,29 +0,0 @@
-
-\begin{table}[h]
-\centering
-\caption{The detailed evaluation result of all multimodal judges on \textbf{alignment} perspective. The feedback are provided in numerical scale of range [0, 10]. Specifically, we study their individual performance over five alignment objectives: object (existence), attribute, action, location, and count. The best performance across all models is bolded.}
-\resizebox{0.9\linewidth}{!}{%
-\begin{tabular}{c|cccccc}
-\toprule
- & Object & Attribute & Action & Location & Count & \cellcolor{skyblue}Avg \\
-\midrule
-LLaVA-1.5-7b$^\heartsuit$ & $20.7$ & $25.2$ & $23.1$ & $18.2$ & $17.9$ & \cellcolor{skyblue} $22.0$ \\
-LLaVA-1.5-13b$^\heartsuit$ & $17.7$ & $13.5$ & $11.8$ & $16.5$ & $8.9$ & \cellcolor{skyblue} $10.3$ \\
-LLaVA-NeXT-mistral-7b$^\heartsuit$ & $25.9$ & $30.0$ & $41.9$ & $33.8$ & $35.7$ & \cellcolor{skyblue} $31.3$ \\
-LLaVA-NeXT-vicuna-13b$^\heartsuit$ & $25.9$ & $27.4$ & $31.6$ & $38.9$ & $32.1$ & \cellcolor{skyblue} $29.1$ \\
-Instructblip-7b$^\heartsuit$ & $17.1$ & $17.4$ & $16.2$ & $13.1$ & $21.4$ & \cellcolor{skyblue} $17.1$ \\
-MiniGPT4-v2$^\heartsuit$ & $37.5$ & $30.9$ & $30.8$ & $32.5$ & $39.3$ & \cellcolor{skyblue} $32.8$ \\
-Prometheus-Vision-7b$^\heartsuit$ & $19.5$ & $15.2$ & $16.2$ & $22.1$ & $26.8$ & \cellcolor{skyblue} $18.8$ \\
-Prometheus-Vision-13b$^\heartsuit$ & $14.3$ & $10.9$ & $9.4$ & $11.7$ & $16.1$ & \cellcolor{skyblue} $11.8$ \\
-Qwen-VL-Chat$^\spadesuit$ & $30.7$ & $29.1$ & $35.9$ & $29.9$ & $32.1$ & \cellcolor{skyblue} $31.1$ \\
-Internvl-chat-v1-5$^\spadesuit$ & $\bf 73.3$ & $\bf 74.8$ & $\bf 78.6$ & $\bf 80.5$ & $\bf 78.6$ & \cellcolor{skyblue} $\bf 75.8$ \\
-Idefics2-8b$^\spadesuit$ & $35.5$ & $31.7$ & $30.8$ & $29.9$ & $30.4$ & \cellcolor{skyblue} $32.6$ \\
-\midrule
-GPT-4-vision$^\clubsuit$ & $68.1$ & $62.9$ & $64.1$ & $67.1$ & $73.2$ & \cellcolor{skyblue} $66.1$ \\
-GPT-4o$^\clubsuit$ & $62.2$ & $57.2$ & $64.1$ & $63.2$ & $67.9$ & \cellcolor{skyblue} $61.5$ \\
-Gemini Ultra$^\clubsuit$ & $71.7$ & $65.1$ & $63.2$ & $64.5$ & $67.8$ & \cellcolor{skyblue} $67.2$ \\
-Claude 3 Opus$^\clubsuit$ & $64.9$ & $38.9$ & $44.4$ & $55.3$ & $55.4$ & \cellcolor{skyblue} $57.1$ \\
-\bottomrule
-\end{tabular}}
-\label{exp:alignment_number_10}
-\end{table}
evals/mjbench/latex_reults/alignment_number_5.tex DELETED
@@ -1,35 +0,0 @@
-\begin{table}[h]
-\centering
-\caption{The detailed evaluation result of all multimodal judges on \textbf{alignment} perspective. The feedback is provided in the numerical scale of range [0, 5]. Specifically, we study their individual performance over five alignment objectives: object (existence), attribute, action, location, and count. The best performance across all models is bolded.}
-\resizebox{0.9\linewidth}{!}{%
-\begin{tabular}{c|cccccc}
-\toprule
- & Object & Attribute & Action & Location & Count & \cellcolor{skyblue}Avg \\
-\midrule
-% CLIP-v1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% BLIP-v2$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% PickScore-v1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% HPS-v2.1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% ImageReward$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% Aesthetics$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% \midrule
-LLaVA-1.5-7b$^\heartsuit$ & 27.1 & 25.7 & 28.2 & 26.0 & 26.8 & \cellcolor{skyblue} 26.8 \\
-LLaVA-1.5-13b$^\heartsuit$ & 11.2 & 14.5 & 12.8 & 7.80 & 14.3 & \cellcolor{skyblue} 12.1 \\
-LLaVA-NeXT-mistral-7b$^\heartsuit$ & 27.9 & 28.3 & 29.1 & 24.7 & 25.0 & \cellcolor{skyblue} 27.0 \\
-LLaVA-NeXT-vicuna-13b$^\heartsuit$ & 28.7 & 21.3 & 31.6 & 28.6 & 26.8 & \cellcolor{skyblue} 27.4 \\
-Instructblip-7b$^\heartsuit$ & 19.9 & 20.9 & 25.6 & 18.2 & 19.6 & \cellcolor{skyblue} 20.8 \\
-MiniGPT4-v2$^\heartsuit$ & 27.5 & 26.1 & 32.5 & 37.7 & 26.8 & \cellcolor{skyblue} 30.1 \\
-Prometheus-Vision-7b$^\heartsuit$ & 18.7 & 13.5 & 14.5 & 19.5 & 25.0 & \cellcolor{skyblue} 18.2 \\
-Prometheus-Vision-13b$^\heartsuit$ & 12.4 & 11.3 & 9.4 & 11.7 & 12.5 & \cellcolor{skyblue} 11.5 \\
-Qwen-VL-Chat$^\spadesuit$ & 30.3 & 34.8 & 39.3 & 40.3 & 35.7 & \cellcolor{skyblue} 36.1 \\
-Internvl-chat-v1-5$^\spadesuit$ & 24.7 & 28.7 & 25.6 & 29.9 & 37.5 & \cellcolor{skyblue} 29.3 \\
-Idefics2-8b$^\spadesuit$ & 17.1 & 17.0 & 13.5 & 14.3 & 19.6 & \cellcolor{skyblue} 16.3 \\
-\midrule
-GPT-4-vision$^\clubsuit$ & \bf 45.3 & \bf 46.3 & 41.3 & 48.3 & 48.3 & \cellcolor{skyblue} 45.9 \\
-GPT-4o$^\clubsuit$ & 44.2 & 45.3 & \bf 43.3 & \bf 53.4 & \bf 51.3 & \cellcolor{skyblue} \bf 48.6 \\
-Gemini Ultra$^\clubsuit$ & 31.7 & 29.7 & 23.7 & 39.7 & 32.7 & \cellcolor{skyblue} 29.9 \\
-Claude 3 Opus$^\clubsuit$ & 24.9 & 28.9 & 25.9 & 31.2 & 29.2 & \cellcolor{skyblue} 26.3 \\
-\bottomrule
-\end{tabular}}
-\label{exp:alignment_number_5}
-\end{table}
evals/mjbench/latex_reults/artifact_narrative.tex DELETED
@@ -1,29 +0,0 @@
-\begin{table}[h]
-\centering
-\caption{The detailed evaluation result of all multimodal judges on \textbf{quality} perspective. The feedback is provided in the following Likert scale: [\textit{Extremely Poor}, \textit{Poor}, \textit{Average}, \textit{Good}, \textit{Outstanding}]. Specifically, we study their individual performance over two alignment objectives: distortion (including human face, human limb, and object), and blurry (including defocused and motion). The best performance across all models is bolded.}
-\resizebox{1.0\linewidth}{!}{%
-\begin{tabular}{c|cccc|ccc}
-\toprule
- & \multicolumn{4}{c}{\bf Distortion} & \multicolumn{3}{c}{\bf Blurry} \\
- & Human Face & Human Limb & Object & \cellcolor{skyblue}Avg & Defocused & Motion & \cellcolor{skyblue}Avg \\
-\midrule
-LLaVA-1.5-7b$^\heartsuit$ & 0.00 & 0.00 & 0.00 & \cellcolor{skyblue} 0.00 & 1.80 & 10.6 & \cellcolor{skyblue} 6.50 \\
-LLaVA-1.5-13b$^\heartsuit$ & 0.00 & 0.00 & 0.00 & \cellcolor{skyblue} 0.00 & 18.7 & 29.7 & \cellcolor{skyblue} 24.9 \\
-LLaVA-NeXT-mistral-7b$^\heartsuit$ & 10.8 & 14.2 & 1.30 & \cellcolor{skyblue} 9.10 & 56.7 & 73.0 & \cellcolor{skyblue} 61.3 \\
-LLaVA-NeXT-vicuna-13b$^\heartsuit$ & 19.6 & 14.3 & 13.9 & \cellcolor{skyblue} 16.8 & 25.8 & 27.3 & \cellcolor{skyblue} 26.6 \\
-Instructblip-7b$^\heartsuit$ & 9.80 & 3.00 & 18.7 & \cellcolor{skyblue} 10.9 & 9.80 & 9.90 & \cellcolor{skyblue} 9.50 \\
-Prometheus-Vision-7b$^\heartsuit$ & 19.8 & 15.6 & 12.2 & \cellcolor{skyblue} 16.0 & 26.0 & 29.2 & \cellcolor{skyblue} 27.2 \\
-Prometheus-Vision-13b$^\heartsuit$ & 7.40 & 5.10 & 7.30 & \cellcolor{skyblue} 6.80 & 9.40 & 11.7 & \cellcolor{skyblue} 11.1 \\
-Qwen-VL-Chat$^\spadesuit$ & 25.2 & 21.6 & 6.70 & \cellcolor{skyblue} 17.4 & 18.8 & 20.1 & \cellcolor{skyblue} 19.3 \\
-Internvl-chat-v1-5$^\spadesuit$ & 22.1 & 24.2 & 1.20 & \cellcolor{skyblue} 16.0 & \bf 94.2 & 96.1 & \cellcolor{skyblue} \bf 95.3 \\
-Idefics2-8b$^\spadesuit$ & 40.9 & 29.6 & 10.1 & \cellcolor{skyblue} 27.0 & 90.2 & 67.5 & \cellcolor{skyblue} 79.2 \\
-\midrule
-GPT-4-vision$^\clubsuit$ & 86.9 & 54.4 & 78.7 & \cellcolor{skyblue} 71.5 & 90.6 & \bf 93.5 & \cellcolor{skyblue} 93.6 \\
-GPT-4o$^\clubsuit$ & \bf 98.2 & \bf 71.1 & \bf 89.9 & \cellcolor{skyblue} \bf 83.6 & 91.8 & 96.1 & \cellcolor{skyblue} 91.6 \\
-Gemini Ultra$^\clubsuit$ & 71.3 & 30.5 & 59.2 & \cellcolor{skyblue} 48.8 & 80.6 & 90.9 & \cellcolor{skyblue} 79.5 \\
-Claude 3 Opus$^\clubsuit$ & 21.3 & 17.2 & 9.50 & \cellcolor{skyblue} 14.0 & 85.9 & 93.1 & \cellcolor{skyblue} 83.7 \\
-\bottomrule
-\end{tabular}%
-}
-\label{exp:artifact_result_narrative_5}
-\end{table}
evals/mjbench/latex_reults/artifact_number_10.tex DELETED
@@ -1,38 +0,0 @@
-
-\begin{table}[h]
-\centering
-\caption{The detailed evaluation result of all multimodal judges on \textbf{quality} perspective. The feedback are provided in numerical scale of range [0, 10]. Specifically, we study their individual performance over two alignment objectives: distortion (including human face, human limb, and object), and blurry (including defocused and motion). The best performance across all models is bolded.}
-\resizebox{1.0\linewidth}{!}{%
-\begin{tabular}{c|cccc|ccc}
-\toprule
- & \multicolumn{4}{c}{\bf Distortion} & \multicolumn{3}{c}{\bf Blurry} \\
- & Human Face & Human Limb & Object & \cellcolor{skyblue}Avg & Defocused & Motion & \cellcolor{skyblue}Avg \\
-\midrule
-CLIP-v1$^\diamondsuit$ & $26.6$ & $17.2$ & $34.0$ & \cellcolor{skyblue} $19.3$ & $50.6$ & $63.7$ & \cellcolor{skyblue} $56.7$ \\
-BLIP-v2$^\diamondsuit$ & $3.60$ & $2.00$ & $1.10$ & \cellcolor{skyblue} $1.90$ & $8.30$ & $47.2$ & \cellcolor{skyblue} $15.0$ \\
-PickScore-v1$^\diamondsuit$ & $83.4$ & $68.2$ & $92.1$ & \cellcolor{skyblue} $79.3$ & $80.6$ & $93.4$ & \cellcolor{skyblue} $86.6$ \\
-HPS-v2.1$^\diamondsuit$ & $60.4$ & $37.1$ & $80.3$ & \cellcolor{skyblue} $51.7$ & $85.7$ & $94.6$ & \cellcolor{skyblue} $88.6$ \\
-ImageReward$^\diamondsuit$ & $31.4$ & $34.4$ & $40.2$ & \cellcolor{skyblue} $33.3$ & $77.4$ & $86.6$ & \cellcolor{skyblue} $82.1$ \\
-Aesthetics$^\diamondsuit$ & $78.7$ & $57.1$ & $51.3$ & \cellcolor{skyblue} $52.1$ & $90.1$ & $93.4$ & \cellcolor{skyblue} $91.6$ \\
-\midrule
-LLaVA-1.5-7b$^\heartsuit$ & $13.6$ & $7.30$ & $9.20$ & \cellcolor{skyblue} $10.2$ & $7.10$ & $19.1$ & \cellcolor{skyblue} $13.1$ \\
-LLaVA-1.5-13b$^\heartsuit$ & $20.1$ & $14.6$ & $13.3$ & \cellcolor{skyblue} $16.4$ & $18.0$ & $34.0$ & \cellcolor{skyblue} $26.1$ \\
-LLaVA-NeXT-7b$^\heartsuit$ & $28.4$ & $27.8$ & $19.0$ & \cellcolor{skyblue} $30.1$ & $41.7$ & $66.1$ & \cellcolor{skyblue} $53.9$ \\
-LLaVA-NeXT-13b$^\heartsuit$ & $18.9$ & $27.8$ & $12.0$ & \cellcolor{skyblue} $20.5$ & $40.6$ & $45.4$ & \cellcolor{skyblue} $43.0$ \\
-Instructblip-7b$^\heartsuit$ & $12.4$ & $9.30$ & $21.0$ & \cellcolor{skyblue} $13.3$ & $32.3$ & $31.1$ & \cellcolor{skyblue} $31.7$ \\
-MiniGPT4-v2$^\heartsuit$ & $39.6$ & $39.1$ & $42.0$ & \cellcolor{skyblue} $40.0$ & $33.4$ & $37.4$ & \cellcolor{skyblue} $35.4$ \\
-Prometheus-Vision-7b$^\heartsuit$ & $16.6$ & $17.9$ & $14.1$ & \cellcolor{skyblue} $16.4$ & $22.3$ & $30.3$ & \cellcolor{skyblue} $26.3$ \\
-Prometheus-Vision-13b$^\heartsuit$ & $7.10$ & $4.60$ & $7.20$ & \cellcolor{skyblue} $6.20$ & $9.40$ & $10.6$ & \cellcolor{skyblue} $10.0$ \\
-Qwen-VL-Chat$^\spadesuit$ & $14.2$ & $15.9$ & $9.40$ & \cellcolor{skyblue} $13.6$ & $0.90$ & $2.10$ & \cellcolor{skyblue} $1.40$ \\
-Internvl-chat-v1-5$^\spadesuit$ & $97.0$ & $\bf 95.4$ & $97.1$ & \cellcolor{skyblue} $\bf 97.1$ & $89.7$ & $89.7$ & \cellcolor{skyblue} $89.7$ \\
-Idefics2-8b$^\spadesuit$ & $29.6$ & $25.8$ & $2.30$ & \cellcolor{skyblue} $21.7$ & $70.6$ & $46.9$ & \cellcolor{skyblue} $58.7$ \\
-\midrule
-GPT-4-vision$^\clubsuit$ & $87.6$ & $57.6$ & $83.1$ & \cellcolor{skyblue} $75.7$ & $98.8$ & $99.3$ & \cellcolor{skyblue} $99.2$ \\
-GPT-4o$^\clubsuit$ & $\bf 99.4$ & $78.2$ & $\bf 100$ & \cellcolor{skyblue} $93.8$ & $\bf 100$ & $\bf 100$ & \cellcolor{skyblue} $\bf 100$ \\
-Gemini Ultra$^\clubsuit$ & $73.4$ & $32.5$ & $61.0$ & \cellcolor{skyblue} $55.7$ & $86.5$ & $97.3$ & \cellcolor{skyblue} $93.9$ \\
-Claude 3 Opus$^\clubsuit$ & $26.6$ & $19.3$ & $10.7$ & \cellcolor{skyblue} $17.6$ & $89.6$ & $93.3$ & \cellcolor{skyblue} $92.7$ \\
-\bottomrule
-\end{tabular}%
-}
-\label{exp:artifact_result_number_10}
-\end{table}
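
Editor's note on the Avg columns in the deleted tables: app.py aggregates subset scores using SUBSET_COUNTS and PERSPECTIVE_COUNTS, which suggests (unverified from this diff alone) a count-weighted mean rather than a plain mean. A sketch of that aggregation, using the only two counts visible in this diff:

# Excerpt of app.py's SUBSET_COUNTS; the remaining counts are not shown in this diff.
SUBSET_COUNTS = {"Alignment-Object": 250, "Alignment-Attribute": 229}

def weighted_avg(scores: dict, counts: dict) -> float:
    # Weight each subset score by its number of prompts.
    total = sum(counts[k] for k in scores)
    return sum(scores[k] * counts[k] for k in scores) / total

print(weighted_avg({"Alignment-Object": 19.1, "Alignment-Attribute": 17.8},
                   SUBSET_COUNTS))  # ~18.48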