yichao committed on
Commit 9c9b7f5 • 1 Parent(s): d04ff35

update mj-bench

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. app.py +11 -2
  2. evals/.gitattributes +55 -0
  3. evals/README.md +6 -0
  4. evals/{mjbench → mjbench-results}/detailed-results/AestheticsPredictor.json +0 -0
  5. evals/{mjbench → mjbench-results}/detailed-results/BLIP-v2.json +0 -0
  6. evals/{mjbench → mjbench-results}/detailed-results/CLIP-v2.json +0 -0
  7. evals/{mjbench → mjbench-results}/detailed-results/Claude 3 Opus.json +0 -0
  8. evals/{mjbench → mjbench-results}/detailed-results/GPT-4-vision.json +0 -0
  9. evals/{mjbench → mjbench-results}/detailed-results/GPT-4o.json +0 -0
  10. evals/{mjbench → mjbench-results}/detailed-results/Gemini Ultra.json +0 -0
  11. evals/{mjbench → mjbench-results}/detailed-results/HPS-v2.1.json +0 -0
  12. evals/{mjbench → mjbench-results}/detailed-results/Idefics2-8b.json +0 -0
  13. evals/{mjbench → mjbench-results}/detailed-results/ImageReward.json +0 -0
  14. evals/{mjbench → mjbench-results}/detailed-results/Instructblip-7b.json +0 -0
  15. evals/{mjbench → mjbench-results}/detailed-results/InternVL-Chat-V1-5.json +0 -0
  16. evals/{mjbench → mjbench-results}/detailed-results/LLaVA-1.5-13b.json +0 -0
  17. evals/{mjbench → mjbench-results}/detailed-results/LLaVA-1.5-7b.json +0 -0
  18. evals/{mjbench → mjbench-results}/detailed-results/LLaVA-NeXT-mistral-7b.json +0 -0
  19. evals/{mjbench → mjbench-results}/detailed-results/LLaVA-NeXT-vicuna-13b.json +0 -0
  20. evals/{mjbench → mjbench-results}/detailed-results/MiniGPT4-v2.json +0 -0
  21. evals/{mjbench → mjbench-results}/detailed-results/PickScore-v1.json +0 -0
  22. evals/{mjbench → mjbench-results}/detailed-results/Prometheus-Vision-13b.json +0 -0
  23. evals/{mjbench → mjbench-results}/detailed-results/Prometheus-Vision-7b.json +0 -0
  24. evals/{mjbench → mjbench-results}/detailed-results/Qwen-VL-Chat.json +0 -0
  25. evals/{mjbench → mjbench-results}/overall-results/AestheticsPredictor.json +0 -0
  26. evals/{mjbench → mjbench-results}/overall-results/BLIP-v2.json +0 -0
  27. evals/{mjbench → mjbench-results}/overall-results/CLIP-v2.json +0 -0
  28. evals/{mjbench → mjbench-results}/overall-results/Claude 3 Opus.json +0 -0
  29. evals/{mjbench → mjbench-results}/overall-results/GPT-4-vision.json +0 -0
  30. evals/{mjbench → mjbench-results}/overall-results/GPT-4o.json +0 -0
  31. evals/{mjbench → mjbench-results}/overall-results/Gemini Ultra.json +0 -0
  32. evals/{mjbench → mjbench-results}/overall-results/HPS-v2.1.json +0 -0
  33. evals/{mjbench → mjbench-results}/overall-results/Idefics2-8b.json +0 -0
  34. evals/{mjbench → mjbench-results}/overall-results/ImageReward.json +0 -0
  35. evals/{mjbench → mjbench-results}/overall-results/Instructblip-7b.json +0 -0
  36. evals/{mjbench → mjbench-results}/overall-results/InternVL-Chat-V1-5.json +0 -0
  37. evals/{mjbench → mjbench-results}/overall-results/LLaVA-1.5-13b.json +0 -0
  38. evals/{mjbench → mjbench-results}/overall-results/LLaVA-1.5-7b.json +0 -0
  39. evals/{mjbench → mjbench-results}/overall-results/LLaVA-NeXT-mistral-7b.json +0 -0
  40. evals/{mjbench → mjbench-results}/overall-results/LLaVA-NeXT-vicuna-13b.json +0 -0
  41. evals/{mjbench → mjbench-results}/overall-results/MiniGPT4-v2.json +0 -0
  42. evals/{mjbench → mjbench-results}/overall-results/PickScore-v1.json +0 -0
  43. evals/{mjbench → mjbench-results}/overall-results/Prometheus-Vision-13b.json +0 -0
  44. evals/{mjbench → mjbench-results}/overall-results/Prometheus-Vision-7b.json +0 -0
  45. evals/{mjbench → mjbench-results}/overall-results/Qwen-VL-Chat.json +0 -0
  46. evals/mjbench/latex_reults/alignment_narrative.tex +0 -37
  47. evals/mjbench/latex_reults/alignment_number_10.tex +0 -29
  48. evals/mjbench/latex_reults/alignment_number_5.tex +0 -35
  49. evals/mjbench/latex_reults/artifact_narrative.tex +0 -29
  50. evals/mjbench/latex_reults/artifact_number_10.tex +0 -38
app.py CHANGED
@@ -35,6 +35,14 @@ from src.display.utils import (
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 
+try:
+    print(EVAL_RESULTS_PATH)
+    snapshot_download(
+        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+    )
+except Exception:
+    restart_space()
+
 SUBSET_COUNTS = {
     "Alignment-Object": 250,
     "Alignment-Attribute": 229,
@@ -71,6 +79,7 @@ PERSPECTIVE_COUNTS= {
 META_DATA = ['Model', 'Model Type', 'Input Type', 'Organization']
 
 
+
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
@@ -192,12 +201,12 @@ def avg_all_perspective(orig_df: pd.DataFrame, columns_name: list, meta_data=MET
     return new_df
 
 
-results_path = Path("./evals/mjbench/eval-results")
+results_path = Path(f"{EVAL_RESULTS_PATH}/mjbench-results/detailed-results")
 orig_df = get_leaderboard_results(results_path)
 colmuns_name = list(SUBSET_COUNTS.keys())
 detailed_df = avg_all_subset(orig_df, colmuns_name).round(2)
 
-results_path = Path("./evals/mjbench/overall-results")
+results_path = Path(f"{EVAL_RESULTS_PATH}/mjbench-results/overall-results")
 orig_df = get_leaderboard_results(results_path)
 colmuns_name = list(PERSPECTIVE_COUNTS.keys())
 perspective_df = avg_all_perspective(orig_df, colmuns_name).round(2)
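
Editor's note: the substance of this change is that app.py no longer reads results committed alongside the Space; it syncs the results dataset from the Hub at startup and restarts the Space if the sync fails. A minimal standalone sketch of that flow, with illustrative stand-ins for the real values imported from src.envs:

from pathlib import Path

from huggingface_hub import HfApi, snapshot_download

RESULTS_REPO = "org/mjbench-results"       # assumption: illustrative dataset id
EVAL_RESULTS_PATH = "eval-results"         # assumption: local sync directory
REPO_ID = "org/mjbench-leaderboard"        # assumption: the Space itself
TOKEN = None                               # HF token, if the dataset is gated

API = HfApi()

def restart_space():
    # Restarting the Space retries the whole startup, download included.
    API.restart_space(repo_id=REPO_ID)

try:
    # Mirror the results dataset into the local directory.
    snapshot_download(
        repo_id=RESULTS_REPO,
        local_dir=EVAL_RESULTS_PATH,
        repo_type="dataset",
        tqdm_class=None,
        etag_timeout=30,
        token=TOKEN,
    )
except Exception:
    restart_space()

# The leaderboard then reads from the synced copy rather than ./evals:
results_path = Path(f"{EVAL_RESULTS_PATH}/mjbench-results/detailed-results")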
evals/.gitattributes ADDED
@@ -0,0 +1,55 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.lz4 filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+# Audio files - uncompressed
+*.pcm filter=lfs diff=lfs merge=lfs -text
+*.sam filter=lfs diff=lfs merge=lfs -text
+*.raw filter=lfs diff=lfs merge=lfs -text
+# Audio files - compressed
+*.aac filter=lfs diff=lfs merge=lfs -text
+*.flac filter=lfs diff=lfs merge=lfs -text
+*.mp3 filter=lfs diff=lfs merge=lfs -text
+*.ogg filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+# Image files - uncompressed
+*.bmp filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.tiff filter=lfs diff=lfs merge=lfs -text
+# Image files - compressed
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.webp filter=lfs diff=lfs merge=lfs -text
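
Editor's note: each attribute line above routes matching files through Git LFS (a pointer stays in Git, the payload goes to LFS storage). As a quick sanity check of which paths a pattern set captures, a small Python sketch using fnmatch-style matching (an approximation of gitattributes semantics, not an exact reimplementation):

from fnmatch import fnmatch
from pathlib import Path

# Excerpt of the patterns above; extend as needed.
LFS_PATTERNS = ["*.7z", "*.parquet", "*.png", "*.safetensors", "saved_model/**/*"]

def is_lfs_tracked(path: str, patterns=LFS_PATTERNS) -> bool:
    name = Path(path).name
    # gitattributes matches bare patterns against the basename and
    # patterns containing "/" against the full path; fnmatch's "*"
    # also crosses "/" boundaries, hence "approximation".
    return any(fnmatch(path, p) if "/" in p else fnmatch(name, p)
               for p in patterns)

print(is_lfs_tracked("evals/figs/overview.png"))  # True
print(is_lfs_tracked("evals/README.md"))          # False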
evals/README.md ADDED
@@ -0,0 +1,6 @@
+---
+# For reference on dataset card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/datasetcard.md?plain=1
+# Doc / guide: https://huggingface.co/docs/hub/datasets-cards
+{}
+---
+# Coming Soon
evals/{mjbench → mjbench-results}/detailed-results/AestheticsPredictor.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/BLIP-v2.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/CLIP-v2.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/Claude 3 Opus.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/GPT-4-vision.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/GPT-4o.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/Gemini Ultra.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/HPS-v2.1.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/Idefics2-8b.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/ImageReward.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/Instructblip-7b.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/InternVL-Chat-V1-5.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/LLaVA-1.5-13b.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/LLaVA-1.5-7b.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/LLaVA-NeXT-mistral-7b.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/LLaVA-NeXT-vicuna-13b.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/MiniGPT4-v2.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/PickScore-v1.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/Prometheus-Vision-13b.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/Prometheus-Vision-7b.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/Qwen-VL-Chat.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/AestheticsPredictor.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/BLIP-v2.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/CLIP-v2.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/Claude 3 Opus.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/GPT-4-vision.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/GPT-4o.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/Gemini Ultra.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/HPS-v2.1.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/Idefics2-8b.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/ImageReward.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/Instructblip-7b.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/InternVL-Chat-V1-5.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/LLaVA-1.5-13b.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/LLaVA-1.5-7b.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/LLaVA-NeXT-mistral-7b.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/LLaVA-NeXT-vicuna-13b.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/MiniGPT4-v2.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/PickScore-v1.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/Prometheus-Vision-13b.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/Prometheus-Vision-7b.json RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/Qwen-VL-Chat.json RENAMED
File without changes
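
Editor's note: the renamed per-model JSON files above are what the updated results_path in app.py points at. The Space's actual get_leaderboard_results is not shown in this diff; a hypothetical minimal reader, assuming each file holds one flat record of scores per model:

import json
from pathlib import Path

import pandas as pd

def load_results(results_dir: str) -> pd.DataFrame:
    rows = []
    for f in sorted(Path(results_dir).glob("*.json")):
        record = json.loads(f.read_text())
        # The filename (e.g. "GPT-4o.json") doubles as the model name.
        record.setdefault("Model", f.stem)
        rows.append(record)
    return pd.DataFrame(rows)

df = load_results("evals/mjbench-results/overall-results")
print(df.head())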
evals/mjbench/latex_reults/alignment_narrative.tex DELETED
@@ -1,37 +0,0 @@
-\begin{table}[h]
-\centering
-\caption{The detailed evaluation result of all multimodal judges on \textbf{alignment} perspective. The feedback are provided in the following Likert scale: [\textit{Extremely Poor}, \textit{Poor}, \textit{Average}, \textit{Good}, \textit{Outstanding}]. Specifically, we study their individual performance over five alignment objectives: object (existence), attribute, action, location, and count. The best performance across all models is bolded.}
-\resizebox{0.9\linewidth}{!}{%
-\begin{tabular}{c|cccccc}
-\toprule
- & Object & Attribute & Action & Location & Count & \cellcolor{skyblue}Avg \\
-\midrule
-% CLIP-v1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% BLIP-v2$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% PickScore-v1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% HPS-v2.1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% ImageReward$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% Aesthetics$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% \midrule
-LLaVA-1.5-7b$^\heartsuit$ & $19.1$ & $17.8$ & $20.5$ & $16.9$ & $25.0$ & \cellcolor{skyblue} $19.2$ \\
-LLaVA-1.5-13b$^\heartsuit$ & $22.7$ & $21.3$ & $22.2$ & $15.6$ & $17.9$ & \cellcolor{skyblue} $21.1$ \\
-LLaVA-NeXT-mistral-7b$^\heartsuit$ & $19.1$ & $17.8$ & $16.2$ & $10.4$ & $12.5$ & \cellcolor{skyblue} $16.8$ \\
-LLaVA-NeXT-vicuna-13b$^\heartsuit$ & $22.7$ & $21.3$ & $17.1$ & $20.8$ & $16.1$ & \cellcolor{skyblue} $20.7$ \\
-Instructblip-7b$^\heartsuit$ & $22.3$ & $20.9$ & $17.1$ & $15.6$ & $7.10$ & \cellcolor{skyblue} $19.2$ \\
-MiniGPT4-v2$^\heartsuit$ & $21.1$ & $27.0$ & $22.2$ & $23.4$ & $23.2$ & \cellcolor{skyblue} $23.5$ \\
-Prometheus-Vision-7b$^\heartsuit$ & $21.9$ & $17.4$ & $21.4$ & $18.2$ & $5.40$ & \cellcolor{skyblue} $18.7$ \\
-Prometheus-Vision-13b$^\heartsuit$ & $15.1$ & $13.9$ & $12.8$ & $11.5$ & $5.40$ & \cellcolor{skyblue} $13.3$ \\
-Qwen-VL-Chat$^\spadesuit$ & $22.7$ & $22.6$ & $22.2$ & $20.8$ & $26.8$ & \cellcolor{skyblue} $22.7$ \\
-Internvl-chat-v1-5$^\spadesuit$ & $19.9$ & $17.8$ & $20.5$ & $20.8$ & $26.8$ & \cellcolor{skyblue} $20.0$ \\
-Idefics2-8b$^\spadesuit$ & $27.9$ & $24.8$ & $26.5$ & $27.3$ & $28.6$ & \cellcolor{skyblue} $26.7$ \\
-\midrule
-GPT-4-vision$^\clubsuit$ & $46.3$ & $\bf 49.7$ & $39.7$ & $48.6$ & $\bf 50.7$ & \cellcolor{skyblue} $43.1$ \\
-GPT-4o$^\clubsuit$ & $\bf 46.6$ & $45.5$ & $\bf 41.9$ & $\bf 53.0$ & $50.0$ & \cellcolor{skyblue} $\bf 47.2$ \\
-Gemini Ultra$^\clubsuit$ & $27.9$ & $29.4$ & $20.2$ & $35.7$ & $29.5$ & \cellcolor{skyblue} $31.9$ \\
-Claude 3 Opus$^\clubsuit$ & $28.8$ & $26.3$ & $22.6$ & $35.7$ & $33.0$ & \cellcolor{skyblue} $29.8$ \\
-\bottomrule
-\end{tabular}}
-\label{exp:alignment_narrative_5}
-\end{table}
evals/mjbench/latex_reults/alignment_number_10.tex DELETED
@@ -1,29 +0,0 @@
-
-\begin{table}[h]
-\centering
-\caption{The detailed evaluation result of all multimodal judges on \textbf{alignment} perspective. The feedback are provided in numerical scale of range [0, 10]. Specifically, we study their individual performance over five alignment objectives: object (existence), attribute, action, location, and count. The best performance across all models is bolded.}
-\resizebox{0.9\linewidth}{!}{%
-\begin{tabular}{c|cccccc}
-\toprule
- & Object & Attribute & Action & Location & Count & \cellcolor{skyblue}Avg \\
-\midrule
-LLaVA-1.5-7b$^\heartsuit$ & $20.7$ & $25.2$ & $23.1$ & $18.2$ & $17.9$ & \cellcolor{skyblue} $22.0$ \\
-LLaVA-1.5-13b$^\heartsuit$ & $17.7$ & $13.5$ & $11.8$ & $16.5$ & $8.9$ & \cellcolor{skyblue} $10.3$ \\
-LLaVA-NeXT-mistral-7b$^\heartsuit$ & $25.9$ & $30.0$ & $41.9$ & $33.8$ & $35.7$ & \cellcolor{skyblue} $31.3$ \\
-LLaVA-NeXT-vicuna-13b$^\heartsuit$ & $25.9$ & $27.4$ & $31.6$ & $38.9$ & $32.1$ & \cellcolor{skyblue} $29.1$ \\
-Instructblip-7b$^\heartsuit$ & $17.1$ & $17.4$ & $16.2$ & $13.1$ & $21.4$ & \cellcolor{skyblue} $17.1$ \\
-MiniGPT4-v2$^\heartsuit$ & $37.5$ & $30.9$ & $30.8$ & $32.5$ & $39.3$ & \cellcolor{skyblue} $32.8$ \\
-Prometheus-Vision-7b$^\heartsuit$ & $19.5$ & $15.2$ & $16.2$ & $22.1$ & $26.8$ & \cellcolor{skyblue} $18.8$ \\
-Prometheus-Vision-13b$^\heartsuit$ & $14.3$ & $10.9$ & $9.4$ & $11.7$ & $16.1$ & \cellcolor{skyblue} $11.8$ \\
-Qwen-VL-Chat$^\spadesuit$ & $30.7$ & $29.1$ & $35.9$ & $29.9$ & $32.1$ & \cellcolor{skyblue} $31.1$ \\
-Internvl-chat-v1-5$^\spadesuit$ & $\bf 73.3$ & $\bf 74.8$ & $\bf 78.6$ & $\bf 80.5$ & $\bf 78.6$ & \cellcolor{skyblue} $\bf 75.8$ \\
-Idefics2-8b$^\spadesuit$ & $35.5$ & $31.7$ & $30.8$ & $29.9$ & $30.4$ & \cellcolor{skyblue} $32.6$ \\
-\midrule
-GPT-4-vision$^\clubsuit$ & $68.1$ & $62.9$ & $64.1$ & $67.1$ & $73.2$ & \cellcolor{skyblue} $66.1$ \\
-GPT-4o$^\clubsuit$ & $62.2$ & $57.2$ & $64.1$ & $63.2$ & $67.9$ & \cellcolor{skyblue} $61.5$ \\
-Gemini Ultra$^\clubsuit$ & $71.7$ & $65.1$ & $63.2$ & $64.5$ & $67.8$ & \cellcolor{skyblue} $67.2$ \\
-Claude 3 Opus$^\clubsuit$ & $64.9$ & $38.9$ & $44.4$ & $55.3$ & $55.4$ & \cellcolor{skyblue} $57.1$ \\
-\bottomrule
-\end{tabular}}
-\label{exp:alignment_number_10}
-\end{table}
evals/mjbench/latex_reults/alignment_number_5.tex DELETED
@@ -1,35 +0,0 @@
-\begin{table}[h]
-\centering
-\caption{The detailed evaluation result of all multimodal judges on \textbf{alignment} perspective. The feedback is provided in the numerical scale of range [0, 5]. Specifically, we study their individual performance over five alignment objectives: object (existence), attribute, action, location, and count. The best performance across all models is bolded.}
-\resizebox{0.9\linewidth}{!}{%
-\begin{tabular}{c|cccccc}
-\toprule
- & Object & Attribute & Action & Location & Count & \cellcolor{skyblue}Avg \\
-\midrule
-% CLIP-v1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% BLIP-v2$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% PickScore-v1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% HPS-v2.1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% ImageReward$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% Aesthetics$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% \midrule
-LLaVA-1.5-7b$^\heartsuit$ & 27.1 & 25.7 & 28.2 & 26.0 & 26.8 & \cellcolor{skyblue} 26.8 \\
-LLaVA-1.5-13b$^\heartsuit$ & 11.2 & 14.5 & 12.8 & 7.80 & 14.3 & \cellcolor{skyblue} 12.1 \\
-LLaVA-NeXT-mistral-7b$^\heartsuit$ & 27.9 & 28.3 & 29.1 & 24.7 & 25.0 & \cellcolor{skyblue} 27.0 \\
-LLaVA-NeXT-vicuna-13b$^\heartsuit$ & 28.7 & 21.3 & 31.6 & 28.6 & 26.8 & \cellcolor{skyblue} 27.4 \\
-Instructblip-7b$^\heartsuit$ & 19.9 & 20.9 & 25.6 & 18.2 & 19.6 & \cellcolor{skyblue} 20.8 \\
-MiniGPT4-v2$^\heartsuit$ & 27.5 & 26.1 & 32.5 & 37.7 & 26.8 & \cellcolor{skyblue} 30.1 \\
-Prometheus-Vision-7b$^\heartsuit$ & 18.7 & 13.5 & 14.5 & 19.5 & 25.0 & \cellcolor{skyblue} 18.2 \\
-Prometheus-Vision-13b$^\heartsuit$ & 12.4 & 11.3 & 9.4 & 11.7 & 12.5 & \cellcolor{skyblue} 11.5 \\
-Qwen-VL-Chat$^\spadesuit$ & 30.3 & 34.8 & 39.3 & 40.3 & 35.7 & \cellcolor{skyblue} 36.1 \\
-Internvl-chat-v1-5$^\spadesuit$ & 24.7 & 28.7 & 25.6 & 29.9 & 37.5 & \cellcolor{skyblue} 29.3 \\
-Idefics2-8b$^\spadesuit$ & 17.1 & 17.0 & 13.5 & 14.3 & 19.6 & \cellcolor{skyblue} 16.3 \\
-\midrule
-GPT-4-vision$^\clubsuit$ & \bf 45.3 & \bf 46.3 & 41.3 & 48.3 & 48.3 & \cellcolor{skyblue} 45.9 \\
-GPT-4o$^\clubsuit$ & 44.2 & 45.3 & \bf 43.3 & \bf 53.4 & \bf 51.3 & \cellcolor{skyblue} \bf 48.6 \\
-Gemini Ultra$^\clubsuit$ & 31.7 & 29.7 & 23.7 & 39.7 & 32.7 & \cellcolor{skyblue} 29.9 \\
-Claude 3 Opus$^\clubsuit$ & 24.9 & 28.9 & 25.9 & 31.2 & 29.2 & \cellcolor{skyblue} 26.3 \\
-\bottomrule
-\end{tabular}}
-\label{exp:alignment_number_5}
-\end{table}
evals/mjbench/latex_reults/artifact_narrative.tex DELETED
@@ -1,29 +0,0 @@
-\begin{table}[h]
-\centering
-\caption{The detailed evaluation result of all multimodal judges on \textbf{quality} perspective. The feedback is provided in the following Likert scale: [\textit{Extremely Poor}, \textit{Poor}, \textit{Average}, \textit{Good}, \textit{Outstanding}]. Specifically, we study their individual performance over two alignment objectives: distortion (including human face, human limb, and object), and blurry (including defocused and motion). The best performance across all models is bolded.}
-\resizebox{1.0\linewidth}{!}{%
-\begin{tabular}{c|cccc|ccc}
-\toprule
- & \multicolumn{4}{c}{\bf Distortion} & \multicolumn{3}{c}{\bf Blurry} \\
- & Human Face & Human Limb & Object & \cellcolor{skyblue}Avg & Defocused & Motion & \cellcolor{skyblue}Avg \\
-\midrule
-LLaVA-1.5-7b$^\heartsuit$ & 0.00 & 0.00 & 0.00 & \cellcolor{skyblue} 0.00 & 1.80 & 10.6 & \cellcolor{skyblue} 6.50 \\
-LLaVA-1.5-13b$^\heartsuit$ & 0.00 & 0.00 & 0.00 & \cellcolor{skyblue} 0.00 & 18.7 & 29.7 & \cellcolor{skyblue} 24.9 \\
-LLaVA-NeXT-mistral-7b$^\heartsuit$ & 10.8 & 14.2 & 1.30 & \cellcolor{skyblue} 9.10 & 56.7 & 73.0 & \cellcolor{skyblue} 61.3 \\
-LLaVA-NeXT-vicuna-13b$^\heartsuit$ & 19.6 & 14.3 & 13.9 & \cellcolor{skyblue} 16.8 & 25.8 & 27.3 & \cellcolor{skyblue} 26.6 \\
-Instructblip-7b$^\heartsuit$ & 9.80 & 3.00 & 18.7 & \cellcolor{skyblue} 10.9 & 9.80 & 9.90 & \cellcolor{skyblue} 9.50 \\
-Prometheus-Vision-7b$^\heartsuit$ & 19.8 & 15.6 & 12.2 & \cellcolor{skyblue} 16.0 & 26.0 & 29.2 & \cellcolor{skyblue} 27.2 \\
-Prometheus-Vision-13b$^\heartsuit$ & 7.40 & 5.10 & 7.30 & \cellcolor{skyblue} 6.80 & 9.40 & 11.7 & \cellcolor{skyblue} 11.1 \\
-Qwen-VL-Chat$^\spadesuit$ & 25.2 & 21.6 & 6.70 & \cellcolor{skyblue} 17.4 & 18.8 & 20.1 & \cellcolor{skyblue} 19.3 \\
-Internvl-chat-v1-5$^\spadesuit$ & 22.1 & 24.2 & 1.20 & \cellcolor{skyblue} 16.0 & \bf 94.2 & 96.1 & \cellcolor{skyblue} \bf 95.3 \\
-Idefics2-8b$^\spadesuit$ & 40.9 & 29.6 & 10.1 & \cellcolor{skyblue} 27.0 & 90.2 & 67.5 & \cellcolor{skyblue} 79.2 \\
-\midrule
-GPT-4-vision$^\clubsuit$ & 86.9 & 54.4 & 78.7 & \cellcolor{skyblue} 71.5 & 90.6 & \bf 93.5 & \cellcolor{skyblue} 93.6 \\
-GPT-4o$^\clubsuit$ & \bf 98.2 & \bf 71.1 & \bf 89.9 & \cellcolor{skyblue} \bf 83.6 & 91.8 & 96.1 & \cellcolor{skyblue} 91.6 \\
-Gemini Ultra$^\clubsuit$ & 71.3 & 30.5 & 59.2 & \cellcolor{skyblue} 48.8 & 80.6 & 90.9 & \cellcolor{skyblue} 79.5 \\
-Claude 3 Opus$^\clubsuit$ & 21.3 & 17.2 & 9.50 & \cellcolor{skyblue} 14.0 & 85.9 & 93.1 & \cellcolor{skyblue} 83.7 \\
-\bottomrule
-\end{tabular}%
-}
-\label{exp:artifact_result_narrative_5}
-\end{table}
evals/mjbench/latex_reults/artifact_number_10.tex DELETED
@@ -1,38 +0,0 @@
-
-\begin{table}[h]
-\centering
-\caption{The detailed evaluation result of all multimodal judges on \textbf{quality} perspective. The feedback are provided in numerical scale of range [0, 10]. Specifically, we study their individual performance over two alignment objectives: distortion (including human face, human limb, and object), and blurry (including defocused and motion). The best performance across all models is bolded.}
-\resizebox{1.0\linewidth}{!}{%
-\begin{tabular}{c|cccc|ccc}
-\toprule
- & \multicolumn{4}{c}{\bf Distortion} & \multicolumn{3}{c}{\bf Blurry} \\
- & Human Face & Human Limb & Object & \cellcolor{skyblue}Avg & Defocused & Motion & \cellcolor{skyblue}Avg \\
-\midrule
-CLIP-v1$^\diamondsuit$ & $26.6$ & $17.2$ & $34.0$ & \cellcolor{skyblue} $19.3$ & $50.6$ & $63.7$ & \cellcolor{skyblue} $56.7$ \\
-BLIP-v2$^\diamondsuit$ & $3.60$ & $2.00$ & $1.10$ & \cellcolor{skyblue} $1.90$ & $8.30$ & $47.2$ & \cellcolor{skyblue} $15.0$ \\
-PickScore-v1$^\diamondsuit$ & $83.4$ & $68.2$ & $92.1$ & \cellcolor{skyblue} $79.3$ & $80.6$ & $93.4$ & \cellcolor{skyblue} $86.6$ \\
-HPS-v2.1$^\diamondsuit$ & $60.4$ & $37.1$ & $80.3$ & \cellcolor{skyblue} $51.7$ & $85.7$ & $94.6$ & \cellcolor{skyblue} $88.6$ \\
-ImageReward$^\diamondsuit$ & $31.4$ & $34.4$ & $40.2$ & \cellcolor{skyblue} $33.3$ & $77.4$ & $86.6$ & \cellcolor{skyblue} $82.1$ \\
-Aesthetics$^\diamondsuit$ & $78.7$ & $57.1$ & $51.3$ & \cellcolor{skyblue} $52.1$ & $90.1$ & $93.4$ & \cellcolor{skyblue} $91.6$ \\
-\midrule
-LLaVA-1.5-7b$^\heartsuit$ & $13.6$ & $7.30$ & $9.20$ & \cellcolor{skyblue} $10.2$ & $7.10$ & $19.1$ & \cellcolor{skyblue} $13.1$ \\
-LLaVA-1.5-13b$^\heartsuit$ & $20.1$ & $14.6$ & $13.3$ & \cellcolor{skyblue} $16.4$ & $18.0$ & $34.0$ & \cellcolor{skyblue} $26.1$ \\
-LLaVA-NeXT-7b$^\heartsuit$ & $28.4$ & $27.8$ & $19.0$ & \cellcolor{skyblue} $30.1$ & $41.7$ & $66.1$ & \cellcolor{skyblue} $53.9$ \\
-LLaVA-NeXT-13b$^\heartsuit$ & $18.9$ & $27.8$ & $12.0$ & \cellcolor{skyblue} $20.5$ & $40.6$ & $45.4$ & \cellcolor{skyblue} $43.0$ \\
-Instructblip-7b$^\heartsuit$ & $12.4$ & $9.30$ & $21.0$ & \cellcolor{skyblue} $13.3$ & $32.3$ & $31.1$ & \cellcolor{skyblue} $31.7$ \\
-MiniGPT4-v2$^\heartsuit$ & $39.6$ & $39.1$ & $42.0$ & \cellcolor{skyblue} $40.0$ & $33.4$ & $37.4$ & \cellcolor{skyblue} $35.4$ \\
-Prometheus-Vision-7b$^\heartsuit$ & $16.6$ & $17.9$ & $14.1$ & \cellcolor{skyblue} $16.4$ & $22.3$ & $30.3$ & \cellcolor{skyblue} $26.3$ \\
-Prometheus-Vision-13b$^\heartsuit$ & $7.10$ & $4.60$ & $7.20$ & \cellcolor{skyblue} $6.20$ & $9.40$ & $10.6$ & \cellcolor{skyblue} $10.0$ \\
-Qwen-VL-Chat$^\spadesuit$ & $14.2$ & $15.9$ & $9.40$ & \cellcolor{skyblue} $13.6$ & $0.90$ & $2.10$ & \cellcolor{skyblue} $1.40$ \\
-Internvl-chat-v1-5$^\spadesuit$ & $97.0$ & $\bf 95.4$ & $97.1$ & \cellcolor{skyblue} $\bf 97.1$ & $89.7$ & $89.7$ & \cellcolor{skyblue} $89.7$ \\
-Idefics2-8b$^\spadesuit$ & $29.6$ & $25.8$ & $2.30$ & \cellcolor{skyblue} $21.7$ & $70.6$ & $46.9$ & \cellcolor{skyblue} $58.7$ \\
-\midrule
-GPT-4-vision$^\clubsuit$ & $87.6$ & $57.6$ & $83.1$ & \cellcolor{skyblue} $75.7$ & $98.8$ & $99.3$ & \cellcolor{skyblue} $99.2$ \\
-GPT-4o$^\clubsuit$ & $\bf 99.4$ & $78.2$ & $\bf 100$ & \cellcolor{skyblue} $93.8$ & $\bf 100$ & $\bf 100$ & \cellcolor{skyblue} $\bf 100$ \\
-Gemini Ultra$^\clubsuit$ & $73.4$ & $32.5$ & $61.0$ & \cellcolor{skyblue} $55.7$ & $86.5$ & $97.3$ & \cellcolor{skyblue} $93.9$ \\
-Claude 3 Opus$^\clubsuit$ & $26.6$ & $19.3$ & $10.7$ & \cellcolor{skyblue} $17.6$ & $89.6$ & $93.3$ & \cellcolor{skyblue} $92.7$ \\
-\bottomrule
-\end{tabular}%
-}
-\label{exp:artifact_result_number_10}
-\end{table}
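
Editor's note on the Avg columns in the deleted tables: app.py aggregates subset scores using SUBSET_COUNTS and PERSPECTIVE_COUNTS, which suggests (unverified from this diff alone) a count-weighted mean rather than a plain mean. A sketch of that aggregation, using the only two counts visible in this diff:

# Excerpt of app.py's SUBSET_COUNTS; the remaining counts are not shown in this diff.
SUBSET_COUNTS = {"Alignment-Object": 250, "Alignment-Attribute": 229}

def weighted_avg(scores: dict, counts: dict) -> float:
    # Weight each subset score by its number of prompts.
    total = sum(counts[k] for k in scores)
    return sum(scores[k] * counts[k] for k in scores) / total

print(weighted_avg({"Alignment-Object": 19.1, "Alignment-Attribute": 17.8},
                   SUBSET_COUNTS))  # ~18.48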