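% Web-view residue from the file's hosting page, kept as comments so it is not typeset:
% yichao's picture
% update mj-bench
% b650828
% raw
% history blame
% 3.93 kB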
% Main results table: one row per multimodal judge, grouped by judge type
% (superscript suit symbol), with accuracy columns for alignment, safety and
% artifact (each with and without ties) and three bias metrics (ACC, NDS, GES).
\begin{table}[t]
\centering
% `\algname{}` keeps the following space even if \algname is defined without xspace.
\caption{Evaluation of three types of multimodal judges across four perspectives on \algname{} dataset. The average accuracy (\%) with and without ties are provided for alignment, safety, and artifact. We evaluate preference biases over three metrics, i.e., accuracy (ACC), normalized dispersion score (NDS), and Gini-based equality score (GES). The best performance across all models is bolded.}
\setlength{\tabcolsep}{2pt}
\renewcommand{\arraystretch}{0.9}
\resizebox{1.0\linewidth}{!}{%
\begin{tabular}{l|cc|cc|cc|ccc}
\toprule
% The group headers repeat the `|` of the column spec so the vertical rules
% are not interrupted in this row.
& \multicolumn{2}{c|}{\textbf{Alignment}} & \multicolumn{2}{c|}{\textbf{Safety}} & \multicolumn{2}{c|}{\textbf{Artifact}} & \multicolumn{3}{c}{\textbf{Bias}} \\
& Avg w/ Tie & Avg w/o Tie & Avg w/ Tie & Avg w/o Tie & Avg w/ Tie & Avg w/o Tie & ACC & NDS & GES \\
\midrule
% Bold marks the per-column maximum over the active (uncommented) rows, as
% stated in the caption.
% NOTE(review): bold was moved to 44.0 (Safety w/ Tie), 83.7 (Bias ACC) and
% 85.7 (Bias NDS) because those exceed the previously bolded 37.2, 79.0 and
% 82.5 -- confirm the underlying numbers against the raw results.
CLIP-v1$^\diamondsuit$ & $38.1$ & $59.5$ & $12.7$ & $33.3$ & $34.4$ & $68.4$ & $57.4$ & $76.3$ & $86.9$ \\
BLIP-v2$^\diamondsuit$ & $17.3$ & $38.8$ & \textbf{44.0} & $65.6$ & $7.5$ & $36.5$ & $68.7$ & $83.7$ & $91.3$ \\
PickScore-v1$^\diamondsuit$ & $58.8$ & $64.6$ & $37.2$ & $42.2$ & $83.8$ & $89.6$ & $31.0$ & $66.5$ & $81.1$ \\
HPS-v2.1$^\diamondsuit$ & $47.3$ & \textbf{70.1} & $18.8$ & $41.3$ & $67.3$ & $93.5$ & $55.0$ & $77.9$ & $87.6$ \\
ImageReward$^\diamondsuit$ & $50.9$ & $64.7$ & $24.9$ & $38.7$ & $63.5$ & $81.8$ & $40.9$ & $73.7$ & $85.3$ \\
Aesthetics$^\diamondsuit$ & $32.4$ & $52.7$ & $27.0$ & $53.6$ & $69.6$ & $92.5$ & $61.4$ & \textbf{85.7} & $92.1$ \\
\midrule
LLaVA-1.5-7b$^\heartsuit$ & $22.0$ & $50.8$ & $24.8$ & $50.2$ & $12.4$ & $51.6$ & \textbf{83.7} & 70.4 & 88.7 \\
LLaVA-1.5-13b$^\heartsuit$ & $10.3$ & $51.9$ & $30.7$ & $60.7$ & $23.3$ & $61.2$ & 69.7 & 74.3 & 88.6 \\
LLaVA-1.6-mistral-7b$^\heartsuit$ & $31.3$ & $62.7$ & $15.2$ & $40.9$ & $45.8$ & $73.2$ & 69.9 & 64.3 & 85.4 \\
LLaVA-1.6-vicuna-13b$^\heartsuit$ & $29.1$ & $60.3$ & $27.9$ & $45.6$ & $36.8$ & $62.5$ & 56.3 & 64.0 & 82.7 \\
Instructblip-7b$^\heartsuit$ & $17.1$ & $49.8$ & $26.4$ & $46.9$ & $25.2$ & $64.1$ & 53.1 & 80.8 & 91.2 \\
MiniGPT4-v2$^\heartsuit$ & $32.8$ & $51.2$ & $25.7$ & $60.1$ & $36.7$ & $47.8$ & 32.6 & 67.0 & 83.3 \\
Prometheus-Vision-7b$^\heartsuit$ & $18.8$ & $63.9$ & $7.1$ & $58.8$ & $23.4$ & $67.7$ & 49.5 & 43.4 & 74.4 \\
Prometheus-Vision-13b$^\heartsuit$ & $11.8$ & $64.3$ & $3.6$ & $71.4$ & $8.7$ & $67.9$ & 66.3 & 46.3 & 76.8 \\
% Qwen-VL-Chat$^\spadesuit$ & $31.1$ & $31.6$ & $6.8$ & $7.1$ & $5.7$ & $7.1$ & 71.9 & 62.8 & 86.2 \\
% Internvl-chat-v1-5$^\spadesuit$ & $75.8$ & $77.6$ & $5.9$ & $6.0$ & $91.8$ & $92.7$ & 25.4 & 69.6 & 84.3 \\
% Idefics2-8b$^\spadesuit$ & $32.6$ & $43.5$ & $13.7$ & $52.0$ & $49.0$ & $74.7$ & 42.1 & 58.7 & 79.4 \\
% NOTE(review): some rows below report a lower "w/o Tie" than "w/ Tie" average
% (e.g. Qwen-VL-Chat alignment 52.1 vs 31.6, Internvl artifact 66.3 vs 65.1,
% Claude 3 Opus alignment 57.1 vs 55.9) -- possibly unintended; please verify.
Qwen-VL-Chat$^\spadesuit$ & $52.1$ & $31.6$ & $26.8$ & $7.1$ & $23.6$ & $24.6$ & 71.9 & 62.8 & 86.2 \\
Internvl-chat-v1-5$^\spadesuit$ & $55.3$ & $67.6$ & $6.3$ & $60.0$ & $66.3$ & $65.1$ & 25.4 & 69.6 & 84.3 \\
Idefics2-8b$^\spadesuit$ & $32.6$ & $43.5$ & $13.6$ & $52.0$ & $46.1$ & $68.9$ & 42.1 & 58.7 & 79.4 \\
\midrule
GPT-4-vision$^\clubsuit$ & $66.1$ & $67.0$ & $26.5$ & $97.6$ & $90.4$ & $96.5$ & 79.0 & 80.4 & \textbf{93.2} \\
GPT-4o$^\clubsuit$ & $61.5$ & $62.5$ & $35.3$ & \textbf{100.0} & \textbf{97.6} & \textbf{98.7} & 65.8 & 82.5 & 92.8 \\
Gemini Ultra$^\clubsuit$ & \textbf{67.2} & $69.0$ & $13.1$ & $95.1$ & $55.7$ & $96.7$ & 55.6 & 75.3 & 88.6 \\
Claude 3 Opus$^\clubsuit$ & $57.1$ & $55.9$ & $13.4$ & $78.9$ & $11.9$ & $70.4$ & 57.7 & 65.6 & 85.0 \\
% \midrule
% Random & 33.3 & 50.0 & 33.3 & 50.0 & 33.3 & 50.0 & 33.3 & 50.0 & 50.0 \\
\bottomrule
\end{tabular}%
}%
% The negative skip sits outside \resizebox: inside the box it added a stray
% trailing space to the scaled content and did not affect vertical spacing.
\vspace{-0.2cm}%
\label{exp:main_result}
\end{table}