% yichao's picture
% update mj-bench
% b650828
% raw
% history blame
% 2.97 kB
% Results table: Gini-based Equality Score (GES) of each multimodal judge on the
% bias perspective (numeric feedback scale), one column per demographic
% dimension plus a highlighted average column.
%
% Uses \toprule/\midrule/\bottomrule, \resizebox and \cellcolor, so the preamble
% is expected to load booktabs, graphicx and colortbl (or xcolor with the
% `table' option). The colour name `skyblue' is not defined in this file and
% must be defined by the including document.
% NOTE(review): the suit-symbol superscripts (\diamondsuit, \heartsuit,
% \spadesuit, \clubsuit) presumably mark model categories; their legend is not
% given here -- confirm it is explained in the main text.
\begin{table}[t]
    \centering
    \caption{Detailed evaluation results in terms of the Gini-based Equality Score (GES) for all multimodal judges from the \textbf{bias} perspective. The feedback is provided on a numerical scale in the range [0, 10]. Specifically, we separately report the bias w.r.t.\ different demographic identifications, i.e., age, gender, race, nationality, and religion. The best performance across all models is bolded.}
    % Scale the whole tabular to the line width; the trailing % signs keep
    % stray spaces out of the \resizebox argument.
    \resizebox{1.0\linewidth}{!}{%
    \begin{tabular}{c|cccccc}
        \toprule
        & Age & Gender & Race & Nationality & Religion & \cellcolor{skyblue}Avg \\
        \midrule
        CLIP-v1$^\diamondsuit$ & 73.6 & 75.2 & 73.1 & 79.1 & 78.4 & \cellcolor{skyblue} 75.2 \\
        BLIP-v2$^\diamondsuit$ & 92.2 & 91.3 & 90.7 & 90.4 & 93.1 & \cellcolor{skyblue} 91.3 \\
        PickScore-v1$^\diamondsuit$ & 80.5 & 81.2 & 81.0 & 81.6 & 82.6 & \cellcolor{skyblue} 81.2 \\
        HPS-v2.1$^\diamondsuit$ & 86.4 & 87.8 & 88.5 & 88.0 & 88.5 & \cellcolor{skyblue} 87.8 \\
        ImageReward$^\diamondsuit$ & 85.5 & 85.0 & 83.6 & 84.8 & 89.0 & \cellcolor{skyblue} 85.0 \\
        Aesthetics$^\diamondsuit$ & 91.9 & 92.1 & 92.4 & 92.1 & 92.3 & \cellcolor{skyblue} 92.1 \\
        \midrule
        LLaVA-1.5-7b$^\heartsuit$ & 87.4 & 88.9 & 90.1 & 88.7 & 90.7 & \cellcolor{skyblue} 88.9 \\
        LLaVA-1.5-13b$^\heartsuit$ & 87.5 & 88.8 & 88.9 & 89.5 & 90.1 & \cellcolor{skyblue} 88.8 \\
        LLaVA-NeXT-mistral-7b$^\heartsuit$ & 86.4 & 85.8 & 85.8 & 84.1 & 90.2 & \cellcolor{skyblue} 85.8 \\
        LLaVA-NeXT-vicuna-7b$^\heartsuit$ & 82.1 & 82.8 & 82.4 & 82.5 & 87.8 & \cellcolor{skyblue} 82.8 \\
        Instructblip-7b$^\heartsuit$ & 91.0 & 91.2 & 91.1 & 90.4 & 93.8 & \cellcolor{skyblue} 91.1 \\
        MiniGPT4-v2$^\heartsuit$ & 83.7 & 83.3 & 82.8 & 83.4 & 84.1 & \cellcolor{skyblue} 83.3 \\
        Prometheus-Vision-7b$^\heartsuit$ & 74.9 & 74.3 & 73.1 & 74.2 & 77.3 & \cellcolor{skyblue} 74.3 \\
        Prometheus-Vision-13b$^\heartsuit$ & 79.2 & 76.0 & 72.7 & 74.1 & 85.1 & \cellcolor{skyblue} 76.0 \\
        Qwen-VL-Chat$^\spadesuit$ & 85.9 & 86.0 & 86.0 & 86.4 & 83.8 & \cellcolor{skyblue} 85.9 \\
        Internvl-chat-v1-5$^\spadesuit$ & 86.9 & 87.2 & 87.1 & 87.3 & 88.0 & \cellcolor{skyblue} 87.2 \\
        Idefics2-8b$^\spadesuit$ & 77.0 & 79.7 & 81.3 & 82.0 & 74.4 & \cellcolor{skyblue} 79.8 \\
        \midrule
        % Bold marks the highest value in each column (see caption).
        GPT-4-vision$^\clubsuit$ & \textbf{93.0} & \textbf{93.2} & 92.2 & \textbf{93.4} & \textbf{96.4} & \cellcolor{skyblue} \textbf{93.2} \\
        GPT-4o$^\clubsuit$ & 91.8 & 92.9 & \textbf{93.1} & 93.3 & 94.4 & \cellcolor{skyblue} 92.9 \\
        Gemini Ultra$^\clubsuit$ & 86.6 & 89.0 & 90.8 & 90.0 & 86.2 & \cellcolor{skyblue} 89.0 \\
        Claude 3 Opus$^\clubsuit$ & 83.2 & 85.2 & 86.5 & 85.8 & 84.8 & \cellcolor{skyblue} 85.2 \\
        \bottomrule
    \end{tabular}%
    }
    % Label placed after \caption so \ref resolves to the table number.
    \label{exp:bias_ges}
\end{table}