\begin{table}[t] \centering \caption{The detailed evaluation result in terms of Gini-based Equality Score (GES) for all multimodal judges on \textbf{bias} perspective. The feedback is provided in numerical scale with range [0, 10]. Specifically, we separately report the bias w.r.t. different demographic identifications, i.e. age, gender, race, nationality, and religion. The best performance across all models is bolded.} \resizebox{1.0\linewidth}{!}{% \begin{tabular}{c|cccccc} \toprule % & \multicolumn{6}{c}{\bf Occupation} & \multicolumn{4}{c}{\bf Education} \\ & Age & Gender & Race & Nationality & Religion & \cellcolor{skyblue}Avg \\ \midrule CLIP-v1$^\diamondsuit$ & 73.6 & 75.2 & 73.1 & 79.1 & 78.4 & \cellcolor{skyblue} 75.2 \\ BLIP-v2$^\diamondsuit$ & 92.2 & 91.3 & 90.7 & 90.4 & 93.1 & \cellcolor{skyblue} 91.3 \\ PickScore-v1$^\diamondsuit$ & 80.5 & 81.2 & 81.0 & 81.6 & 82.6 & \cellcolor{skyblue} 81.2 \\ HPS-v2.1$^\diamondsuit$ & 86.4 & 87.8 & 88.5 & 88.0 & 88.5 & \cellcolor{skyblue} 87.8 \\ ImageReward$^\diamondsuit$ & 85.5 & 85.0 & 83.6 & 84.8 & 89.0 & \cellcolor{skyblue} 85.0 \\ Aesthetics$^\diamondsuit$ & 91.9 & 92.1 & 92.4 & 92.1 & 92.3 & \cellcolor{skyblue} 92.1 \\ \midrule LLaVA-1.5-7b$^\heartsuit$ & 87.4 & 88.9 & 90.1 & 88.7 & 90.7 & \cellcolor{skyblue} 88.9 \\ LLaVA-1.5-13b$^\heartsuit$ & 87.5 & 88.8 & 88.9 & 89.5 & 90.1 & \cellcolor{skyblue} 88.8 \\ LLaVA-NeXT-mistral-7b$^\heartsuit$ & 86.4 & 85.8 & 85.8 & 84.1 & 90.2 & \cellcolor{skyblue} 85.8 \\ LLaVA-NeXT-vicuna-7b$^\heartsuit$ & 82.1 & 82.8 & 82.4 & 82.5 & 87.8 & \cellcolor{skyblue} 82.8\\ Instructblip-7b$^\heartsuit$ & 91.0 & 91.2 & 91.1 & 90.4 & 93.8 & \cellcolor{skyblue} 91.1 \\ MiniGPT4-v2$^\heartsuit$ & 83.7 & 83.3 & 82.8 & 83.4 & 84.1 & \cellcolor{skyblue} 83.3 \\ Prometheus-Vision-7b$^\heartsuit$ & 74.9 & 74.3 & 73.1 & 74.2 & 77.3 & \cellcolor{skyblue} 74.3 \\ Prometheus-Vision-13b$^\heartsuit$ & 79.2 & 76.0 & 72.7 & 74.1 & 85.1 & \cellcolor{skyblue} 76.0 \\ Qwen-VL-Chat$^\spadesuit$ & 85.9 & 86.0 & 86.0 & 86.4 & 83.8 & \cellcolor{skyblue} 85.9 \\ Internvl-chat-v1-5$^\spadesuit$ & 86.9 & 87.2 & 87.1 & 87.3 & 88.0 & \cellcolor{skyblue} 87.2 \\ Idefics2-8b$^\spadesuit$ & 77.0 & 79.7 & 81.3 & 82.0 & 74.4 & \cellcolor{skyblue} 79.8 \\ \midrule GPT-4-vision$^\clubsuit$ & \bf 93.0 & \bf 93.2 & 92.2 & \bf 93.4 & \bf 96.4 & \cellcolor{skyblue} \bf 93.2 \\ GPT-4o$^\clubsuit$ & 91.8 & 92.9 & \bf 93.1 & 93.3 & 94.4 & \cellcolor{skyblue} 92.9 \\ Gemini Ultra$^\clubsuit$ & 86.6 & 89.0 & 90.8 & 90.0 & 86.2 & \cellcolor{skyblue} 89.0 \\ Claude 3 Opus$^\clubsuit$ & 83.2 & 85.2 & 86.5 & 85.8 & 84.8 & \cellcolor{skyblue} 85.2 \\ \bottomrule \end{tabular}% } \label{exp:bias_ges} \end{table}