% Table: Normalized Dispersion Score (NDS) of every multimodal judge on the
% bias perspective, numerical feedback in [0, 10]. One column per demographic
% dimension plus a highlighted average column.
% Uses \toprule/\midrule/\bottomrule, \cellcolor with the colour `skyblue`,
% and \resizebox, all of which must be provided by the preamble.
\begin{table}[t]
  \centering
  \caption{The detailed evaluation results in terms of Normalized Dispersion Score (NDS) for all multimodal judges on the \textbf{bias} perspective. The feedback is provided on a numerical scale with range [0, 10]. Specifically, we separately report the bias w.r.t.\ different demographic identifications, i.e., age, gender, race, nationality, and religion. The best performance across all models is bolded.}
  % The trailing % after the opening brace suppresses the end-of-line space so
  % the tabular is scaled to exactly \linewidth. It MUST be followed by a line
  % break: on a single physical line it would comment out the whole tabular.
  \resizebox{1.0\linewidth}{!}{%
    \begin{tabular}{c|cccccc}
      \toprule
      % Disabled grouped header, kept for reference:
      % & \multicolumn{6}{c}{\bf Occupation} & \multicolumn{4}{c}{\bf Education} \\
      & Age & Gender & Race & Nationality & Religion & \cellcolor{skyblue}Avg \\
      \midrule
      % NOTE(review): in every row below the Avg value is identical to the
      % Gender value and is not the arithmetic mean of the five columns
      % (e.g. CLIP-v1: mean is 75.88, Avg is 75.2) -- confirm how Avg is computed.
      CLIP-v1$^\diamondsuit$      & 73.6 & 75.2 & 73.1 & 79.1 & 78.4 & \cellcolor{skyblue} 75.2 \\
      % NOTE(review): BLIP-v2 ties the bolded best Age score (85.3) but is not
      % bolded itself -- confirm whether ties should be highlighted.
      BLIP-v2$^\diamondsuit$      & 85.3 & 83.6 & 82.7 & 81.8 & 87.5 & \cellcolor{skyblue} 83.6 \\
      PickScore-v1$^\diamondsuit$ & 65.3 & 66.7 & 66.4 & 67.3 & 69.4 & \cellcolor{skyblue} 66.7 \\
      HPS-v2.1$^\diamondsuit$     & 75.8 & 78.2 & 79.5 & 78.6 & 79.3 & \cellcolor{skyblue} 78.2 \\
      ImageReward$^\diamondsuit$  & 73.9 & 73.2 & 70.9 & 73.0 & 80.2 & \cellcolor{skyblue} 73.2 \\
      Aesthetics$^\diamondsuit$   & \textbf{85.3} & \textbf{85.9} & \textbf{86.3} & \textbf{85.8} & 86.2 & \cellcolor{skyblue} \textbf{85.9} \\
      \midrule
      LLaVA-1.5-7b$^\heartsuit$          & 67.6 & 71.4 & 75.8 & 68.4 & 77.3 & \cellcolor{skyblue} 71.4 \\
      LLaVA-1.5-13b$^\heartsuit$         & 71.9 & 74.8 & 76.6 & 74.0 & 80.6 & \cellcolor{skyblue} 74.8 \\
      LLaVA-NeXT-mistral-7b$^\heartsuit$ & 68.4 & 64.6 & 62.4 & 59.7 & 78.1 & \cellcolor{skyblue} 64.6 \\
      LLaVA-NeXT-vicuna-7b$^\heartsuit$  & 63.2 & 64.1 & 62.5 & 63.8 & 74.2 & \cellcolor{skyblue} 64.1 \\
      Instructblip-7b$^\heartsuit$       & 80.8 & 80.6 & 80.3 & 79.0 & 85.4 & \cellcolor{skyblue} 80.6 \\
      MiniGPT4-v2$^\heartsuit$           & 68.1 & 67.2 & 66.2 & 67.0 & 69.3 & \cellcolor{skyblue} 67.2 \\
      Prometheus-Vision-7b$^\heartsuit$  & 47.2 & 42.5 & 37.8 & 40.0 & 54.2 & \cellcolor{skyblue} 42.5 \\
      Prometheus-Vision-13b$^\heartsuit$ & 54.2 & 44.7 & 36.0 & 39.3 & 65.7 & \cellcolor{skyblue} 44.7 \\
      Qwen-VL-Chat$^\spadesuit$          & 62.4 & 62.3 & 62.3 & 63.1 & 58.9 & \cellcolor{skyblue} 62.3 \\
      Internvl-chat-v1-5$^\spadesuit$    & 74.0 & 74.1 & 73.6 & 73.9 & 76.6 & \cellcolor{skyblue} 74.1 \\
      Idefics2-8b$^\spadesuit$           & 55.1 & 59.2 & 61.7 & 62.8 & 51.0 & \cellcolor{skyblue} 59.2 \\
      \midrule
      GPT-4-vision$^\clubsuit$  & 81.2 & 80.2 & 77.6 & 79.9 & \textbf{88.2} & \cellcolor{skyblue} 80.2 \\
      GPT-4o$^\clubsuit$        & 81.2 & 82.7 & 82.8 & 83.2 & 86.1 & \cellcolor{skyblue} 82.7 \\
      Gemini Ultra$^\clubsuit$  & 72.6 & 75.8 & 78.4 & 77.0 & 72.3 & \cellcolor{skyblue} 75.8 \\
      Claude 3 Opus$^\clubsuit$ & 63.3 & 66.1 & 67.5 & 66.9 & 66.8 & \cellcolor{skyblue} 66.1 \\
      \bottomrule
    \end{tabular}%
  }
  % The label follows the caption so that \ref resolves to this table's number.
  \label{exp:bias_nds}
\end{table}