% Table: per-demographic accuracy (ACC) of multimodal judges on the bias perspective.
% Columns: judge name, five demographic dimensions, and an average (highlighted column).
% Rows are grouped by the suit-symbol superscript on each judge name; the meaning of
% each symbol is not defined in this float -- presumably explained in the main text.
% Uses \toprule/\midrule/\bottomrule (booktabs) and \cellcolor (colortbl); the color
% `skyblue' is assumed to be defined in the preamble -- TODO confirm.
% NOTE(review): the Avg value equals the Gender value in 18 of the 21 rows; confirm
% against the raw results that Avg is the intended aggregate and not a copy of Gender.
\begin{table}[t]
  \centering
  \caption{Detailed evaluation results in terms of ACC (accuracy) for all multimodal judges on the \textbf{bias} perspective. The feedback is provided on a numerical scale in the range [0, 10]. Specifically, we separately report the bias w.r.t.\ different demographic identifications, i.e., age, gender, race, nationality, and religion. The best performance across all models is bolded.}
  \resizebox{1.0\linewidth}{!}{%
    \begin{tabular}{c|cccccc}
      \toprule
      % & \multicolumn{6}{c}{\bf Occupation} & \multicolumn{4}{c}{\bf Education} \\
      & Age & Gender & Race & Nationality & Religion & \cellcolor{skyblue}Avg \\
      \midrule
      CLIP-v1$^\diamondsuit$ & 57.2 & 57.8 & 55.5 & 59.5 & 60.8 & \cellcolor{skyblue} 57.7 \\
      BLIP-v2$^\diamondsuit$ & 69.6 & 68.5 & 65.9 & 68.6 & 74.7 & \cellcolor{skyblue} 68.5 \\
      PickScore-v1$^\diamondsuit$ & 30.4 & 31.1 & 30.8 & 31.7 & 33.0 & \cellcolor{skyblue} 31.1 \\
      HPS-v2.1$^\diamondsuit$ & 52.9 & 55.3 & 55.7 & 55.0 & 62.4 & \cellcolor{skyblue} 55.3 \\
      ImageReward$^\diamondsuit$ & 41.8 & 40.4 & 36.8 & 39.5 & 52.8 & \cellcolor{skyblue} 40.4 \\
      Aesthetics$^\diamondsuit$ & 59.4 & 62.0 & 64.2 & 62.4 & 61.0 & \cellcolor{skyblue} 62.0 \\
      \midrule
      LLaVA-1.5-7b$^\heartsuit$ & \textbf{80.8} & \textbf{83.9} & \textbf{84.6} & \textbf{84.9} & \textbf{88.1} & \cellcolor{skyblue} \textbf{84.0} \\
      LLaVA-1.5-13b$^\heartsuit$ & 67.0 & 70.1 & 68.9 & 72.7 & 75.1 & \cellcolor{skyblue} 70.1 \\
      LLaVA-NeXT-mistral-7b$^\heartsuit$ & 71.8 & 70.8 & 70.8 & 67.8 & 78.3 & \cellcolor{skyblue} 70.8 \\
      LLaVA-NeXT-vicuna-7b$^\heartsuit$ & 54.3 & 56.7 & 57.0 & 56.1 & 64.8 & \cellcolor{skyblue} 56.6 \\
      Instructblip-7b$^\heartsuit$ & 52.5 & 53.6 & 53.6 & 52.0 & 61.1 & \cellcolor{skyblue} 53.6 \\
      MiniGPT4-v2$^\heartsuit$ & 31.8 & 32.2 & 31.9 & 34.1 & 28.3 & \cellcolor{skyblue} 32.2 \\
      Prometheus-Vision-7b$^\heartsuit$ & 43.8 & 50.4 & 54.4 & 53.6 & 44.9 & \cellcolor{skyblue} 50.4 \\
      Prometheus-Vision-13b$^\heartsuit$ & 65.1 & 65.8 & 63.4 & 65.7 & 77.1 & \cellcolor{skyblue} 65.8 \\
      Qwen-VL-Chat$^\spadesuit$ & 70.8 & 71.5 & 72.3 & 72.2 & 68.1 & \cellcolor{skyblue} 71.5 \\
      Internvl-chat-v1-5$^\spadesuit$ & 40.0 & 41.3 & 42.1 & 42.0 & 39.8 & \cellcolor{skyblue} 41.3 \\
      Idefics2-8b$^\spadesuit$ & 37.4 & 42.7 & 45.3 & 46.9 & 35.2 & \cellcolor{skyblue} 42.7 \\
      \midrule
      GPT-4-vision$^\clubsuit$ & 76.7 & 79.1 & 77.4 & 81.0 & 86.5 & \cellcolor{skyblue} 79.1 \\
      GPT-4o$^\clubsuit$ & 60.9 & 66.6 & 69.1 & 68.2 & 69.6 & \cellcolor{skyblue} 66.6 \\
      Gemini Ultra$^\clubsuit$ & 48.7 & 56.9 & 62.9 & 60.0 & 49.9 & \cellcolor{skyblue} 56.9 \\
      Claude 3 Opus$^\clubsuit$ & 53.9 & 58.2 & 62.1 & 59.0 & 54.0 & \cellcolor{skyblue} 58.2 \\
      \bottomrule
    \end{tabular}%
  }
  \label{exp:bias_acc}
\end{table}