\begin{table}[t]
    \centering
    \caption{The detailed evaluation result in terms of ACC (accuracy) for all multimodal judges on \textbf{bias} perspective. The feedback is provided in numerical scale with range [0, 10]. Specifically, we separately report the bias w.r.t. different demographic identifications, i.e. age, gender, race, nationality, and religion. The best performance across all models is bolded.}
    \resizebox{1.0\linewidth}{!}{%
    \begin{tabular}{c|cccccc}
    \toprule
         % & \multicolumn{6}{c}{\bf Occupation} & \multicolumn{4}{c}{\bf Education} \\
         & Age & Gender & Race & Nationality & Religion & \cellcolor{skyblue}Avg \\
         \midrule
         CLIP-v1$^\diamondsuit$ & 57.2 & 57.8 & 55.5 & 59.5 & 60.8 & \cellcolor{skyblue} 57.7  \\
         BLIP-v2$^\diamondsuit$ & 69.6 & 68.5 & 65.9 & 68.6 & 74.7 & \cellcolor{skyblue} 68.5 \\
         PickScore-v1$^\diamondsuit$ & 30.4 & 31.1 & 30.8 & 31.7 & 33.0 & \cellcolor{skyblue} 31.1 \\
         HPS-v2.1$^\diamondsuit$ & 52.9 & 55.3 & 55.7 & 55.0 & 62.4 & \cellcolor{skyblue} 55.3  \\
         ImageReward$^\diamondsuit$ & 41.8 & 40.4 & 36.8 & 39.5 & 52.8 & \cellcolor{skyblue} 40.4 \\
         Aesthetics$^\diamondsuit$ & 59.4 & 62.0 & 64.2 & 62.4 & 61.0 & \cellcolor{skyblue} 62.0  \\
         \midrule
         LLaVA-1.5-7b$^\heartsuit$ & \bf 80.8 & \bf 83.9 & \bf 84.6 & \bf 84.9 & \bf 88.1 & \cellcolor{skyblue} \bf 84.0 \\
         LLaVA-1.5-13b$^\heartsuit$ & 67.0 & 70.1 & 68.9 & 72.7 & 75.1 & \cellcolor{skyblue} 70.1 \\
         LLaVA-NeXT-mistral-7b$^\heartsuit$ & 71.8 & 70.8 & 70.8 & 67.8 & 78.3 & \cellcolor{skyblue} 70.8 \\
         LLaVA-NeXT-vicuna-7b$^\heartsuit$ & 54.3 & 56.7 & 57.0 & 56.1 & 64.8 & \cellcolor{skyblue} 56.6 \\
         Instructblip-7b$^\heartsuit$ & 52.5 & 53.6 & 53.6 & 52.0 & 61.1 & \cellcolor{skyblue} 53.6 \\
         MiniGPT4-v2$^\heartsuit$ & 31.8 & 32.2 & 31.9 & 34.1 & 28.3 & \cellcolor{skyblue} 32.2 \\
         Prometheus-Vision-7b$^\heartsuit$ & 43.8 & 50.4 & 54.4 & 53.6 & 44.9 & \cellcolor{skyblue} 50.4 \\
         Prometheus-Vision-13b$^\heartsuit$ & 65.1 & 65.8 & 63.4 & 65.7 & 77.1 & \cellcolor{skyblue} 65.8 \\
         Qwen-VL-Chat$^\spadesuit$ & 70.8 & 71.5 & 72.3 & 72.2 & 68.1 & \cellcolor{skyblue} 71.5 \\
         Internvl-chat-v1-5$^\spadesuit$ & 40.0 & 41.3 & 42.1 & 42.0 & 39.8 & \cellcolor{skyblue} 41.3 \\
         Idefics2-8b$^\spadesuit$ & 37.4 & 42.7 & 45.3 & 46.9 & 35.2 & \cellcolor{skyblue} 42.7 \\
         \midrule
         GPT-4-vision$^\clubsuit$ & 76.7 & 79.1 & 77.4 & 81.0 & 86.5 & \cellcolor{skyblue} 79.1 \\
         GPT-4o$^\clubsuit$ & 60.9 & 66.6 & 69.1 & 68.2 & 69.6 & \cellcolor{skyblue} 66.6 \\
         Gemini Ultra$^\clubsuit$ & 48.7 & 56.9 & 62.9 & 60.0 & 49.9 & \cellcolor{skyblue} 56.9 \\
         Claude 3 Opus$^\clubsuit$ & 53.9 & 58.2 & 62.1 & 59.0 & 54.0 & \cellcolor{skyblue} 58.2 \\
    \bottomrule
    \end{tabular}%
    }
    \label{exp:bias_acc}
\end{table}