\begin{table}[h]
\centering
\caption{Detailed evaluation results of all multimodal judges on the \textbf{alignment} perspective. Feedback is provided on the following Likert scale: [\textit{Extremely Poor}, \textit{Poor}, \textit{Average}, \textit{Good}, \textit{Outstanding}]. Specifically, we study their individual performance over five alignment objectives: object (existence), attribute, action, location, and count. The best performance across all models is bolded.}
\resizebox{0.9\linewidth}{!}{%
\begin{tabular}{c|cccccc}
\toprule
 & Object & Attribute & Action & Location & Count & \cellcolor{skyblue}Avg \\
\midrule
% CLIP-v1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
% BLIP-v2$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
% PickScore-v1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
% HPS-v2.1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
% ImageReward$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
% Aesthetics$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
% \midrule
LLaVA-1.5-7b$^\heartsuit$ & $19.1$ & $17.8$ & $20.5$ & $16.9$ & $25.0$ & \cellcolor{skyblue} $19.2$ \\
LLaVA-1.5-13b$^\heartsuit$ & $22.7$ & $21.3$ & $22.2$ & $15.6$ & $17.9$ & \cellcolor{skyblue} $21.1$ \\
LLaVA-NeXT-mistral-7b$^\heartsuit$ & $19.1$ & $17.8$ & $16.2$ & $10.4$ & $12.5$ & \cellcolor{skyblue} $16.8$ \\
LLaVA-NeXT-vicuna-13b$^\heartsuit$ & $22.7$ & $21.3$ & $17.1$ & $20.8$ & $16.1$ & \cellcolor{skyblue} $20.7$ \\
Instructblip-7b$^\heartsuit$ & $22.3$ & $20.9$ & $17.1$ & $15.6$ & $7.1$ & \cellcolor{skyblue} $19.2$ \\
MiniGPT4-v2$^\heartsuit$ & $21.1$ & $27.0$ & $22.2$ & $23.4$ & $23.2$ & \cellcolor{skyblue} $23.5$ \\
Prometheus-Vision-7b$^\heartsuit$ & $21.9$ & $17.4$ & $21.4$ & $18.2$ & $5.4$ & \cellcolor{skyblue} $18.7$ \\
Prometheus-Vision-13b$^\heartsuit$ & $15.1$ & $13.9$ & $12.8$ & $11.5$ & $5.4$ & \cellcolor{skyblue} $13.3$ \\
Qwen-VL-Chat$^\spadesuit$ & $22.7$ & $22.6$ & $22.2$ & $20.8$ & $26.8$ & \cellcolor{skyblue} $22.7$ \\
Internvl-chat-v1-5$^\spadesuit$ & $19.9$ & $17.8$ & $20.5$ & $20.8$ & $26.8$ & \cellcolor{skyblue} $20.0$ \\
Idefics2-8b$^\spadesuit$ & $27.9$ & $24.8$ & $26.5$ & $27.3$ & $28.6$ & \cellcolor{skyblue} $26.7$ \\
\midrule
GPT-4-vision$^\clubsuit$ & $46.3$ & $\bf 49.7$ & $39.7$ & $48.6$ & $\bf 50.7$ & \cellcolor{skyblue} $43.1$ \\
GPT-4o$^\clubsuit$ & $\bf 46.6$ & $45.5$ & $\bf 41.9$ & $\bf 53.0$ & $50.0$ & \cellcolor{skyblue} $\bf 47.2$ \\
Gemini Ultra$^\clubsuit$ & $27.9$ & $29.4$ & $20.2$ & $35.7$ & $29.5$ & \cellcolor{skyblue} $31.9$ \\
Claude 3 Opus$^\clubsuit$ & $28.8$ & $26.3$ & $22.6$ & $35.7$ & $33.0$ & \cellcolor{skyblue} $29.8$ \\
\bottomrule
\end{tabular}}
\label{exp:alignment_narrative_5}
\end{table}