% Scale study table: alignment-perspective results of each multimodal judge
% when asked to score on four numerical ranges and two Likert scales.
% Row groups are separated by \midrule and tagged with a suit superscript
% (\heartsuit / \spadesuit / \clubsuit); the legend for these symbols is not
% defined in this table -- presumably explained in the main text (TODO confirm).
% Requires booktabs (\toprule etc.), colortbl/xcolor (\cellcolor) and a
% `skyblue` colour defined elsewhere in the project.
\begin{table}[t]
  \centering
  \small
  \caption{Performance comparison of multimodal judges w.r.t.\ different ranges of numerical scale and Likert scale. The results are evaluated on the alignment perspective, where we consider four numerical ranges, i.e., [0, 1], [0, 5], [0, 10], and [0, 100], and two Likert scales, i.e., 5-likert and 10-likert. The best performance across all models is bolded.}
  % Label placed directly after the caption (not inside the \resizebox box).
  \label{exp:scale_study}
  \resizebox{0.7\linewidth}{!}{%
    \begin{tabular}{l|cccc|cc}
      \toprule
      & \multicolumn{4}{c|}{\textbf{Numerical}} & \multicolumn{2}{c}{\textbf{Likert}} \\
      & [0, 1] & [0, 5] & [0, 10] & [0, 100] & 5-likert & 10-likert \\
      \midrule
      LLaVA-1.5-7b$^\heartsuit$          & $15.0$ & $26.7$ & $22.0$ & $18.3$ & $ 5.3$ & $10.3$ \\
      LLaVA-1.5-13b$^\heartsuit$         & $ 9.7$ & $12.0$ & $10.3$ & $20.5$ & $ 2.6$ & $ 6.8$ \\
      LLaVA-NeXT-mistral-7b$^\heartsuit$ & $20.8$ & $27.1$ & $31.3$ & $29.3$ & $36.0$ & $38.6$ \\
      LLaVA-NeXT-vicuna-13b$^\heartsuit$ & $18.3$ & $26.7$ & $29.1$ & $17.2$ & $28.7$ & $17.2$ \\
      Instructblip-7b$^\heartsuit$       & $15.0$ & $20.9$ & $17.1$ & $17.6$ & $11.9$ & $16.8$ \\
      MiniGPT4-v2$^\heartsuit$           & $20.4$ & $28.9$ & $32.8$ & $20.9$ & $16.0$ & $28.7$ \\
      Prometheus-Vision-7b$^\heartsuit$  & $ 3.8$ & $16.7$ & $18.4$ & $15.7$ & $28.7$ & $31.3$ \\
      Prometheus-Vision-13b$^\heartsuit$ & $19.7$ & $11.5$ & $11.8$ & $11.2$ & $11.0$ & $ 6.9$ \\
      \midrule
      Qwen-VL-Chat$^\spadesuit$          & $26.7$ & $34.6$ & $31.1$ & $26.9$ & $55.5$ & $30.6$ \\
      % NOTE(review): 75.8 ([0, 10]) and 73.3 (5-likert) below are the column
      % maxima, yet the bolded entries in those columns are 67.2 and 60.2.
      % Either these two values or the bolding/caption need to be reconciled.
      Internvl-chat-v1-5$^\spadesuit$    & $33.0$ & $27.6$ & $75.8$ & $35.3$ & $73.3$ & $18.9$ \\
      Idefics2-8b$^\spadesuit$           & $14.6$ & $16.6$ & $32.6$ & $32.6$ & $41.2$ & $25.6$ \\
      \midrule
      GPT-4-vision$^\clubsuit$           & $63.2$ & $61.2$ & $66.1$ & \textbf{67.2} & \textbf{60.2} & \textbf{63.0} \\
      GPT-4o$^\clubsuit$                 & \textbf{63.9} & $61.3$ & $61.5$ & $62.8$ & $56.3$ & $60.3$ \\
      Gemini Ultra$^\clubsuit$           & $59.3$ & \textbf{67.3} & \textbf{67.2} & $60.1$ & $51.4$ & $57.8$ \\
      Claude 3 Opus$^\clubsuit$          & $60.7$ & $45.5$ & $57.1$ & $49.4$ & $56.1$ & $62.4$ \\
      \midrule
      % "Overall" equals the column-wise mean of the 15 rows above for every
      % column except [0, 1], where the rows average to 29.6 rather than 30.3.
      % NOTE(review): verify the [0, 1] entries / mean against the raw results.
      % The [0, 100] mean is 32.33..., reported as 32.3 to match the one-decimal
      % precision used everywhere else in the table.
      \cellcolor{skyblue}Overall & \cellcolor{skyblue}30.3 & \cellcolor{skyblue}32.3 & \cellcolor{skyblue}37.6 & \cellcolor{skyblue}32.3 & \cellcolor{skyblue}35.6 & \cellcolor{skyblue}31.7 \\
      \bottomrule
    \end{tabular}%
  }
  \vspace{-1em}
\end{table}

% ---------------------------------------------------------------------------
% Commented-out earlier layout of this table (numerical ranges only, with
% placeholder cells). It reuses the label exp:scale_study, so it must not be
% re-enabled alongside the table above without renaming the label.
% ---------------------------------------------------------------------------
% \begin{table}[t]
%   \centering
%   \caption{Performance comparison of these multimodal judges w.r.t. different ranges of numerical scale. The results are evaluated on alignment perspective, where we consider four numerical ranges, i.e. [0, 1], [0, 5], [0, 10], and [0, 100]. The best performance across all models is bolded.}
%   \resizebox{0.7\linewidth}{!}{%
%   \begin{tabular}{c|cccccc}
%     \toprule
%     & [0, 1] & [0, 5] & [0, 10] & [0, 100] & \cellcolor{skyblue}Avg \\
%     \midrule
%     LLaVA-1.5-7b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} \\
%     LLaVA-1.5-13b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} \\
%     LLaVA-NeXT-mistral-7b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} \\
%     LLaVA-NeXT-vicuna-13b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} \\
%     Instructblip-7b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} - \\
%     MiniGPT4-v2$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} - \\
%     Qwen-VL-Chat$^\spadesuit$ & - & - & - & - & \cellcolor{skyblue} - \\
%     Internvl-chat-v1-5$^\spadesuit$ & - & - & - & - & \cellcolor{skyblue} - \\
%     Idefics2-8b$^\spadesuit$ & - & - & - & - & \cellcolor{skyblue} - \\
%     Prometheus-Vision-13b$^\spadesuit$ & - & - & - & - & \cellcolor{skyblue} - \\
%     \midrule
%     GPT-4-vision$^\clubsuit$ & - & - & - & - & \cellcolor{skyblue} - \\
%     GPT-4o$^\clubsuit$ & - & - & - & - & \cellcolor{skyblue} - \\
%     Gemini Ultra$^\clubsuit$ & - & - & - & - & \cellcolor{skyblue} - \\
%     Claude 3 Opus$^\clubsuit$ & - & - & - & - & \cellcolor{skyblue} - \\
%     \bottomrule
%   \end{tabular}}
%   \label{exp:scale_study}
% \end{table}