yichao's picture
update mj-bench
b650828
raw
history blame
4.33 kB
\begin{table}[t]
\centering
\small
\caption{Performance comparison of multimodal judges w.r.t. different ranges of numerical scale and likert range. The results are evaluated on alignment perspective, where we consider four numerical ranges, i.e. [0, 1], [0, 5], [0, 10], [0, 100]. The best performance across all models is bolded.}
\resizebox{0.7\linewidth}{!}{%
\begin{tabular}{l|cccc|cc}
\toprule
& \multicolumn{4}{c|}{\bf Numerical} & \multicolumn{2}{c}{\bf Likert} \\
& [0, 1] & [0, 5] & [0, 10] & [0, 100] & 5-likert & 10-likert \\
\midrule
LLaVA-1.5-7b$^\heartsuit$ & $15.0$ & $26.7$ & $22.0$ & $18.3$ & $ 5.3$ & $10.3$ \\
LLaVA-1.5-13b$^\heartsuit$ & $ 9.7$ & $12.0$ & $10.3$ & $20.5$ & $ 2.6$ & $ 6.8$ \\
LLaVA-NeXT-mistral-7b$^\heartsuit$ & $20.8$ & $27.1$ & $31.3$ & $29.3$ & $36.0$ & $38.6$ \\
LLaVA-NeXT-vicuna-13b$^\heartsuit$ & $18.3$ & $26.7$ & $29.1$ & $17.2$ & $28.7$ & $17.2$ \\
Instructblip-7b$^\heartsuit$ & $15.0$ & $20.9$ & $17.1$ & $17.6$ & $11.9$ & $16.8$ \\
MiniGPT4-v2$^\heartsuit$ & $20.4$ & $28.9$ & $32.8$ & $20.9$ & $16.0$ & $28.7$ \\
Prometheus-Vision-7b$^\heartsuit$ & $3.8 $ & $16.7$ & $18.4$ & $15.7$ & $28.7$ & $31.3$ \\
Prometheus-Vision-13b$^\heartsuit$ & $19.7$ & $11.5$ & $11.8$ & $11.2$ & $11.0$ & $6.9$ \\
\midrule
Qwen-VL-Chat$^\spadesuit$ & $26.7$ & $34.6$ & $31.1$ & $26.9$ & $55.5$ & $30.6$ \\
Internvl-chat-v1-5$^\spadesuit$ & $33.0$ & $27.6$ & $75.8$ & $35.3$ & $73.3$ & $18.9$ \\
Idefics2-8b$^\spadesuit$ & $14.6$ & $16.6$ & $32.6$ & $32.6$ & $41.2$ & $25.6$ \\
\midrule
GPT-4-vision$^\clubsuit$ & $63.2$ & $61.2$ & $66.1$ & \bf 67.2 & $\textbf{60.2}$ & $\textbf{63.0}$ \\
GPT-4o$^\clubsuit$ & \bf 63.9 & $61.3$ & $61.5$ & $62.8$ & $56.3$ & $60.3$ \\
Gemini Ultra$^\clubsuit$ & $59.3$ & $\textbf{67.3}$ & \bf 67.2 & $60.1$ & $51.4$ & $57.8$ \\
Claude 3 Opus$^\clubsuit$ & $60.7$ & $45.5$ & $57.1$ & $49.4$ & $56.1$ & $62.4$ \\
\midrule
\cellcolor{skyblue} Overall & \cellcolor{skyblue}30.3 & \cellcolor{skyblue}32.3 & \cellcolor{skyblue} 37.6 & \cellcolor{skyblue}32.33 & \cellcolor{skyblue}35.6 & \cellcolor{skyblue}31.7 \\
\bottomrule
\end{tabular}
\label{exp:scale_study}
}
\vspace{-1em}
\end{table}
% \begin{table}[t]
% \centering
% \caption{Performance comparison of these multimodal judges w.r.t. different ranges of numerical scale. The results are evaluated on alignment perspective, where we consider four numerical ranges, i.e. [0, 1], [0, 5], [0, 10], and [0, 100]. The best performance across all models is bolded.}
% \resizebox{0.7\linewidth}{!}{%
% \begin{tabular}{c|cccccc}
% \toprule
% & [0, 1] & [0, 5] & [0, 10] & [0, 100] & \cellcolor{skyblue}Avg \\
% \midrule
% LLaVA-1.5-7b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} \\
% LLaVA-1.5-13b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} \\
% LLaVA-NeXT-mistral-7b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} \\
% LLaVA-NeXT-vicuna-13b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} \\
% Instructblip-7b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} - \\
% MiniGPT4-v2$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} - \\
% Qwen-VL-Chat$^\spadesuit$ & - & - & - & - & \cellcolor{skyblue} - \\
% Internvl-chat-v1-5$^\spadesuit$ & - & - & - & - & \cellcolor{skyblue} - \\
% Idefics2-8b$^\spadesuit$ & - & - & - & - & \cellcolor{skyblue} - \\
% Prometheus-Vision-13b$^\spadesuit$ & - & - & - & - & \cellcolor{skyblue} - \\
% \midrule
% GPT-4-vision$^\clubsuit$ & - & - & - & - & \cellcolor{skyblue} - \\
% GPT-4o$^\clubsuit$ & - & - & - & - & \cellcolor{skyblue} - \\
% Gemini Ultra$^\clubsuit$ & - & - & - & - & \cellcolor{skyblue} - \\
% Claude 3 Opus$^\clubsuit$ & - & - & - & - & \cellcolor{skyblue} - \\
% \bottomrule
% \end{tabular}}
% \label{exp:scale_study}
% \end{table}