Leaderboard

Running

App Files Files Community

Leaderboard / evals /mjbench /latex_reults /scale_study.tex

yichao

update mj-bench

b650828 4 months ago

raw

history blame

4.33 kB

	% Scale study: one row per multimodal judge, comparing four numerical score
	% ranges with two Likert scales. Uses \toprule/\midrule/\bottomrule,
	% \resizebox and \cellcolor{skyblue}; the packages and the colour definition
	% are presumably provided by the including document's preamble -- confirm.
	\begin{table}[t]
	\centering
	\small
	\caption{Performance comparison of multimodal judges w.r.t.\ different ranges of numerical scale and Likert range. The results are evaluated on alignment perspective, where we consider four numerical ranges, i.e., [0, 1], [0, 5], [0, 10], [0, 100]. The best performance across all models is bolded.}
	% Label sits directly after the caption (outside the scaled box) so it
	% reliably picks up the table number.
	\label{exp:scale_study}
	\resizebox{0.7\linewidth}{!}{%
	\begin{tabular}{l|cccc|cc}
	\toprule
	& \multicolumn{4}{c|}{\textbf{Numerical}} & \multicolumn{2}{c}{\textbf{Likert}} \\
	& [0, 1] & [0, 5] & [0, 10] & [0, 100] & 5-likert & 10-likert \\
	\midrule
	% Bold marks the per-column maximum over all rows, as the caption states.
	LLaVA-1.5-7b$^\heartsuit$ & $15.0$ & $26.7$ & $22.0$ & $18.3$ & $5.3$ & $10.3$ \\
	LLaVA-1.5-13b$^\heartsuit$ & $9.7$ & $12.0$ & $10.3$ & $20.5$ & $2.6$ & $6.8$ \\
	LLaVA-NeXT-mistral-7b$^\heartsuit$ & $20.8$ & $27.1$ & $31.3$ & $29.3$ & $36.0$ & $38.6$ \\
	LLaVA-NeXT-vicuna-13b$^\heartsuit$ & $18.3$ & $26.7$ & $29.1$ & $17.2$ & $28.7$ & $17.2$ \\
	Instructblip-7b$^\heartsuit$ & $15.0$ & $20.9$ & $17.1$ & $17.6$ & $11.9$ & $16.8$ \\
	MiniGPT4-v2$^\heartsuit$ & $20.4$ & $28.9$ & $32.8$ & $20.9$ & $16.0$ & $28.7$ \\
	Prometheus-Vision-7b$^\heartsuit$ & $3.8$ & $16.7$ & $18.4$ & $15.7$ & $28.7$ & $31.3$ \\
	Prometheus-Vision-13b$^\heartsuit$ & $19.7$ & $11.5$ & $11.8$ & $11.2$ & $11.0$ & $6.9$ \\
	\midrule
	Qwen-VL-Chat$^\spadesuit$ & $26.7$ & $34.6$ & $31.1$ & $26.9$ & $55.5$ & $30.6$ \\
	% 75.8 and 73.3 are the highest values in their columns, hence bold here.
	Internvl-chat-v1-5$^\spadesuit$ & $33.0$ & $27.6$ & $\textbf{75.8}$ & $35.3$ & $\textbf{73.3}$ & $18.9$ \\
	Idefics2-8b$^\spadesuit$ & $14.6$ & $16.6$ & $32.6$ & $32.6$ & $41.2$ & $25.6$ \\
	\midrule
	GPT-4-vision$^\clubsuit$ & $63.2$ & $61.2$ & $66.1$ & $\textbf{67.2}$ & $60.2$ & $\textbf{63.0}$ \\
	GPT-4o$^\clubsuit$ & $\textbf{63.9}$ & $61.3$ & $61.5$ & $62.8$ & $56.3$ & $60.3$ \\
	Gemini Ultra$^\clubsuit$ & $59.3$ & $\textbf{67.3}$ & $67.2$ & $60.1$ & $51.4$ & $57.8$ \\
	Claude 3 Opus$^\clubsuit$ & $60.7$ & $45.5$ & $57.1$ & $49.4$ & $56.1$ & $62.4$ \\
	\midrule
	% Overall = column mean over the 15 rows above, reported to one decimal.
	% NOTE(review): the [0, 1] entries above average to 29.6, not 30.3 --
	% confirm against the raw results whether a row value or this cell is off.
	\cellcolor{skyblue}Overall & \cellcolor{skyblue}30.3 & \cellcolor{skyblue}32.3 & \cellcolor{skyblue}37.6 & \cellcolor{skyblue}32.3 & \cellcolor{skyblue}35.6 & \cellcolor{skyblue}31.7 \\
	\bottomrule
	\end{tabular}%
	}
	\vspace{-1em}
	\end{table}

	% \begin{table}[t]
	% \centering
	% \caption{Performance comparison of these multimodal judges w.r.t. different ranges of numerical scale. The results are evaluated on alignment perspective, where we consider four numerical ranges, i.e. [0, 1], [0, 5], [0, 10], and [0, 100]. The best performance across all models is bolded.}
	% \resizebox{0.7\linewidth}{!}{%
	% \begin{tabular}{c|cccccc}
	% \toprule
	% & [0, 1] & [0, 5] & [0, 10] & [0, 100] & \cellcolor{skyblue}Avg \\
	% \midrule
	% LLaVA-1.5-7b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} \\
	% LLaVA-1.5-13b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} \\
	% LLaVA-NeXT-mistral-7b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} \\
	% LLaVA-NeXT-vicuna-13b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} \\
	% Instructblip-7b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} - \\
	% MiniGPT4-v2$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} - \\
	% Qwen-VL-Chat$^\spadesuit$ & - & - & - & - & \cellcolor{skyblue} - \\
	% Internvl-chat-v1-5$^\spadesuit$ & - & - & - & - & \cellcolor{skyblue} - \\
	% Idefics2-8b$^\spadesuit$ & - & - & - & - & \cellcolor{skyblue} - \\
	% Prometheus-Vision-13b$^\spadesuit$ & - & - & - & - & \cellcolor{skyblue} - \\
	% \midrule
	% GPT-4-vision$^\clubsuit$ & - & - & - & - & \cellcolor{skyblue} - \\
	% GPT-4o$^\clubsuit$ & - & - & - & - & \cellcolor{skyblue} - \\
	% Gemini Ultra$^\clubsuit$ & - & - & - & - & \cellcolor{skyblue} - \\
	% Claude 3 Opus$^\clubsuit$ & - & - & - & - & \cellcolor{skyblue} - \\
	% \bottomrule
	% \end{tabular}}
	% \label{exp:scale_study}
	% \end{table}