\begin{table}[h]
\centering
\caption{Detailed evaluation results of all multimodal judges on the \textbf{alignment} perspective. Feedback is provided on a numerical scale in the range [0, 5]. Specifically, we study their individual performance over five alignment objectives: object (existence), attribute, action, location, and count. The best performance across all models is shown in bold.}
\resizebox{0.9\linewidth}{!}{%
\begin{tabular}{c|cccccc}
\toprule
& Object & Attribute & Action & Location & Count & \cellcolor{skyblue}Avg \\
\midrule
% CLIP-v1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
% BLIP-v2$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
% PickScore-v1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
% HPS-v2.1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
% ImageReward$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
% Aesthetics$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
% \midrule
LLaVA-1.5-7b$^\heartsuit$ & 27.1 & 25.7 & 28.2 & 26.0 & 26.8 & \cellcolor{skyblue} 26.8 \\
LLaVA-1.5-13b$^\heartsuit$ & 11.2 & 14.5 & 12.8 & 7.8 & 14.3 & \cellcolor{skyblue} 12.1 \\
LLaVA-NeXT-mistral-7b$^\heartsuit$ & 27.9 & 28.3 & 29.1 & 24.7 & 25.0 & \cellcolor{skyblue} 27.0 \\
LLaVA-NeXT-vicuna-13b$^\heartsuit$ & 28.7 & 21.3 & 31.6 & 28.6 & 26.8 & \cellcolor{skyblue} 27.4 \\
Instructblip-7b$^\heartsuit$ & 19.9 & 20.9 & 25.6 & 18.2 & 19.6 & \cellcolor{skyblue} 20.8 \\
MiniGPT4-v2$^\heartsuit$ & 27.5 & 26.1 & 32.5 & 37.7 & 26.8 & \cellcolor{skyblue} 30.1 \\
Prometheus-Vision-7b$^\heartsuit$ & 18.7 & 13.5 & 14.5 & 19.5 & 25.0 & \cellcolor{skyblue} 18.2 \\
Prometheus-Vision-13b$^\heartsuit$ & 12.4 & 11.3 & 9.4 & 11.7 & 12.5 & \cellcolor{skyblue} 11.5 \\
Qwen-VL-Chat$^\spadesuit$ & 30.3 & 34.8 & 39.3 & 40.3 & 35.7 & \cellcolor{skyblue} 36.1 \\
Internvl-chat-v1-5$^\spadesuit$ & 24.7 & 28.7 & 25.6 & 29.9 & 37.5 & \cellcolor{skyblue} 29.3 \\
Idefics2-8b$^\spadesuit$ & 17.1 & 17.0 & 13.5 & 14.3 & 19.6 & \cellcolor{skyblue} 16.3 \\
\midrule
GPT-4-vision$^\clubsuit$ & \bf 45.3 & \bf 46.3 & 41.3 & 48.3 & 48.3 & \cellcolor{skyblue} 45.9 \\
GPT-4o$^\clubsuit$ & 44.2 & 45.3 & \bf 43.3 & \bf 53.4 & \bf 51.3 & \cellcolor{skyblue} \bf 48.6 \\
Gemini Ultra$^\clubsuit$ & 31.7 & 29.7 & 23.7 & 39.7 & 32.7 & \cellcolor{skyblue} 29.9 \\
Claude 3 Opus$^\clubsuit$ & 24.9 & 28.9 & 25.9 & 31.2 & 29.2 & \cellcolor{skyblue} 26.3 \\
\bottomrule
\end{tabular}}
\label{exp:alignment_number_5}
\end{table}