Leaderboard / evals /mjbench /latex_reults /artifact_number_5.tex
yichao's picture
update mj-bench
b650828
raw
history blame
2.68 kB
\begin{table}[h]
\centering
\caption{The detailed evaluation result of all multimodal judges on \textbf{quality} perspective. The feedback are provided in numerical scale of range [0, 5]. Specifically, we study their individual performance over two alignment objectives: distortion (including human face, human limb, and object), and blurry (including defocused and motion). The best performance across all models is bolded.}
\resizebox{1.0\linewidth}{!}{%
\begin{tabular}{c|cccc|ccc}
\toprule
& \multicolumn{4}{c}{\bf Distortion} & \multicolumn{3}{c}{\bf Blurry} \\
& Human Face & Human Limb & Object & \cellcolor{skyblue}Avg & Defocused & Motion & \cellcolor{skyblue}Avg \\
\midrule
LLaVA-1.5-7b$^\heartsuit$ & 0.00 & 0.00 & 0.00 & \cellcolor{skyblue} 0.00 & 2.90 & 11.3 & \cellcolor{skyblue} 7.80 \\
LLaVA-1.5-13b$^\heartsuit$ & 0.00 & 0.00 & 0.00 & \cellcolor{skyblue} 0.00 & 24.9 & 36.9 & \cellcolor{skyblue} 32.9 \\
LLaVA-NeXT-mistral-7b$^\heartsuit$ & 11.2 & 13.9 & 1.00 & \cellcolor{skyblue} 8.70 & 56.3 & 73.2 & \cellcolor{skyblue} 61.1 \\
LLaVA-NeXT-vicuna-13b$^\heartsuit$ & 18.3 & 17.9 & 17.0 & \cellcolor{skyblue} 17.7 & 27.7 & 34.3 & \cellcolor{skyblue} 28.8 \\
Instructblip-7b$^\heartsuit$ & 9.50 & 3.30 & 19.0 & \cellcolor{skyblue} 10.6 & 10.0 & 10.2 & \cellcolor{skyblue} 9.60 \\
Prometheus-Vision-7b$^\heartsuit$ & 20.1 & 15.2 & 12.0 & \cellcolor{skyblue} 15.8 & 26.3 & 29.5 & \cellcolor{skyblue} 27.5 \\
Prometheus-Vision-13b$^\heartsuit$ & 7.10 & 5.30 & 7.00 & \cellcolor{skyblue} 6.50 & 9.70 & 11.5 & \cellcolor{skyblue} 10.9 \\
Qwen-VL-Chat$^\spadesuit$ & 24.9 & 21.2 & 7.00 & \cellcolor{skyblue} 17.7 & 18.3 & 19.6 & \cellcolor{skyblue} 18.9 \\
Internvl-chat-v1-5$^\spadesuit$ & 21.9 & 24.5 & 1.00 &\cellcolor{skyblue} 15.8 & \bf 93.7 & 96.6 & \cellcolor{skyblue} \bf 95.7 \\
Idefics2-8b$^\spadesuit$ & 44.4 & 33.1 & 9.0 & \cellcolor{skyblue} 28.8 & 88.3 & 68.6 & \cellcolor{skyblue} 75.9 \\
\midrule
GPT-4-vision$^\clubsuit$ & 86.3 & 54.1 & 79.2 & \cellcolor{skyblue} 72.4 & 90.8 & 93.3 & \cellcolor{skyblue} 91.2 \\
GPT-4o$^\clubsuit$ & \bf 98.6 & \bf 73.5 & \bf 100 & \cellcolor{skyblue} \bf 90.4 & 91.6 & \bf 96.7 & \cellcolor{skyblue} 93.0 \\
Gemini Ultra$^\clubsuit$ & 71.6 & 29.9 & 59.8 & \cellcolor{skyblue} 50.7 & 80.7 & 90.8 & \cellcolor{skyblue} 83.9 \\
Claude 3 Opus$^\clubsuit$ & 21.6 & 16.9 & 9.30 & \cellcolor{skyblue} 16.6 & 85.3 & 93.3 & \cellcolor{skyblue} 87.7 \\
\bottomrule
\end{tabular}%
}
\label{exp:artifact_result_number_5}
\end{table}