Leaderboard

Running

File size: 3,865 Bytes

b650828


\begin{table}[h]
    \centering
    \caption{The detailed evaluation result of all multimodal judges on \textbf{quality} perspective. The feedback are provided in numerical scale of range [0, 10]. Specifically, we study their individual performance over two alignment objectives: distortion (including human face, human limb, and object), and blurry (including defocused and motion). The best performance across all models is bolded.}
    \resizebox{1.0\linewidth}{!}{%
    \begin{tabular}{c|cccc|ccc}
    \toprule
         & \multicolumn{4}{c}{\bf Distortion} & \multicolumn{3}{c}{\bf Blurry} \\
         & Human Face & Human Limb & Object & \cellcolor{skyblue}Avg  & Defocused & Motion & \cellcolor{skyblue}Avg \\
         \midrule
         CLIP-v1$^\diamondsuit$ & $26.6$ & $17.2$ & $34.0$ & \cellcolor{skyblue} $19.3$ & $50.6$ & $63.7$ & \cellcolor{skyblue} $56.7$ \\
         BLIP-v2$^\diamondsuit$ & $3.60$ & $2.00$ & $1.10$ & \cellcolor{skyblue} $1.90$ & $8.30$ & $47.2$ & \cellcolor{skyblue} $15.0$ \\
         PickScore-v1$^\diamondsuit$ & $83.4$ & $68.2$ & $92.1$ & \cellcolor{skyblue} $79.3$ & $80.6$ & $93.4$ & \cellcolor{skyblue} $86.6$ \\
         HPS-v2.1$^\diamondsuit$ & $60.4$ & $37.1$ & $80.3$ & \cellcolor{skyblue} $51.7$ & $85.7$ & $94.6$ & \cellcolor{skyblue} $88.6$ \\
         ImageReward$^\diamondsuit$ & $31.4$ & $34.4$ & $40.2$ & \cellcolor{skyblue} $33.3$ & $77.4$ & $86.6$ & \cellcolor{skyblue} $82.1$ \\
         Aesthetics$^\diamondsuit$ & $78.7$ & $57.1$ & $51.3$ & \cellcolor{skyblue} $52.1$ & $90.1$ & $93.4$ & \cellcolor{skyblue} $91.6$ \\
         \midrule
         LLaVA-1.5-7b$^\heartsuit$ & $13.6$ & $7.30$ & $9.20$ & \cellcolor{skyblue} $10.2$ & $7.10$ & $19.1$ & \cellcolor{skyblue} $13.1$ \\
         LLaVA-1.5-13b$^\heartsuit$ & $20.1$ & $14.6$ & $13.3$ & \cellcolor{skyblue} $16.4$ & $18.0$ & $34.0$ & \cellcolor{skyblue} $26.1$ \\
         LLaVA-NeXT-7b$^\heartsuit$ & $28.4$ & $27.8$ & $19.0$ & \cellcolor{skyblue} $30.1$ & $41.7$ & $66.1$ & \cellcolor{skyblue} $53.9$ \\
         LLaVA-NeXT-13b$^\heartsuit$ & $18.9$ & $27.8$ & $12.0$ & \cellcolor{skyblue} $20.5$ & $40.6$ & $45.4$ & \cellcolor{skyblue} $43.0$ \\
         Instructblip-7b$^\heartsuit$ & $12.4$  & $9.30$ & $21.0$ & \cellcolor{skyblue} $13.3$ & $32.3$ & $31.1$ & \cellcolor{skyblue} $31.7$ \\
         MiniGPT4-v2$^\heartsuit$ & $39.6$ & $39.1$ & $42.0$ & \cellcolor{skyblue} $40.0$ & $33.4$ & $37.4$ & \cellcolor{skyblue} $35.4$ \\
         Prometheus-Vision-7b$^\heartsuit$ & $16.6$ & $17.9$ & $14.1$ & \cellcolor{skyblue} $16.4$ & $22.3$ & $30.3$ & \cellcolor{skyblue} $26.3$ \\
         Prometheus-Vision-13b$^\heartsuit$ & $7.10$ & $4.60$ & $7.20$ & \cellcolor{skyblue} $6.20$ & $9.40$ &$10.6$ & \cellcolor{skyblue} $10.0$ \\
         Qwen-VL-Chat$^\spadesuit$ & $14.2$ & $15.9$ & $9.40$ & \cellcolor{skyblue} $13.6$ & $0.90$ & $2.10$ & \cellcolor{skyblue} $1.40$ \\
         Internvl-chat-v1-5$^\spadesuit$ & $97.0$ & $\bf 95.4$ & $97.1$ & \cellcolor{skyblue} $\bf 97.1$ & $89.7$ & $89.7$ & \cellcolor{skyblue} $89.7$ \\
         Idefics2-8b$^\spadesuit$ & $29.6$ & $25.8$ & $2.30$ & \cellcolor{skyblue} $21.7$ & $70.6$ & $46.9$ & \cellcolor{skyblue} $58.7$ \\
         \midrule
         GPT-4-vision$^\clubsuit$ & $87.6$ & $57.6$ & $83.1$ & \cellcolor{skyblue} $75.7$ & $98.8$ & $99.3$ & \cellcolor{skyblue} $99.2$ \\
         GPT-4o$^\clubsuit$ & $\bf 99.4$ & $78.2$ & $\bf 100$ & \cellcolor{skyblue} $93.8$ & $\bf 100$ & $\bf 100$ & \cellcolor{skyblue} $\bf 100$ \\
         Gemini Ultra$^\clubsuit$ & $73.4$ & $32.5$ & $61.0$ & \cellcolor{skyblue} $55.7$ & $86.5$ & $97.3$ & \cellcolor{skyblue} $93.9$ \\
         Claude 3 Opus$^\clubsuit$ & $26.6$ & $19.3$ & $10.7$ & \cellcolor{skyblue} $17.6$ & $89.6$ & $93.3$ & \cellcolor{skyblue} $92.7$ \\
    \bottomrule
    \end{tabular}%
    }
    \label{exp:artifact_result_number_10}
\end{table}