
\begin{table}[t]
    \centering
    \caption{Detailed evaluation results of all multimodal judges on the \textbf{safety} perspective. Judges provide feedback on a numerical scale in the range [0, 5]. Specifically, we study their individual performance on two alignment objectives: toxicity (crime, shocking, and disgust) and NSFW (evident, evasive, and subtle). The best performance across all models is bolded.}
    \resizebox{1.0\linewidth}{!}{%
    \begin{tabular}{c|cccc|cccc}
    \toprule
         \bf Model & \multicolumn{4}{c|}{\bf Toxicity} & \multicolumn{4}{c}{\bf NSFW} \\
         & Crime & Shocking & Disgust & \cellcolor{skyblue}Avg & Evident & Evasive & Subtle & \cellcolor{skyblue}Avg \\
         \midrule
         LLaVA-1.5-7b$^\heartsuit$ & $10.3$ & $20.7$ & $19.0$ & \cellcolor{skyblue} $15.7$ & $13.5$ & $11.2$ & $5.10$ & \cellcolor{skyblue} $7.60$ \\
         LLaVA-1.5-13b$^\heartsuit$ & $13.8$ & $10.3$ & $23.8$ & \cellcolor{skyblue} $16.9$ & $16.9$ & $11.2$ & $8.90$ & \cellcolor{skyblue} $12.7$ \\
         LLaVA-NeXT-mistral-7b$^\heartsuit$ & $20.7$ & $17.2$ & $16.7$ & \cellcolor{skyblue} $16.9$ & $15.6$ & $8.70$ & $5.30$ & \cellcolor{skyblue} $9.30$ \\
         LLaVA-NeXT-vicuna-13b$^\heartsuit$ & $31.0$ & $27.6$ & $31.0$ & \cellcolor{skyblue} $27.0$ & $19.2$ & $14.3$ & $10.7$ & \cellcolor{skyblue} $15.5$ \\
         Instructblip-7b$^\heartsuit$ & $20.7$ & $31.0$ & $16.7$ & \cellcolor{skyblue} $24.7$ & $16.8$ & $12.4$ & $5.60$ & \cellcolor{skyblue} $13.0$ \\
         Prometheus-Vision-7b$^\heartsuit$ & $6.90$ & $0.00$ & $7.10$ & \cellcolor{skyblue} $4.50$ & $10.9$ & $4.30$ & $2.10$ & \cellcolor{skyblue} $5.90$ \\
         Prometheus-Vision-13b$^\heartsuit$ & $0.00$ & $0.00$ & $0.00$ & \cellcolor{skyblue} $0.00$ & $9.30$ & $2.50$ & $1.30$ & \cellcolor{skyblue} $4.90$ \\
         Qwen-VL-Chat$^\spadesuit$ & $31.0$ & $34.5$ & $21.4$ & \cellcolor{skyblue} $30.3$ & $31.6$ & $24.9$ & $16.3$ & \cellcolor{skyblue} $25.3$ \\
         Internvl-chat-v1-5$^\spadesuit$ & $24.1$ & $6.90$ & $23.8$ & \cellcolor{skyblue} $19.1$ & $19.5$ & $10.3$ & $6.80$ & \cellcolor{skyblue} $13.0$ \\
         Idefics2-8b$^\spadesuit$ & $44.8$ & $41.4$ & $54.8$ & \cellcolor{skyblue} $47.2$ & $29.1$ & $10.6$ & $8.60$ & \cellcolor{skyblue} $16.8$ \\
         \midrule
         GPT-4-vision$^\clubsuit$ & $69.0$ & $72.4$ & $73.8$ & \cellcolor{skyblue} $70.8$ & $63.5$ & $49.6$ & $33.8$ & \cellcolor{skyblue} $52.3$ \\
         GPT-4o$^\clubsuit$ & $\bf 75.9$ & $\bf 82.8$ & $\bf 92.9$ & \cellcolor{skyblue} $\bf 84.3$ & $\bf 70.1$ & $\bf 50.6$ & $\bf 36.2$ & \cellcolor{skyblue} $\bf 54.3$ \\
         Gemini Ultra$^\clubsuit$ & $48.3$ & $69.0$ & $73.8$ & \cellcolor{skyblue} $65.2$ & $53.9$ & $45.2$ & $31.2$ & \cellcolor{skyblue} $47.7$ \\
         Claude 3 Opus$^\clubsuit$ & $13.8$ & $6.90$ & $7.10$ & \cellcolor{skyblue} $10.1$ & $45.9$ & $32.6$ & $26.8$ & \cellcolor{skyblue} $38.3$ \\
    \bottomrule
    \end{tabular}%
    }
    \label{exp:safety_result_number_5}
\end{table}
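
As a reading aid, the snippet below sketches one way per-category numbers of this form can be computed from pairwise judge feedback: each sample pairs a human-preferred image with a rejected one, both rated on the [0, 5] scale described in the caption, and a judgment counts as correct when the preferred image receives the strictly higher score. The pairing scheme, the tie handling, and all identifiers are illustrative assumptions, not the released evaluation code.

\begin{verbatim}
# Minimal sketch (Python); all names here are illustrative assumptions.
def judge_accuracy(samples):
    """samples: iterable of (score_preferred, score_rejected) pairs,
    each score on the [0, 5] scale used by the multimodal judge."""
    # A pair counts as correct only when the human-preferred image is
    # rated strictly higher; ties are treated as incorrect.
    correct = sum(1 for pref, rej in samples if pref > rej)
    return 100.0 * correct / len(samples)

# Example: 2 of 4 pairs rank the preferred image strictly higher -> 50.0
print(judge_accuracy([(4, 1), (2, 3), (5, 0), (3, 3)]))
\end{verbatim}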