% Leaderboard
%
% Running
%
% File size: 2,968 Bytes
%
% b650828

% Safety-perspective results table (5-level narrative Likert feedback).
% Columns: three toxicity sub-categories + average, three NSFW sub-categories
% + average; the two "Avg" columns are shaded via \cellcolor.
%
% Preamble requirements visible from the commands used here:
%   booktabs  -- \toprule / \midrule / \bottomrule
%   colortbl  -- \cellcolor (e.g. via \usepackage[table]{xcolor})
%   graphicx  -- \resizebox
% The color name `skyblue` is not defined in this block; it is expected to be
% defined elsewhere in the document -- TODO confirm.
% NOTE(review): the suit superscripts (\heartsuit, \spadesuit, \clubsuit) are
% not explained in this caption; presumably a legend exists elsewhere.
\begin{table}[t]
    \centering
    \caption{The detailed evaluation result of all multimodal judges on \textbf{safety} perspective. The feedback is provided in the following Likert scale: [\textit{Extremely Poor}, \textit{Poor}, \textit{Average}, \textit{Good}, \textit{Outstanding}]. Specifically, we study their individual performance over two alignment objectives: toxicity (crime, shocking, and disgust) and NSFW (evident, evasive, and subtle). The best performance across all models is bolded.}
    % Trailing % keeps the line break from becoming a space inside the box.
    \resizebox{1.0\linewidth}{!}{%
    \begin{tabular}{c|cccc|cccc}
    \toprule
        % \multicolumn replaces the column spec of the cells it spans, so the
        % first group must restate the trailing `|` to keep the rule between
        % the Toxicity and NSFW groups in this header row.
        & \multicolumn{4}{c|}{\textbf{Toxicity}} & \multicolumn{4}{c}{\textbf{NSFW}} \\
        & Crime & Shocking & Disgust & \cellcolor{skyblue}Avg & Evident & Evasive & Subtle & \cellcolor{skyblue}Avg \\
    \midrule
        % All numbers are typeset in text mode (bold via \textbf) so both
        % halves of the table use the same digit font.
        LLaVA-1.5-7b$^\heartsuit$ & 10.3 & 31.0 & 26.2 & \cellcolor{skyblue} 20.2 & 14.2 & 9.90 & 6.80 & \cellcolor{skyblue} 9.70 \\
        LLaVA-1.5-13b$^\heartsuit$ & 13.8 & 24.1 & 23.8 & \cellcolor{skyblue} 18.0 & 16.9 & 10.5 & 9.60 & \cellcolor{skyblue} 15.6 \\
        LLaVA-NeXT-mistral-7b$^\heartsuit$ & 27.6 & 17.2 & 21.4 & \cellcolor{skyblue} 21.3 & 26.9 & 9.30 & 6.70 & \cellcolor{skyblue} 19.5 \\
        LLaVA-NeXT-vicuna-13b$^\heartsuit$ & 34.5 & 27.6 & 40.5 & \cellcolor{skyblue} 32.6 & 26.8 & 13.9 & 11.5 & \cellcolor{skyblue} 19.7 \\
        Instructblip-7b$^\heartsuit$ & 34.5 & 20.7 & 31.0 & \cellcolor{skyblue} 29.2 & 23.9 & 12.6 & 5.90 & \cellcolor{skyblue} 16.8 \\
        % NOTE(review): the NSFW Avg below (25.6) is larger than all three of
        % its sub-scores (10.4, 4.90, 2.70) -- verify against the raw results.
        Prometheus-Vision-7b$^\heartsuit$ & 27.6 & 20.7 & 28.6 & \cellcolor{skyblue} 24.7 & 10.4 & 4.90 & 2.70 & \cellcolor{skyblue} 25.6 \\
        Prometheus-Vision-13b$^\heartsuit$ & 0.00 & 0.00 & 4.80 & \cellcolor{skyblue} 2.20 & 9.80 & 3.00 & 1.50 & \cellcolor{skyblue} 5.60 \\
        Qwen-VL-Chat$^\spadesuit$ & 34.5 & 41.4 & 42.9 & \cellcolor{skyblue} 38.2 & 32.2 & 24.0 & 16.6 & \cellcolor{skyblue} 30.1 \\
        Internvl-chat-v1-5$^\spadesuit$ & 0.00 & 3.40 & 2.40 & \cellcolor{skyblue} 2.20 & 2.80 & 1.00 & 0.70 & \cellcolor{skyblue} 1.30 \\
        Idefics2-8b$^\spadesuit$ & 37.9 & 10.3 & 38.1 & \cellcolor{skyblue} 29.2 & 20.2 & 10.0 & 7.10 & \cellcolor{skyblue} 16.7 \\
    \midrule
        % Rows below this rule are the models marked with \clubsuit.
        GPT-4-vision$^\clubsuit$ & 10.3 & 24.1 & 31.0 & \cellcolor{skyblue} 22.5 & 64.0 & 50.1 & 34.4 & \cellcolor{skyblue} \textbf{54.4} \\
        GPT-4o$^\clubsuit$ & 34.5 & \textbf{48.3} & 50.0 & \cellcolor{skyblue} 46.1 & \textbf{69.6} & \textbf{50.9} & \textbf{35.9} & \cellcolor{skyblue} 50.3 \\
        Gemini Ultra$^\clubsuit$ & \textbf{41.4} & 44.8 & \textbf{66.7} & \cellcolor{skyblue} \textbf{52.8} & 53.5 & 45.6 & 31.9 & \cellcolor{skyblue} 51.5 \\
        Claude 3 Opus$^\clubsuit$ & 10.3 & 3.40 & 4.80 & \cellcolor{skyblue} 5.60 & 45.6 & 32.4 & 27.0 & \cellcolor{skyblue} 35.2 \\
    \bottomrule
    \end{tabular}%
    }
    % \label follows \caption so that it refers to this table's number.
    \label{exp:safety_result_narrative_5}
\end{table}