File size: 2,813 Bytes
b650828
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
% Human-evaluation table: one row per judge whose feedback was used for
% fine-tuning, and three column groups (Alignment, Safety, Bias), each
% reporting the four metrics FR, RR, AR and AV.
% Uses booktabs rules, \cellcolor (colortbl) and \resizebox (graphicx);
% the colour `skyblue` is not defined in this snippet and must come from
% the preamble.
% NOTE(review): the suit superscripts on the judge names are not explained
% in this caption -- confirm they are defined where the table is referenced.
\begin{table}[t]
    \centering
    \caption{Human evaluation results on images generated by six SD-v1.5 models fine-tuned using feedback from six multimodal judges, i.e., GPT-4o, GPT-4-vision, Gemini Ultra, Claude 3 Opus, Internvl-chat-v1-5, and HPS-v2.1. Specifically, we consider the following four metrics: ranking over fixed seed (\textbf{FR}), ranking over random seed (\textbf{RR}), average ranking (\textbf{AR}), and average voting (\textbf{AV}). The top-2 results for each metric are bolded.}
    \label{exp:human_eval}
    % Tighten the column padding and row height so the 13 columns fit.
    \setlength{\tabcolsep}{2pt}
    \renewcommand{\arraystretch}{0.9}
\resizebox{1.0\linewidth}{!}{%
\begin{tabular}{l|cccc|cccc|cccc}
\toprule
     % \multicolumn replaces the column spec of the cells it spans, so the
     % trailing `|` has to be repeated here to keep the group separators.
     & \multicolumn{4}{c|}{\textbf{Alignment}} & \multicolumn{4}{c|}{\textbf{Safety}} & \multicolumn{4}{c}{\textbf{Bias}}  \\
     % Arrows give the direction of improvement; AR and AV are shaded.
     & FR $\downarrow$ & RR $\downarrow$ & \cellcolor{skyblue}AR $\downarrow$ & \cellcolor{skyblue}AV $\uparrow$ & FR $\downarrow$ & RR $\downarrow$ & \cellcolor{skyblue}AR $\downarrow$ & \cellcolor{skyblue}AV $\uparrow$ & FR $\downarrow$ & RR $\downarrow$ & \cellcolor{skyblue}AR $\downarrow$ & \cellcolor{skyblue}AV $\uparrow$ \\
     \midrule
      % Bold marks the two best values per column (ties are all bolded).
      GPT-4o$^\clubsuit$       & \textbf{2.16} & \textbf{2.66} & \cellcolor{skyblue}\textbf{2.50} & \cellcolor{skyblue}\textbf{17.21\%} & 1.91 & \textbf{1.88} & \cellcolor{skyblue}\textbf{1.89} & \cellcolor{skyblue}\textbf{17.37\%} & \textbf{1.72} & \textbf{2.48} & \cellcolor{skyblue}\textbf{2.10} & \cellcolor{skyblue}\textbf{21.58\%} \\
      GPT-4-vision$^\clubsuit$ & 2.43 & 2.81 & \cellcolor{skyblue}2.68 & \cellcolor{skyblue}15.96\% & \textbf{1.84} & 1.98 & \cellcolor{skyblue}1.94 & \cellcolor{skyblue}16.81\% & 1.99 & 3.14 & \cellcolor{skyblue}2.57 & \cellcolor{skyblue}16.80\% \\
      Gemini Ultra$^\clubsuit$ & \textbf{2.15} & 2.72 & \cellcolor{skyblue}2.54 & \cellcolor{skyblue}14.87\% & \textbf{1.55} & \textbf{1.69} & \cellcolor{skyblue}\textbf{1.64} & \cellcolor{skyblue}\textbf{18.98\%} & 2.23 & \textbf{2.65} & \cellcolor{skyblue}2.44 & \cellcolor{skyblue}16.18\% \\
      Claude 3 Opus$^\clubsuit$ & 2.25 & 2.80 & \cellcolor{skyblue}2.62 & \cellcolor{skyblue}15.34\% & 2.07 & 2.12 & \cellcolor{skyblue}2.10 & \cellcolor{skyblue}16.15\% & 2.29 & 3.43 & \cellcolor{skyblue}2.86 & \cellcolor{skyblue}11.62\% \\
      Internvl-chat-v1-5$^\spadesuit$ & 3.16 & 2.99 & \cellcolor{skyblue}3.05 & \cellcolor{skyblue}16.90\% & 2.49 & 2.28 & \cellcolor{skyblue}2.35 & \cellcolor{skyblue}15.30\% & 1.97 & 3.43 & \cellcolor{skyblue}2.70 & \cellcolor{skyblue}14.52\% \\
      HPS-v2.1$^\diamondsuit$ & 2.21 & \textbf{2.42} & \cellcolor{skyblue}\textbf{2.35} & \cellcolor{skyblue}\textbf{19.72\%} & 2.42 & 2.37 & \cellcolor{skyblue}2.39 & \cellcolor{skyblue}15.39\% & \textbf{1.78} & \textbf{2.65} & \cellcolor{skyblue}\textbf{2.21} & \cellcolor{skyblue}\textbf{19.29\%} \\
\bottomrule
\end{tabular}%
}
\end{table}