File size: 4,329 Bytes
b650828
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
\begin{table}[t]
    \centering
    \small
    \caption{Performance comparison of multimodal judges w.r.t.\ different ranges of the numerical scale and the Likert scale. The results are evaluated on the alignment perspective, where we consider four numerical ranges, i.e., [0, 1], [0, 5], [0, 10], and [0, 100]. The best performance across all models is bolded.}
    \resizebox{0.7\linewidth}{!}{%
    \begin{tabular}{l|cccc|cc}
    \toprule
         & \multicolumn{4}{c|}{\textbf{Numerical}} & \multicolumn{2}{c}{\textbf{Likert}} \\
         & [0, 1] & [0, 5] & [0, 10] & [0, 100]  &  5-Likert & 10-Likert     \\
         \midrule
         LLaVA-1.5-7b$^\heartsuit$           & $15.0$  & $26.7$ & $22.0$ & $18.3$ & $ 5.3$ & $10.3$  \\
         LLaVA-1.5-13b$^\heartsuit$          & $ 9.7$  & $12.0$ & $10.3$ & $20.5$ & $ 2.6$ & $ 6.8$ \\
         LLaVA-NeXT-mistral-7b$^\heartsuit$  & $20.8$  & $27.1$ & $31.3$ & $29.3$ & $36.0$ & $38.6$ \\
         LLaVA-NeXT-vicuna-13b$^\heartsuit$  & $18.3$  & $26.7$ & $29.1$ & $17.2$ & $28.7$ & $17.2$  \\
         Instructblip-7b$^\heartsuit$        & $15.0$  & $20.9$ & $17.1$ & $17.6$ & $11.9$ & $16.8$ \\
         MiniGPT4-v2$^\heartsuit$            & $20.4$  & $28.9$ & $32.8$ & $20.9$ & $16.0$ & $28.7$ \\
         Prometheus-Vision-7b$^\heartsuit$   & $ 3.8$  & $16.7$ & $18.4$ & $15.7$ & $28.7$ & $31.3$ \\
         Prometheus-Vision-13b$^\heartsuit$  & $19.7$  & $11.5$ & $11.8$ & $11.2$ & $11.0$ & $6.9$  \\
         \midrule
         Qwen-VL-Chat$^\spadesuit$           & $26.7$  & $34.6$  & $31.1$ & $26.9$ & $55.5$ & $30.6$  \\
         Internvl-chat-v1-5$^\spadesuit$     & $33.0$  & $27.6$  & $75.8$ & $35.3$ & $73.3$ & $18.9$  \\
         Idefics2-8b$^\spadesuit$            & $14.6$  & $16.6$  & $32.6$ & $32.6$ & $41.2$ & $25.6$  \\
         \midrule
         GPT-4-vision$^\clubsuit$            & $63.2$  & $61.2$  & $66.1$ & $\textbf{67.2}$ & $\textbf{60.2}$ & $\textbf{63.0}$  \\
         GPT-4o$^\clubsuit$                  & $\textbf{63.9}$  & $61.3$  & $61.5$ & $62.8$ & $56.3$ & $60.3$  \\
         Gemini Ultra$^\clubsuit$            & $59.3$  & $\textbf{67.3}$  & $\textbf{67.2}$ & $60.1$ & $51.4$ & $57.8$  \\
         Claude 3 Opus$^\clubsuit$           & $60.7$  & $45.5$  & $57.1$ & $49.4$ & $56.1$ & $62.4$  \\
         \midrule
         \cellcolor{skyblue} Overall & \cellcolor{skyblue}30.3  & \cellcolor{skyblue}32.3 & \cellcolor{skyblue} 37.6 & \cellcolor{skyblue}32.3 & \cellcolor{skyblue}35.6 & \cellcolor{skyblue}31.7 \\
    \bottomrule
    \end{tabular}%
    }
    \label{exp:scale_study}
    \vspace{-1em}
\end{table}

% \begin{table}[t]
%     \centering
%     \caption{Performance comparison of these multimodal judges w.r.t. different ranges of numerical scale. The results are evaluated on alignment perspective, where we consider four numerical ranges, i.e. [0, 1], [0, 5], [0, 10], and [0, 100]. The best performance across all models is bolded.}
%     \resizebox{0.7\linewidth}{!}{%
%     \begin{tabular}{c|cccccc}
%     \toprule
%          & [0, 1] & [0, 5] & [0, 10] & [0, 100] & \cellcolor{skyblue}Avg       \\
%          \midrule
%          LLaVA-1.5-7b$^\heartsuit$ & - &  - & - & - & \cellcolor{skyblue} \\
%          LLaVA-1.5-13b$^\heartsuit$ &  - & - & - & - & \cellcolor{skyblue} \\
%          LLaVA-NeXT-mistral-7b$^\heartsuit$ &  - & - & - & - & \cellcolor{skyblue}  \\
%          LLaVA-NeXT-vicuna-13b$^\heartsuit$ &  - & - & - & - & \cellcolor{skyblue} \\
%          Instructblip-7b$^\heartsuit$ & - &  - & - & - & \cellcolor{skyblue} -  \\
%          MiniGPT4-v2$^\heartsuit$ & - &  - & - & - & \cellcolor{skyblue} -  \\
%          Qwen-VL-Chat$^\spadesuit$ &  - & - & - & - & \cellcolor{skyblue} -  \\
%          Internvl-chat-v1-5$^\spadesuit$ &  - & - & - & - & \cellcolor{skyblue} - \\
%          Idefics2-8b$^\spadesuit$ & - & - & - & - & \cellcolor{skyblue} - \\
%          Prometheus-Vision-13b$^\spadesuit$ & -  & - & - & - & \cellcolor{skyblue} - \\
%          \midrule
%          GPT-4-vision$^\clubsuit$ & -  & - & - & - & \cellcolor{skyblue} - \\
%          GPT-4o$^\clubsuit$ & - & -  & - & - & \cellcolor{skyblue}  - \\
%          Gemini Ultra$^\clubsuit$ & -  & - & - & - & \cellcolor{skyblue} - \\
%          Claude 3 Opus$^\clubsuit$ & - & - & - & - & \cellcolor{skyblue} - \\
%     \bottomrule
%     \end{tabular}}
%     \label{exp:scale_study}
% \end{table}