scottsuk0306 committed on
Commit
07a2125
•
1 Parent(s): 95d589e
app.py CHANGED
@@ -121,7 +121,7 @@ with demo:
121
  select_columns=SelectColumns(
122
  default_selection=ordered_columns,
123
  cant_deselect=["Model 🤗", "Model Type", "Model Params (B)"],
124
- label="Select Columns to Display:",
125
  ),
126
  search_columns=["Model 🤗"],
127
  # hide_columns=["model_name_for_query", "Model Size"],
 
121
  select_columns=SelectColumns(
122
  default_selection=ordered_columns,
123
  cant_deselect=["Model 🤗", "Model Type", "Model Params (B)"],
124
+ label="Select Columns to Display\n(Multilingual is excluded when measuring average)",
125
  ),
126
  search_columns=["Model 🤗"],
127
  # hide_columns=["model_name_for_query", "Model Size"],
data/bgb-leaderboard-gpt-4-turbo-2024-04-09.csv CHANGED
@@ -1,104 +1,104 @@
1
  Grounding ⚡️,Instruction Following 📝,Planning 📅,Reasoning 💡,Refinement 🔩,Safety ⚠️,Theory of Mind 🤔,Tool Usage 🛠️,Multilingual 🇬🇫,Model 🤗,Model Params (B),Model Type,Average
2
- 4.288,4.23,4.271,4.22,4.171,4.565,4.24,3.775,3.6,gpt-4-1106-preview,,Proprietary,4.151
3
- 4.3,4.2,4.357,4.16,4.145,4.174,4.26,3.925,3.543,gpt-4-0125-preview,,Proprietary,4.118
4
- 4.238,4.26,4.357,4.21,4.079,4.058,4.08,3.85,3.643,gpt-4o-2024-05-13,,Proprietary,4.086
5
- 4.312,4.13,4.3,4.2,4.105,4.087,4.12,3.8,3.471,gpt-4-turbo-2024-04-09,,Proprietary,4.058
6
- 4.288,4.06,4.186,3.97,3.908,4.536,4.09,3.788,3.571,claude-3-opus-20240229,,Proprietary,4.044
7
- 4.125,4.18,4.186,3.87,3.907,4.014,4.04,3.775,3.314,meta-llama/Meta-Llama-3-70B-Instruct,70.0,Chat,3.935
8
- 4.25,3.92,4.171,3.91,3.724,4.362,4.0,3.75,3.186,claude-3-sonnet-20240229,,Proprietary,3.919
9
- 4.05,4.04,4.129,4.06,3.671,4.116,4.07,3.488,3.257,gemini-pro-1.5,,Proprietary,3.876
10
- 4.138,4.01,4.129,3.69,3.632,4.304,3.98,3.75,3.071,claude-3-haiku-20240307,,Proprietary,3.856
11
- 4.15,4.01,4.229,3.94,3.882,4.043,3.99,3.588,2.771,qwen/qwen-110b-chat,110.0,Chat,3.845
12
- 3.962,3.94,4.029,3.95,3.776,4.058,3.9,3.862,2.929,mistral-medium,,Proprietary,3.823
13
- 4.025,3.99,4.029,3.93,3.776,3.913,3.93,3.825,2.886,mistral-large,,Proprietary,3.812
14
- 4.012,4.0,4.0,3.96,3.842,4.087,3.87,3.712,2.714,MaziyarPanahi/Mixtral-8x22B-Instruct-v0.1-AWQ,141,Chat,3.8
15
- 4.138,3.91,3.971,3.92,3.453,4.217,3.96,3.625,2.671,google/gemini-flash-1.5,,Proprietary,3.763
16
- 3.888,3.99,4.029,3.68,3.632,3.957,3.96,3.525,2.914,Qwen/Qwen1.5-72B-Chat,72.0,Chat,3.73
17
- 3.988,4.0,4.186,3.64,3.461,3.971,3.94,3.525,2.757,alpindale/c4ai-command-r-plus-GPTQ,104,Chat,3.719
18
- 3.788,3.85,4.029,3.62,3.395,4.217,3.87,3.738,2.714,Qwen/Qwen1.5-32B-Chat,32.0,Chat,3.691
19
- 4.125,3.94,3.929,3.47,3.507,3.725,3.83,3.5,2.914,meta-llama/Meta-Llama-3-8B-Instruct,8.0,Chat,3.66
20
- 3.725,3.88,3.8,3.81,3.974,4.145,3.9,3.338,1.914,microsoft/Phi-3-mini-4k-instruct,3.8,Chat,3.609
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  3.688,3.7,3.743,3.5,3.539,4.0,3.49,3.188,,mistral-community/Mixtral-8x22B-v0.1-AWQ,141,Base,3.606
22
- 3.812,4.06,3.957,3.53,3.342,3.739,3.79,3.662,2.557,NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO,46.7,Chat,3.606
23
- 3.8,3.84,4.0,3.56,3.547,3.87,3.87,3.562,2.271,Starling-LM-7B-beta,7.0,Chat,3.591
24
- 3.6,3.84,3.871,3.62,3.373,3.942,3.75,3.125,3.186,gemini-1.0-pro,,Proprietary,3.59
25
- 3.9,3.88,3.6,3.71,3.434,3.812,3.81,3.412,2.714,mistralai/Mixtral-8x7B-Instruct-v0.1,46.7,Chat,3.586
26
- 3.925,3.85,3.843,3.65,3.434,3.884,3.79,3.138,2.614,gpt-3.5-turbo-0125,,Proprietary,3.57
27
- 4.025,3.79,3.829,3.51,3.434,4.0,3.67,3.162,2.557,gpt-3.5-turbo-1106,,Proprietary,3.553
28
- 3.812,3.77,3.857,3.42,3.382,3.826,3.9,3.412,2.443,upstage/SOLAR-10.7B-Instruct-v1.0,10.7,Chat,3.536
29
- 3.738,3.83,3.914,3.57,3.676,3.884,3.96,3.038,2.186,01-ai/Yi-34B-Chat,34.0,Chat,3.533
30
- 3.7,3.89,3.9,3.36,3.421,3.754,3.83,3.612,2.314,allenai/tulu-2-dpo-70b,70.0,Chat,3.531
31
- 3.662,3.88,3.929,3.22,3.36,4.377,3.73,3.188,2.386,meta-llama/Llama-2-70b-chat-hf,70.0,Chat,3.526
32
- 3.812,3.88,3.9,3.39,3.447,3.899,3.9,3.188,2.186,CohereForAI/c4ai-command-r-v01,35.0,Chat,3.511
33
- 3.712,3.8,3.7,3.82,3.513,3.957,3.83,3.1,1.829,microsoft/Phi-3-mini-128k-instruct,3.8,Chat,3.473
34
- 3.7,3.87,3.8,3.18,3.447,3.826,3.77,3.362,2.286,mistralai/Mistral-7B-Instruct-v0.2,7.0,Chat,3.471
35
- 3.55,3.62,3.957,3.52,3.618,3.449,3.58,3.288,2.586,MaziyarPanahi/zephyr-orpo-141b-A35b-v0.1-AWQ,141,Chat,3.463
36
- 3.65,3.78,3.714,3.39,3.461,3.609,3.63,3.538,2.4,NousResearch/Nous-Hermes-2-Mixtral-8x7B-SFT,46.7,Chat,3.463
37
  3.712,3.58,3.5,3.3,3.237,3.87,3.59,2.775,,mistralai/Mixtral-8x7B-v0.1,46.7,Base,3.445
38
- 3.625,3.9,3.857,3.36,3.263,3.855,3.52,3.2,2.386,Qwen/Qwen1.5-14B-Chat,14.0,Chat,3.441
39
- 3.638,3.84,3.757,3.34,3.566,3.725,3.66,3.125,2.157,openchat/openchat-3.5-0106,7.0,Chat,3.423
40
  3.488,3.6,3.5,3.25,3.227,3.942,3.38,2.988,,Qwen/Qwen1.5-72B,72.0,Base,3.422
41
- 3.712,3.72,3.829,3.33,3.224,3.913,3.54,3.025,2.229,Starling-LM-7B-alpha,7.0,Chat,3.391
42
- 3.588,3.88,3.714,3.3,3.395,3.725,3.7,3.15,2.057,Qwen/Qwen1.5-7B-Chat,7.0,Chat,3.39
43
- 3.662,3.74,3.8,3.26,3.355,3.377,3.69,3.062,2.171,NousResearch/Nous-Hermes-2-Mistral-7B-DPO,7.0,Chat,3.347
44
- 3.55,3.72,3.729,3.23,3.382,3.551,3.73,3.288,1.943,HuggingFaceH4/zephyr-7b-beta,7.0,Chat,3.347
45
  3.512,3.54,3.529,3.27,3.24,3.58,3.39,2.512,,01-ai/Yi-34B,34.0,Base,3.322
46
- 3.338,3.65,3.643,3.53,3.373,3.536,3.56,3.175,2.071,NousResearch/Nous-Hermes-2-Yi-34B,34.0,Chat,3.32
47
- 3.612,3.8,3.686,3.12,3.263,3.696,3.58,3.025,2.1,kaist-ai/mistral-orpo-beta,7.0,Chat,3.32
48
  3.425,3.56,3.386,3.06,3.133,3.87,3.48,2.625,,meta-llama/Llama-2-70b-hf,70.0,Base,3.317
49
- 3.662,3.92,3.686,2.76,3.079,4.319,3.71,2.6,2.114,meta-llama/Llama-2-13b-chat-hf,13.0,Chat,3.317
50
  3.325,3.64,3.514,3.31,3.118,3.333,3.33,2.925,,Qwen/Qwen1.5-32B,32.0,Base,3.312
51
- 3.688,3.66,3.729,3.28,3.276,3.435,3.57,3.062,2.1,teknium/OpenHermes-2.5-Mistral-7B,7.0,Chat,3.311
52
- 3.525,3.7,3.6,3.11,3.171,3.971,3.5,2.95,2.086,kaist-ai/mistral-orpo-alpha,7.0,Chat,3.29
53
- 3.45,3.77,3.6,2.9,3.184,3.841,3.59,3.05,2.143,allenai/tulu-2-dpo-13b,13.0,Chat,3.281
54
- 3.45,3.51,3.686,3.01,3.211,3.652,3.5,3.35,2.0,allenai/codetulu-2-34b,34.0,Chat,3.263
55
- 3.588,3.53,3.371,3.25,3.25,4.043,3.44,2.788,2.0,google/gemma-1.1-7b-it,7.0,Chat,3.251
56
  3.25,3.56,3.371,2.96,3.197,3.667,3.42,2.562,,upstage/SOLAR-10.7B-v1.0,10.7,Base,3.248
57
- 3.525,3.66,3.8,3.28,3.28,3.232,3.45,2.925,1.914,teknium/OpenHermes-2-Mistral-7B,7.0,Chat,3.23
58
- 3.5,3.5,3.457,3.04,3.079,4.13,3.46,2.738,2.114,codellama/CodeLlama-34b-Instruct-hf,34.0,Chat,3.224
59
- 3.388,3.58,3.586,2.85,2.961,4.145,3.65,2.3,2.029,meta-llama/Llama-2-7b-chat-hf,7.0,Chat,3.165
60
- 3.238,3.76,3.5,2.79,3.079,3.754,3.68,2.438,1.971,allenai/tulu-2-dpo-7b,7.0,Chat,3.134
 
61
  3.35,3.33,3.114,3.04,3.342,3.261,3.04,2.5,,meta-llama/Meta-Llama-3-70B,70.0,Base,3.122
62
  3.538,3.41,3.157,3.0,3.092,2.58,3.16,2.912,,Qwen/Qwen1.5-14B,14.0,Base,3.106
63
- 3.225,3.5,3.4,2.8,3.197,3.29,3.38,3.238,1.886,allenai/codetulu-2-13b,13.0,Chat,3.102
64
- 3.15,3.38,3.4,2.8,3.027,3.768,3.39,2.775,2.029,allenai/tulu-2-13b,13.0,Chat,3.08
65
- 3.262,3.34,3.357,2.77,2.895,4.043,3.38,2.6,1.886,codellama/CodeLlama-13b-Instruct-hf,13.0,Chat,3.059
66
  3.15,3.33,3.1,2.78,2.892,3.377,3.29,2.275,,mistral-community/Mistral-7B-v0.2,7.0,Base,3.024
67
- 3.275,3.52,3.414,2.85,3.08,3.478,3.677,2.338,1.457,01-ai/Yi-6B-Chat,6.0,Chat,3.01
68
  3.225,3.3,3.243,2.86,2.763,3.406,3.09,2.162,,mistralai/Mistral-7B-v0.1,7.0,Base,3.006
69
- 3.212,3.36,3.286,2.75,2.961,3.754,3.22,2.575,1.771,codellama/CodeLlama-7b-Instruct-hf,7.0,Chat,2.988
70
- 3.312,3.43,3.071,2.97,3.026,3.768,3.15,2.325,1.786,google/gemma-7b-it,7.0,Chat,2.982
71
- 3.112,3.41,3.114,2.73,2.908,3.246,3.25,2.788,1.8,allenai/codetulu-2-7b,7.0,Chat,2.929
72
- 2.9,3.34,3.229,2.74,3.053,3.971,3.37,1.975,1.471,google/gemma-1.1-2b-it,2.0,Chat,2.894
73
- 2.862,3.34,3.229,2.81,2.974,3.638,3.26,2.212,1.714,allenai/tulu-2-7b,7.0,Chat,2.893
74
  2.988,3.14,3.014,2.65,2.827,3.101,2.77,2.488,,Qwen/Qwen1.5-7B,7.0,Base,2.872
75
  3.138,2.92,2.857,2.8,2.763,3.406,3.2,1.788,,microsoft/phi-2,2.7,Base,2.859
76
- 2.9,3.19,3.086,2.83,3.0,3.333,3.07,2.4,1.471,Qwen/Qwen1.5-4B-Chat,4.0,Chat,2.809
77
- 3.112,3.54,3.271,2.47,2.776,3.101,3.31,2.212,1.414,allenai/OLMo-7B-Instruct,7.0,Chat,2.801
78
- 2.875,3.24,3.114,2.48,2.882,3.754,3.15,1.962,1.657,google/gemma-2b-it,2.0,Chat,2.79
79
  2.988,2.97,2.743,2.75,2.816,2.971,2.84,2.088,,EleutherAI/llemma_34b,34.0,Base,2.771
80
  3.262,2.94,2.657,2.39,3.039,2.899,2.82,1.938,,meta-llama/Meta-Llama-3-8B,8.0,Base,2.743
 
81
  2.888,2.94,2.729,2.45,2.697,3.333,2.73,1.9,,Qwen/Qwen1.5-4B,4.0,Base,2.708
82
- 2.85,2.7,2.671,2.83,2.747,4.101,2.55,1.988,1.929,codellama/CodeLlama-70b-Instruct-hf,70.0,Chat,2.707
83
  2.85,3.09,2.786,2.28,2.579,3.348,2.88,1.812,,meta-llama/Llama-2-13b-hf,13.0,Base,2.703
84
- 2.95,3.27,2.957,2.4,2.684,3.333,2.93,2.088,1.186,allenai/OLMo-7B-SFT,7.0,Chat,2.644
85
  2.938,2.97,2.657,2.36,2.487,3.232,2.89,1.55,,01-ai/Yi-6B,6.0,Base,2.635
86
  2.938,2.62,2.557,2.44,2.507,2.841,2.44,2.4,,codellama/CodeLlama-70b-hf,70.0,Base,2.593
87
- 2.812,3.27,2.914,2.28,2.855,2.681,3.13,1.988,1.3,Qwen/Qwen1.5-1.8B-Chat,1.8,Chat,2.581
88
  2.812,2.66,2.486,2.17,2.566,2.725,2.59,2.062,,codellama/CodeLlama-34b-hf,34.0,Base,2.509
89
  2.475,2.89,2.5,2.24,2.526,2.87,2.95,1.525,,microsoft/phi-1_5,1.3,Base,2.497
 
90
  2.612,2.87,2.514,2.18,2.211,3.217,2.6,1.45,,meta-llama/Llama-2-7b-hf,7.0,Base,2.457
91
- 2.938,2.49,1.786,2.24,2.487,2.812,2.8,2.362,2.043,microsoft/Orca-2-13b,13.0,Chat,2.44
92
  2.538,2.85,2.386,1.98,2.605,2.478,2.55,1.525,,Qwen/Qwen1.5-1.8B,1.8,Base,2.364
93
  2.412,2.57,2.086,2.24,2.303,2.522,2.19,1.838,,EleutherAI/llemma_7b,7.0,Base,2.27
94
  2.338,2.72,2.357,2.16,2.093,2.623,2.32,1.488,,google/gemma-2b,2.0,Base,2.262
95
  2.3,2.3,1.957,2.01,2.092,2.449,2.15,1.812,,codellama/CodeLlama-13b-hf,13.0,Base,2.134
 
 
96
  2.388,2.26,1.929,1.84,2.105,2.652,2.16,1.312,,allenai/OLMo-7B,7.0,Base,2.081
97
- 2.425,2.27,1.371,1.85,2.316,2.594,2.24,1.6,1.729,microsoft/Orca-2-7b,7.0,Chat,2.044
98
- 2.2,2.61,2.057,1.76,2.0,2.391,2.38,1.462,1.159,Qwen/Qwen1.5-0.5B-Chat,0.5,Chat,2.002
99
  1.962,2.25,1.771,1.72,2.118,2.348,1.9,1.562,,codellama/CodeLlama-7b-hf,7.0,Base,1.954
100
  2.025,2.12,1.7,1.58,2.158,2.014,1.8,1.275,,Qwen/Qwen1.5-0.5B,0.5,Base,1.834
101
  1.762,1.8,1.443,1.33,1.947,2.188,1.59,1.125,,allenai/OLMo-1B,1.0,Base,1.648
102
- 1.288,1.45,1.471,1.25,1.908,1.667,1.38,1.162,1.129,CohereForAI/aya-101,13.0,Chat,1.412
103
  1.325,1.49,1.186,1.34,1.579,2.159,1.2,1.012,,google/gemma-7b,7.0,Base,1.411
104
  1.112,1.01,1.0,1.0,1.434,1.507,1.0,1.012,,microsoft/phi-1,1.3,Base,1.135
 
1
  Grounding ⚡️,Instruction Following 📝,Planning 📅,Reasoning 💡,Refinement 🔩,Safety ⚠️,Theory of Mind 🤔,Tool Usage 🛠️,Multilingual 🇬🇫,Model 🤗,Model Params (B),Model Type,Average
2
+ 4.288,4.23,4.271,4.22,4.171,4.565,4.24,3.775,3.6,gpt-4-1106-preview,,Proprietary,4.22
3
+ 4.3,4.2,4.357,4.16,4.145,4.174,4.26,3.925,3.543,gpt-4-0125-preview,,Proprietary,4.19
4
+ 4.238,4.26,4.357,4.21,4.079,4.058,4.08,3.85,3.643,gpt-4o-2024-05-13,,Proprietary,4.141
5
+ 4.312,4.13,4.3,4.2,4.105,4.087,4.12,3.8,3.471,gpt-4-turbo-2024-04-09,,Proprietary,4.132
6
+ 4.288,4.06,4.186,3.97,3.908,4.536,4.09,3.788,3.571,claude-3-opus-20240229,,Proprietary,4.103
7
+ 4.125,4.18,4.186,3.87,3.907,4.014,4.04,3.775,3.314,meta-llama/Meta-Llama-3-70B-Instruct,70.0,Chat,4.012
8
+ 4.25,3.92,4.171,3.91,3.724,4.362,4.0,3.75,3.186,claude-3-sonnet-20240229,,Proprietary,4.011
9
+ 4.15,4.01,4.229,3.94,3.882,4.043,3.99,3.588,2.771,qwen/qwen-110b-chat,110.0,Chat,3.979
10
+ 4.138,4.01,4.129,3.69,3.632,4.304,3.98,3.75,3.071,claude-3-haiku-20240307,,Proprietary,3.954
11
+ 4.05,4.04,4.129,4.06,3.671,4.116,4.07,3.488,3.257,gemini-pro-1.5,,Proprietary,3.953
12
+ 4.012,4.0,4.0,3.96,3.842,4.087,3.87,3.712,2.714,MaziyarPanahi/Mixtral-8x22B-Instruct-v0.1-AWQ,141,Chat,3.936
13
+ 3.962,3.94,4.029,3.95,3.776,4.058,3.9,3.862,2.929,mistral-medium,,Proprietary,3.935
14
+ 4.025,3.99,4.029,3.93,3.776,3.913,3.93,3.825,2.886,mistral-large,,Proprietary,3.927
15
+ 4.138,3.91,3.971,3.92,3.453,4.217,3.96,3.625,2.671,google/gemini-flash-1.5,,Proprietary,3.899
16
+ 3.988,4.0,4.186,3.64,3.461,3.971,3.94,3.525,2.757,alpindale/c4ai-command-r-plus-GPTQ,104,Chat,3.839
17
+ 3.888,3.99,4.029,3.68,3.632,3.957,3.96,3.525,2.914,Qwen/Qwen1.5-72B-Chat,72.0,Chat,3.832
18
+ 3.725,3.88,3.8,3.81,3.974,4.145,3.9,3.338,1.914,microsoft/Phi-3-mini-4k-instruct,3.8,Chat,3.821
19
+ 3.788,3.85,4.029,3.62,3.395,4.217,3.87,3.738,2.714,Qwen/Qwen1.5-32B-Chat,32.0,Chat,3.813
20
+ 3.8,3.84,4.0,3.56,3.547,3.87,3.87,3.562,2.271,Starling-LM-7B-beta,7.0,Chat,3.756
21
+ 4.125,3.94,3.929,3.47,3.507,3.725,3.83,3.5,2.914,meta-llama/Meta-Llama-3-8B-Instruct,8.0,Chat,3.753
22
+ 3.812,4.06,3.957,3.53,3.342,3.739,3.79,3.662,2.557,NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO,46.7,Chat,3.737
23
+ 3.738,3.83,3.914,3.57,3.676,3.884,3.96,3.038,2.186,01-ai/Yi-34B-Chat,34.0,Chat,3.701
24
+ 3.9,3.88,3.6,3.71,3.434,3.812,3.81,3.412,2.714,mistralai/Mixtral-8x7B-Instruct-v0.1,46.7,Chat,3.695
25
+ 3.925,3.85,3.843,3.65,3.434,3.884,3.79,3.138,2.614,gpt-3.5-turbo-0125,,Proprietary,3.689
26
+ 3.7,3.89,3.9,3.36,3.421,3.754,3.83,3.612,2.314,allenai/tulu-2-dpo-70b,70.0,Chat,3.683
27
+ 3.712,3.8,3.7,3.82,3.513,3.957,3.83,3.1,1.829,microsoft/Phi-3-mini-128k-instruct,3.8,Chat,3.679
28
+ 4.025,3.79,3.829,3.51,3.434,4.0,3.67,3.162,2.557,gpt-3.5-turbo-1106,,Proprietary,3.678
29
+ 3.812,3.88,3.9,3.39,3.447,3.899,3.9,3.188,2.186,CohereForAI/c4ai-command-r-v01,35.0,Chat,3.677
30
+ 3.812,3.77,3.857,3.42,3.382,3.826,3.9,3.412,2.443,upstage/SOLAR-10.7B-Instruct-v1.0,10.7,Chat,3.672
31
+ 3.662,3.88,3.929,3.22,3.36,4.377,3.73,3.188,2.386,meta-llama/Llama-2-70b-chat-hf,70.0,Chat,3.668
32
+ 3.6,3.84,3.871,3.62,3.373,3.942,3.75,3.125,3.186,gemini-1.0-pro,,Proprietary,3.64
33
+ 3.7,3.87,3.8,3.18,3.447,3.826,3.77,3.362,2.286,mistralai/Mistral-7B-Instruct-v0.2,7.0,Chat,3.619
34
  3.688,3.7,3.743,3.5,3.539,4.0,3.49,3.188,,mistral-community/Mixtral-8x22B-v0.1-AWQ,141,Base,3.606
35
+ 3.65,3.78,3.714,3.39,3.461,3.609,3.63,3.538,2.4,NousResearch/Nous-Hermes-2-Mixtral-8x7B-SFT,46.7,Chat,3.596
36
+ 3.638,3.84,3.757,3.34,3.566,3.725,3.66,3.125,2.157,openchat/openchat-3.5-0106,7.0,Chat,3.581
37
+ 3.55,3.62,3.957,3.52,3.618,3.449,3.58,3.288,2.586,MaziyarPanahi/zephyr-orpo-141b-A35b-v0.1-AWQ,141,Chat,3.573
38
+ 3.625,3.9,3.857,3.36,3.263,3.855,3.52,3.2,2.386,Qwen/Qwen1.5-14B-Chat,14.0,Chat,3.573
39
+ 3.588,3.88,3.714,3.3,3.395,3.725,3.7,3.15,2.057,Qwen/Qwen1.5-7B-Chat,7.0,Chat,3.556
40
+ 3.712,3.72,3.829,3.33,3.224,3.913,3.54,3.025,2.229,Starling-LM-7B-alpha,7.0,Chat,3.537
41
+ 3.55,3.72,3.729,3.23,3.382,3.551,3.73,3.288,1.943,HuggingFaceH4/zephyr-7b-beta,7.0,Chat,3.522
42
+ 3.662,3.74,3.8,3.26,3.355,3.377,3.69,3.062,2.171,NousResearch/Nous-Hermes-2-Mistral-7B-DPO,7.0,Chat,3.493
43
+ 3.338,3.65,3.643,3.53,3.373,3.536,3.56,3.175,2.071,NousResearch/Nous-Hermes-2-Yi-34B,34.0,Chat,3.476
44
+ 3.612,3.8,3.686,3.12,3.263,3.696,3.58,3.025,2.1,kaist-ai/mistral-orpo-beta,7.0,Chat,3.473
45
+ 3.662,3.92,3.686,2.76,3.079,4.319,3.71,2.6,2.114,meta-llama/Llama-2-13b-chat-hf,13.0,Chat,3.467
46
+ 3.688,3.66,3.729,3.28,3.276,3.435,3.57,3.062,2.1,teknium/OpenHermes-2.5-Mistral-7B,7.0,Chat,3.462
 
 
 
47
  3.712,3.58,3.5,3.3,3.237,3.87,3.59,2.775,,mistralai/Mixtral-8x7B-v0.1,46.7,Base,3.445
48
+ 3.525,3.7,3.6,3.11,3.171,3.971,3.5,2.95,2.086,kaist-ai/mistral-orpo-alpha,7.0,Chat,3.441
49
+ 3.45,3.77,3.6,2.9,3.184,3.841,3.59,3.05,2.143,allenai/tulu-2-dpo-13b,13.0,Chat,3.423
50
  3.488,3.6,3.5,3.25,3.227,3.942,3.38,2.988,,Qwen/Qwen1.5-72B,72.0,Base,3.422
51
+ 3.45,3.51,3.686,3.01,3.211,3.652,3.5,3.35,2.0,allenai/codetulu-2-34b,34.0,Chat,3.421
52
+ 3.588,3.53,3.371,3.25,3.25,4.043,3.44,2.788,2.0,google/gemma-1.1-7b-it,7.0,Chat,3.407
53
+ 3.525,3.66,3.8,3.28,3.28,3.232,3.45,2.925,1.914,teknium/OpenHermes-2-Mistral-7B,7.0,Chat,3.394
54
+ 3.5,3.5,3.457,3.04,3.079,4.13,3.46,2.738,2.114,codellama/CodeLlama-34b-Instruct-hf,34.0,Chat,3.363
55
  3.512,3.54,3.529,3.27,3.24,3.58,3.39,2.512,,01-ai/Yi-34B,34.0,Base,3.322
 
 
56
  3.425,3.56,3.386,3.06,3.133,3.87,3.48,2.625,,meta-llama/Llama-2-70b-hf,70.0,Base,3.317
 
57
  3.325,3.64,3.514,3.31,3.118,3.333,3.33,2.925,,Qwen/Qwen1.5-32B,32.0,Base,3.312
58
+ 3.388,3.58,3.586,2.85,2.961,4.145,3.65,2.3,2.029,meta-llama/Llama-2-7b-chat-hf,7.0,Chat,3.307
59
+ 3.238,3.76,3.5,2.79,3.079,3.754,3.68,2.438,1.971,allenai/tulu-2-dpo-7b,7.0,Chat,3.28
60
+ 3.225,3.5,3.4,2.8,3.197,3.29,3.38,3.238,1.886,allenai/codetulu-2-13b,13.0,Chat,3.254
 
 
61
  3.25,3.56,3.371,2.96,3.197,3.667,3.42,2.562,,upstage/SOLAR-10.7B-v1.0,10.7,Base,3.248
62
+ 3.15,3.38,3.4,2.8,3.027,3.768,3.39,2.775,2.029,allenai/tulu-2-13b,13.0,Chat,3.211
63
+ 3.262,3.34,3.357,2.77,2.895,4.043,3.38,2.6,1.886,codellama/CodeLlama-13b-Instruct-hf,13.0,Chat,3.206
64
+ 3.275,3.52,3.414,2.85,3.08,3.478,3.677,2.338,1.457,01-ai/Yi-6B-Chat,6.0,Chat,3.204
65
+ 3.212,3.36,3.286,2.75,2.961,3.754,3.22,2.575,1.771,codellama/CodeLlama-7b-Instruct-hf,7.0,Chat,3.14
66
+ 3.312,3.43,3.071,2.97,3.026,3.768,3.15,2.325,1.786,google/gemma-7b-it,7.0,Chat,3.132
67
  3.35,3.33,3.114,3.04,3.342,3.261,3.04,2.5,,meta-llama/Meta-Llama-3-70B,70.0,Base,3.122
68
  3.538,3.41,3.157,3.0,3.092,2.58,3.16,2.912,,Qwen/Qwen1.5-14B,14.0,Base,3.106
69
+ 2.9,3.34,3.229,2.74,3.053,3.971,3.37,1.975,1.471,google/gemma-1.1-2b-it,2.0,Chat,3.072
70
+ 3.112,3.41,3.114,2.73,2.908,3.246,3.25,2.788,1.8,allenai/codetulu-2-7b,7.0,Chat,3.07
71
+ 2.862,3.34,3.229,2.81,2.974,3.638,3.26,2.212,1.714,allenai/tulu-2-7b,7.0,Chat,3.041
72
  3.15,3.33,3.1,2.78,2.892,3.377,3.29,2.275,,mistral-community/Mistral-7B-v0.2,7.0,Base,3.024
 
73
  3.225,3.3,3.243,2.86,2.763,3.406,3.09,2.162,,mistralai/Mistral-7B-v0.1,7.0,Base,3.006
74
+ 2.9,3.19,3.086,2.83,3.0,3.333,3.07,2.4,1.471,Qwen/Qwen1.5-4B-Chat,4.0,Chat,2.976
75
+ 3.112,3.54,3.271,2.47,2.776,3.101,3.31,2.212,1.414,allenai/OLMo-7B-Instruct,7.0,Chat,2.974
76
+ 2.875,3.24,3.114,2.48,2.882,3.754,3.15,1.962,1.657,google/gemma-2b-it,2.0,Chat,2.932
 
 
77
  2.988,3.14,3.014,2.65,2.827,3.101,2.77,2.488,,Qwen/Qwen1.5-7B,7.0,Base,2.872
78
  3.138,2.92,2.857,2.8,2.763,3.406,3.2,1.788,,microsoft/phi-2,2.7,Base,2.859
79
+ 2.95,3.27,2.957,2.4,2.684,3.333,2.93,2.088,1.186,allenai/OLMo-7B-SFT,7.0,Chat,2.827
80
+ 2.85,2.7,2.671,2.83,2.747,4.101,2.55,1.988,1.929,codellama/CodeLlama-70b-Instruct-hf,70.0,Chat,2.805
 
81
  2.988,2.97,2.743,2.75,2.816,2.971,2.84,2.088,,EleutherAI/llemma_34b,34.0,Base,2.771
82
  3.262,2.94,2.657,2.39,3.039,2.899,2.82,1.938,,meta-llama/Meta-Llama-3-8B,8.0,Base,2.743
83
+ 2.812,3.27,2.914,2.28,2.855,2.681,3.13,1.988,1.3,Qwen/Qwen1.5-1.8B-Chat,1.8,Chat,2.741
84
  2.888,2.94,2.729,2.45,2.697,3.333,2.73,1.9,,Qwen/Qwen1.5-4B,4.0,Base,2.708
 
85
  2.85,3.09,2.786,2.28,2.579,3.348,2.88,1.812,,meta-llama/Llama-2-13b-hf,13.0,Base,2.703
 
86
  2.938,2.97,2.657,2.36,2.487,3.232,2.89,1.55,,01-ai/Yi-6B,6.0,Base,2.635
87
  2.938,2.62,2.557,2.44,2.507,2.841,2.44,2.4,,codellama/CodeLlama-70b-hf,70.0,Base,2.593
 
88
  2.812,2.66,2.486,2.17,2.566,2.725,2.59,2.062,,codellama/CodeLlama-34b-hf,34.0,Base,2.509
89
  2.475,2.89,2.5,2.24,2.526,2.87,2.95,1.525,,microsoft/phi-1_5,1.3,Base,2.497
90
+ 2.938,2.49,1.786,2.24,2.487,2.812,2.8,2.362,2.043,microsoft/Orca-2-13b,13.0,Chat,2.489
91
  2.612,2.87,2.514,2.18,2.211,3.217,2.6,1.45,,meta-llama/Llama-2-7b-hf,7.0,Base,2.457
 
92
  2.538,2.85,2.386,1.98,2.605,2.478,2.55,1.525,,Qwen/Qwen1.5-1.8B,1.8,Base,2.364
93
  2.412,2.57,2.086,2.24,2.303,2.522,2.19,1.838,,EleutherAI/llemma_7b,7.0,Base,2.27
94
  2.338,2.72,2.357,2.16,2.093,2.623,2.32,1.488,,google/gemma-2b,2.0,Base,2.262
95
  2.3,2.3,1.957,2.01,2.092,2.449,2.15,1.812,,codellama/CodeLlama-13b-hf,13.0,Base,2.134
96
+ 2.2,2.61,2.057,1.76,2.0,2.391,2.38,1.462,1.159,Qwen/Qwen1.5-0.5B-Chat,0.5,Chat,2.108
97
+ 2.425,2.27,1.371,1.85,2.316,2.594,2.24,1.6,1.729,microsoft/Orca-2-7b,7.0,Chat,2.083
98
  2.388,2.26,1.929,1.84,2.105,2.652,2.16,1.312,,allenai/OLMo-7B,7.0,Base,2.081
 
 
99
  1.962,2.25,1.771,1.72,2.118,2.348,1.9,1.562,,codellama/CodeLlama-7b-hf,7.0,Base,1.954
100
  2.025,2.12,1.7,1.58,2.158,2.014,1.8,1.275,,Qwen/Qwen1.5-0.5B,0.5,Base,1.834
101
  1.762,1.8,1.443,1.33,1.947,2.188,1.59,1.125,,allenai/OLMo-1B,1.0,Base,1.648
102
+ 1.288,1.45,1.471,1.25,1.908,1.667,1.38,1.162,1.129,CohereForAI/aya-101,13.0,Chat,1.447
103
  1.325,1.49,1.186,1.34,1.579,2.159,1.2,1.012,,google/gemma-7b,7.0,Base,1.411
104
  1.112,1.01,1.0,1.0,1.434,1.507,1.0,1.012,,microsoft/phi-1,1.3,Base,1.135
data/bgb-leaderboard-gpt-4-turbo-2024-04-09.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9c03b7873825ad92dee16b8ad4dc5a15d558208b50a4af1e3c8b390c8a1f789b
3
  size 13905
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a0184e00330d082c67abb29a01b3b1def3cd2c4525d6516dbc2eb1749b3baa4
3
  size 13905
data/bgb-leaderboard-prometheus-bgb-8x7b-v2.0.csv CHANGED
@@ -1,104 +1,104 @@
1
  Grounding ⚡️,Instruction Following 📝,Planning 📅,Reasoning 💡,Refinement 🔩,Safety ⚠️,Theory of Mind 🤔,Tool Usage 🛠️,Multilingual 🇬🇫,Model 🤗,Model Params (B),Model Type,Average
2
- 4.012,4.21,4.029,4.01,4.034,4.449,4.09,3.6,3.429,gpt-4-1106-preview,,Proprietary,3.985
3
- 4.175,4.14,4.1,3.98,3.789,4.235,4.06,3.788,3.414,gpt-4o-2024-05-13,,Proprietary,3.965
4
- 4.112,4.13,3.929,4.15,4.0,4.145,4.15,3.725,3.329,gpt-4-0125-preview,,Proprietary,3.963
5
- 4.112,4.09,3.986,3.92,3.862,4.116,4.06,3.688,3.357,gpt-4-turbo-2024-04-09,,Proprietary,3.91
6
- 4.075,3.88,4.157,3.8,3.741,4.435,4.05,3.425,3.357,claude-3-opus-20240229,,Proprietary,3.88
7
- 4.175,3.92,3.971,3.76,3.741,4.029,3.97,3.625,3.114,meta-llama/Meta-Llama-3-70B-Instruct,70.0,Chat,3.812
8
- 4.075,4.03,4.0,3.83,3.776,4.13,3.96,3.325,2.771,qwen/qwen-110b-chat,110.0,Chat,3.766
9
- 3.862,3.83,3.943,3.84,3.69,4.29,3.86,3.5,3.043,claude-3-sonnet-20240229,,Proprietary,3.762
10
- 3.925,3.91,3.843,3.82,3.552,4.116,3.91,3.688,2.971,mistral-medium,,Proprietary,3.748
11
- 4.0,3.94,3.957,3.58,3.569,4.275,3.93,3.538,2.871,claude-3-haiku-20240307,,Proprietary,3.74
12
- 3.875,3.88,3.871,3.83,3.5,4.145,4.01,3.288,3.1,gemini-pro-1.5,,Proprietary,3.722
13
- 3.9,3.83,3.757,3.66,3.638,3.957,3.94,3.712,2.871,mistral-large,,Proprietary,3.696
14
- 4.05,3.81,3.743,3.81,3.31,4.145,3.97,3.45,2.729,google/gemini-flash-1.5,,Proprietary,3.669
15
- 3.925,4.02,3.857,3.46,3.517,3.928,3.91,3.425,2.829,alpindale/c4ai-command-r-plus-GPTQ,104,Chat,3.652
16
- 3.812,3.96,3.771,3.6,3.379,4.043,3.84,3.45,2.757,MaziyarPanahi/Mixtral-8x22B-Instruct-v0.1-AWQ,141,Chat,3.624
17
- 3.712,3.92,3.771,3.53,3.586,4.101,3.92,3.425,2.629,Qwen/Qwen1.5-72B-Chat,72.0,Chat,3.622
18
- 3.85,3.75,3.814,3.3,3.345,3.928,3.71,3.362,3.043,meta-llama/Meta-Llama-3-8B-Instruct,8.0,Chat,3.567
19
- 3.775,3.86,3.8,3.44,3.534,3.986,3.91,3.325,2.429,Starling-LM-7B-beta,7.0,Chat,3.562
20
- 3.65,3.85,3.643,3.55,3.121,4.246,3.8,3.488,2.671,Qwen/Qwen1.5-32B-Chat,32.0,Chat,3.558
21
- 3.9,3.85,3.486,3.54,3.776,4.232,3.81,3.062,1.971,microsoft/Phi-3-mini-4k-instruct,3.8,Chat,3.514
22
- 3.65,3.89,3.571,3.45,3.138,4.014,3.78,3.2,2.743,mistralai/Mixtral-8x7B-Instruct-v0.1,46.7,Chat,3.493
23
- 3.8,3.86,3.757,3.43,3.259,3.957,3.64,2.988,2.586,gpt-3.5-turbo-0125,,Proprietary,3.475
24
- 3.812,3.75,3.714,3.41,3.241,4.087,3.65,3.0,2.586,gpt-3.5-turbo-1106,,Proprietary,3.472
25
- 3.562,3.65,3.629,3.48,3.069,3.884,3.74,3.062,2.986,gemini-1.0-pro,,Proprietary,3.451
26
- 3.638,3.8,3.8,3.17,3.155,3.826,3.7,3.5,2.4,allenai/tulu-2-dpo-70b,70.0,Chat,3.443
27
- 3.7,3.8,3.586,3.21,3.034,3.826,3.7,3.488,2.586,upstage/SOLAR-10.7B-Instruct-v1.0,10.7,Chat,3.437
28
- 3.662,3.84,3.671,3.24,3.155,3.783,3.71,3.338,2.529,NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO,46.7,Chat,3.436
 
 
 
 
 
 
 
 
 
29
  3.525,3.59,3.5,3.44,3.207,3.942,3.37,2.762,,mistral-community/Mixtral-8x22B-v0.1-AWQ,141,Base,3.417
30
- 3.612,3.72,3.657,2.98,3.155,4.464,3.79,2.888,2.429,meta-llama/Llama-2-70b-chat-hf,70.0,Chat,3.411
31
- 3.462,3.74,3.714,3.27,3.414,4.087,3.81,2.812,2.014,01-ai/Yi-34B-Chat,34.0,Chat,3.369
32
- 3.588,3.77,3.614,3.26,3.121,3.884,3.5,3.062,2.486,Qwen/Qwen1.5-14B-Chat,14.0,Chat,3.365
33
- 3.688,3.74,3.6,3.01,3.103,3.957,3.49,3.012,2.6,mistralai/Mistral-7B-Instruct-v0.2,7.0,Chat,3.356
34
- 3.712,3.72,3.643,3.14,3.19,4.014,3.88,2.95,1.957,CohereForAI/c4ai-command-r-v01,35.0,Chat,3.356
35
- 3.688,3.69,3.629,3.16,3.103,3.652,3.59,3.225,2.414,NousResearch/Nous-Hermes-2-Mixtral-8x7B-SFT,46.7,Chat,3.35
36
- 3.588,3.66,3.471,3.66,3.345,3.942,3.7,2.912,1.814,microsoft/Phi-3-mini-128k-instruct,3.8,Chat,3.344
37
- 3.525,3.76,3.514,3.26,3.31,3.841,3.61,2.888,2.314,openchat/openchat-3.5-0106,7.0,Chat,3.336
38
- 3.288,3.62,3.686,3.25,3.345,3.551,3.45,3.062,2.543,MaziyarPanahi/zephyr-orpo-141b-A35b-v0.1-AWQ,141,Chat,3.31
39
- 3.712,3.74,3.5,3.2,2.948,3.942,3.53,2.838,2.129,Starling-LM-7B-alpha,7.0,Chat,3.282
40
- 3.4,3.74,3.4,3.04,3.0,3.754,3.71,2.975,2.043,Qwen/Qwen1.5-7B-Chat,7.0,Chat,3.229
41
- 3.588,3.7,3.343,2.71,2.862,4.319,3.66,2.512,2.343,meta-llama/Llama-2-13b-chat-hf,13.0,Chat,3.226
 
42
  3.55,3.45,3.186,3.14,2.759,3.812,3.33,2.538,,mistralai/Mixtral-8x7B-v0.1,46.7,Base,3.22
43
- 3.462,3.66,3.429,2.97,2.931,3.899,3.54,2.812,2.129,kaist-ai/mistral-orpo-beta,7.0,Chat,3.204
44
  3.375,3.41,3.114,2.97,2.914,3.899,3.17,2.762,,Qwen/Qwen1.5-72B,72.0,Base,3.202
45
- 3.438,3.58,3.629,3.05,3.172,3.319,3.46,2.925,2.214,NousResearch/Nous-Hermes-2-Mistral-7B-DPO,7.0,Chat,3.199
46
- 3.388,3.56,3.443,2.86,3.103,4.029,3.45,2.825,2.114,kaist-ai/mistral-orpo-alpha,7.0,Chat,3.197
47
- 3.575,3.53,3.557,3.07,3.172,3.304,3.42,2.875,2.243,teknium/OpenHermes-2.5-Mistral-7B,7.0,Chat,3.194
48
- 3.488,3.56,3.314,3.12,3.052,4.072,3.44,2.675,2.029,google/gemma-1.1-7b-it,7.0,Chat,3.194
49
- 3.2,3.63,3.557,3.24,3.207,3.609,3.55,2.85,1.9,NousResearch/Nous-Hermes-2-Yi-34B,34.0,Chat,3.194
50
- 3.412,3.58,3.457,2.71,3.034,3.884,3.55,2.775,2.229,allenai/tulu-2-dpo-13b,13.0,Chat,3.181
51
- 3.388,3.4,3.414,3.01,3.138,3.725,3.43,3.075,2.014,allenai/codetulu-2-34b,34.0,Chat,3.177
52
- 3.375,3.56,3.5,3.0,2.897,3.522,3.5,3.05,1.957,HuggingFaceH4/zephyr-7b-beta,7.0,Chat,3.151
53
  3.488,3.37,3.186,3.05,2.879,3.681,3.21,2.162,,01-ai/Yi-34B,34.0,Base,3.128
54
  3.288,3.49,3.1,2.78,2.759,3.855,3.17,2.45,,meta-llama/Llama-2-70b-hf,70.0,Base,3.111
55
  3.125,3.52,3.143,2.99,2.81,3.536,3.07,2.638,,Qwen/Qwen1.5-32B,32.0,Base,3.104
56
- 3.438,3.62,3.371,2.64,2.741,4.261,3.58,2.175,2.086,meta-llama/Llama-2-7b-chat-hf,7.0,Chat,3.101
57
- 3.35,3.39,3.286,2.85,2.724,4.101,3.37,2.5,2.186,codellama/CodeLlama-34b-Instruct-hf,34.0,Chat,3.084
58
- 3.25,3.55,3.643,2.89,2.845,3.493,3.32,2.638,1.971,teknium/OpenHermes-2-Mistral-7B,7.0,Chat,3.067
59
  3.088,3.37,3.114,2.75,2.759,3.565,3.25,2.225,,upstage/SOLAR-10.7B-v1.0,10.7,Base,3.015
60
- 3.25,3.67,3.243,2.68,2.707,3.768,3.51,2.325,1.986,allenai/tulu-2-dpo-7b,7.0,Chat,3.015
61
- 3.012,3.31,3.271,2.68,2.707,3.841,3.2,2.325,2.057,allenai/tulu-2-13b,13.0,Chat,2.934
62
- 3.088,3.37,3.057,2.62,2.793,3.42,3.22,2.988,1.8,allenai/codetulu-2-13b,13.0,Chat,2.928
 
 
63
  3.388,3.3,2.914,2.72,2.862,2.623,3.06,2.55,,Qwen/Qwen1.5-14B,14.0,Base,2.927
64
- 3.038,3.2,3.157,2.59,2.483,3.971,3.21,2.312,2.157,codellama/CodeLlama-13b-Instruct-hf,13.0,Chat,2.902
65
  3.25,3.22,2.786,2.76,2.69,3.261,2.92,2.312,,meta-llama/Meta-Llama-3-70B,70.0,Base,2.9
66
- 3.15,3.34,2.814,2.91,2.828,3.652,3.17,2.2,1.657,google/gemma-7b-it,7.0,Chat,2.858
67
- 3.138,3.18,3.029,2.58,2.586,3.826,3.19,2.212,1.7,codellama/CodeLlama-7b-Instruct-hf,7.0,Chat,2.827
 
68
  2.938,3.23,2.914,2.68,2.466,3.406,2.9,1.975,,mistralai/Mistral-7B-v0.1,7.0,Base,2.814
69
- 2.912,3.29,3.029,2.55,2.707,4.13,3.25,1.675,1.657,google/gemma-1.1-2b-it,2.0,Chat,2.8
70
- 3.0,3.45,3.129,2.49,2.603,3.507,3.56,1.888,1.529,01-ai/Yi-6B-Chat,6.0,Chat,2.795
71
  3.025,3.24,2.786,2.58,2.483,3.203,3.07,1.862,,mistral-community/Mistral-7B-v0.2,7.0,Base,2.781
72
- 2.8,3.18,3.0,2.49,2.724,3.348,3.12,2.525,1.829,allenai/codetulu-2-7b,7.0,Chat,2.78
73
- 2.85,3.21,3.1,2.56,2.517,3.681,3.12,2.0,1.729,allenai/tulu-2-7b,7.0,Chat,2.752
74
  2.962,2.75,2.714,2.69,2.569,3.435,2.98,1.65,,microsoft/phi-2,2.7,Base,2.719
75
- 2.8,3.09,2.971,2.36,2.638,4.043,3.12,1.75,1.686,google/gemma-2b-it,2.0,Chat,2.718
 
76
  2.938,3.0,2.843,2.37,2.414,3.072,2.58,2.175,,Qwen/Qwen1.5-7B,7.0,Base,2.674
77
- 2.8,3.1,2.871,2.53,2.862,3.348,3.0,1.938,1.471,Qwen/Qwen1.5-4B-Chat,4.0,Chat,2.658
78
- 2.95,3.44,2.971,2.33,2.414,3.072,3.19,1.988,1.4,allenai/OLMo-7B-Instruct,7.0,Chat,2.639
79
- 2.925,2.51,2.386,2.62,2.448,4.217,2.56,1.738,1.757,codellama/CodeLlama-70b-Instruct-hf,70.0,Chat,2.573
80
- 2.862,3.13,2.886,2.33,2.259,3.507,2.95,1.725,1.229,allenai/OLMo-7B-SFT,7.0,Chat,2.542
81
  2.838,2.8,2.5,2.53,2.276,2.884,2.61,1.775,,EleutherAI/llemma_34b,34.0,Base,2.527
82
  2.762,3.01,2.6,2.15,2.138,3.217,2.65,1.512,,meta-llama/Llama-2-13b-hf,13.0,Base,2.505
83
  2.788,2.89,2.443,2.23,2.155,3.275,2.51,1.675,,Qwen/Qwen1.5-4B,4.0,Base,2.496
84
  2.975,2.81,2.314,2.27,2.362,2.913,2.64,1.65,,meta-llama/Meta-Llama-3-8B,8.0,Base,2.492
85
- 2.85,3.11,2.643,2.24,2.517,2.725,3.11,1.662,1.329,Qwen/Qwen1.5-1.8B-Chat,1.8,Chat,2.465
86
  2.775,2.76,2.557,2.3,2.052,3.043,2.74,1.412,,01-ai/Yi-6B,6.0,Base,2.455
 
87
  2.75,2.42,2.329,2.32,1.966,2.696,2.23,2.025,,codellama/CodeLlama-70b-hf,70.0,Base,2.342
88
  2.45,2.84,2.257,2.12,2.172,2.913,2.62,1.275,,microsoft/phi-1_5,1.3,Base,2.331
89
- 2.888,2.47,1.629,2.13,2.017,2.826,2.8,2.05,1.971,microsoft/Orca-2-13b,13.0,Chat,2.309
90
  2.462,2.87,2.257,2.05,1.793,3.159,2.4,1.262,,meta-llama/Llama-2-7b-hf,7.0,Base,2.282
91
  2.675,2.41,2.129,1.98,2.069,2.594,2.45,1.8,,codellama/CodeLlama-34b-hf,34.0,Base,2.263
92
  2.425,2.7,2.229,1.81,2.086,2.449,2.38,1.35,,Qwen/Qwen1.5-1.8B,1.8,Base,2.179
93
  2.25,2.65,2.086,1.94,1.862,2.638,2.31,1.288,,google/gemma-2b,2.0,Base,2.128
94
  2.238,2.46,1.829,1.97,1.897,2.522,2.03,1.612,,EleutherAI/llemma_7b,7.0,Base,2.07
95
- 2.288,2.26,1.314,1.72,1.81,2.623,2.25,1.338,1.843,microsoft/Orca-2-7b,7.0,Chat,1.938
 
96
  2.125,2.19,1.743,1.76,1.828,2.667,2.02,1.15,,allenai/OLMo-7B,7.0,Base,1.935
97
- 2.075,2.44,1.914,1.64,1.69,2.42,2.26,1.25,1.186,Qwen/Qwen1.5-0.5B-Chat,0.5,Chat,1.875
98
  2.1,2.06,1.757,1.71,1.621,2.275,1.89,1.588,,codellama/CodeLlama-13b-hf,13.0,Base,1.875
99
  1.75,2.05,1.471,1.59,1.534,2.261,1.79,1.375,,codellama/CodeLlama-7b-hf,7.0,Base,1.728
100
  1.925,2.04,1.6,1.51,1.5,1.957,1.72,1.188,,Qwen/Qwen1.5-0.5B,0.5,Base,1.68
101
  1.675,1.64,1.357,1.31,1.31,2.087,1.44,1.062,,allenai/OLMo-1B,1.0,Base,1.485
102
- 1.25,1.4,1.357,1.34,1.362,1.667,1.4,1.15,1.157,CohereForAI/aya-101,13.0,Chat,1.343
103
  1.375,1.46,1.214,1.22,1.034,1.928,1.19,1.012,,google/gemma-7b,7.0,Base,1.304
104
  1.038,1.01,1.0,1.0,1.017,1.377,1.0,1.012,,microsoft/phi-1,1.3,Base,1.057
 
1
  Grounding ⚡️,Instruction Following 📝,Planning 📅,Reasoning 💡,Refinement 🔩,Safety ⚠️,Theory of Mind 🤔,Tool Usage 🛠️,Multilingual 🇬🇫,Model 🤗,Model Params (B),Model Type,Average
2
+ 4.012,4.21,4.029,4.01,4.034,4.449,4.09,3.6,3.429,gpt-4-1106-preview,,Proprietary,4.054
3
+ 4.112,4.13,3.929,4.15,4.0,4.145,4.15,3.725,3.329,gpt-4-0125-preview,,Proprietary,4.043
4
+ 4.175,4.14,4.1,3.98,3.789,4.235,4.06,3.788,3.414,gpt-4o-2024-05-13,,Proprietary,4.033
5
+ 4.112,4.09,3.986,3.92,3.862,4.116,4.06,3.688,3.357,gpt-4-turbo-2024-04-09,,Proprietary,3.979
6
+ 4.075,3.88,4.157,3.8,3.741,4.435,4.05,3.425,3.357,claude-3-opus-20240229,,Proprietary,3.945
7
+ 4.175,3.92,3.971,3.76,3.741,4.029,3.97,3.625,3.114,meta-llama/Meta-Llama-3-70B-Instruct,70.0,Chat,3.899
8
+ 4.075,4.03,4.0,3.83,3.776,4.13,3.96,3.325,2.771,qwen/qwen-110b-chat,110.0,Chat,3.891
9
+ 3.862,3.83,3.943,3.84,3.69,4.29,3.86,3.5,3.043,claude-3-sonnet-20240229,,Proprietary,3.852
10
+ 4.0,3.94,3.957,3.58,3.569,4.275,3.93,3.538,2.871,claude-3-haiku-20240307,,Proprietary,3.849
11
+ 3.925,3.91,3.843,3.82,3.552,4.116,3.91,3.688,2.971,mistral-medium,,Proprietary,3.845
12
+ 3.875,3.88,3.871,3.83,3.5,4.145,4.01,3.288,3.1,gemini-pro-1.5,,Proprietary,3.8
13
+ 3.9,3.83,3.757,3.66,3.638,3.957,3.94,3.712,2.871,mistral-large,,Proprietary,3.799
14
+ 4.05,3.81,3.743,3.81,3.31,4.145,3.97,3.45,2.729,google/gemini-flash-1.5,,Proprietary,3.786
15
+ 3.925,4.02,3.857,3.46,3.517,3.928,3.91,3.425,2.829,alpindale/c4ai-command-r-plus-GPTQ,104,Chat,3.755
16
+ 3.712,3.92,3.771,3.53,3.586,4.101,3.92,3.425,2.629,Qwen/Qwen1.5-72B-Chat,72.0,Chat,3.746
17
+ 3.812,3.96,3.771,3.6,3.379,4.043,3.84,3.45,2.757,MaziyarPanahi/Mixtral-8x22B-Instruct-v0.1-AWQ,141,Chat,3.732
18
+ 3.9,3.85,3.486,3.54,3.776,4.232,3.81,3.062,1.971,microsoft/Phi-3-mini-4k-instruct,3.8,Chat,3.707
19
+ 3.775,3.86,3.8,3.44,3.534,3.986,3.91,3.325,2.429,Starling-LM-7B-beta,7.0,Chat,3.704
20
+ 3.65,3.85,3.643,3.55,3.121,4.246,3.8,3.488,2.671,Qwen/Qwen1.5-32B-Chat,32.0,Chat,3.668
21
+ 3.85,3.75,3.814,3.3,3.345,3.928,3.71,3.362,3.043,meta-llama/Meta-Llama-3-8B-Instruct,8.0,Chat,3.632
22
+ 3.65,3.89,3.571,3.45,3.138,4.014,3.78,3.2,2.743,mistralai/Mixtral-8x7B-Instruct-v0.1,46.7,Chat,3.587
23
+ 3.8,3.86,3.757,3.43,3.259,3.957,3.64,2.988,2.586,gpt-3.5-turbo-0125,,Proprietary,3.586
24
+ 3.812,3.75,3.714,3.41,3.241,4.087,3.65,3.0,2.586,gpt-3.5-turbo-1106,,Proprietary,3.583
25
+ 3.638,3.8,3.8,3.17,3.155,3.826,3.7,3.5,2.4,allenai/tulu-2-dpo-70b,70.0,Chat,3.574
26
+ 3.662,3.84,3.671,3.24,3.155,3.783,3.71,3.338,2.529,NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO,46.7,Chat,3.55
27
+ 3.7,3.8,3.586,3.21,3.034,3.826,3.7,3.488,2.586,upstage/SOLAR-10.7B-Instruct-v1.0,10.7,Chat,3.543
28
+ 3.462,3.74,3.714,3.27,3.414,4.087,3.81,2.812,2.014,01-ai/Yi-34B-Chat,34.0,Chat,3.539
29
+ 3.588,3.66,3.471,3.66,3.345,3.942,3.7,2.912,1.814,microsoft/Phi-3-mini-128k-instruct,3.8,Chat,3.535
30
+ 3.612,3.72,3.657,2.98,3.155,4.464,3.79,2.888,2.429,meta-llama/Llama-2-70b-chat-hf,70.0,Chat,3.533
31
+ 3.712,3.72,3.643,3.14,3.19,4.014,3.88,2.95,1.957,CohereForAI/c4ai-command-r-v01,35.0,Chat,3.531
32
+ 3.562,3.65,3.629,3.48,3.069,3.884,3.74,3.062,2.986,gemini-1.0-pro,,Proprietary,3.51
33
+ 3.588,3.77,3.614,3.26,3.121,3.884,3.5,3.062,2.486,Qwen/Qwen1.5-14B-Chat,14.0,Chat,3.475
34
+ 3.688,3.69,3.629,3.16,3.103,3.652,3.59,3.225,2.414,NousResearch/Nous-Hermes-2-Mixtral-8x7B-SFT,46.7,Chat,3.467
35
+ 3.525,3.76,3.514,3.26,3.31,3.841,3.61,2.888,2.314,openchat/openchat-3.5-0106,7.0,Chat,3.463
36
+ 3.688,3.74,3.6,3.01,3.103,3.957,3.49,3.012,2.6,mistralai/Mistral-7B-Instruct-v0.2,7.0,Chat,3.45
37
+ 3.712,3.74,3.5,3.2,2.948,3.942,3.53,2.838,2.129,Starling-LM-7B-alpha,7.0,Chat,3.426
38
  3.525,3.59,3.5,3.44,3.207,3.942,3.37,2.762,,mistral-community/Mixtral-8x22B-v0.1-AWQ,141,Base,3.417
39
+ 3.288,3.62,3.686,3.25,3.345,3.551,3.45,3.062,2.543,MaziyarPanahi/zephyr-orpo-141b-A35b-v0.1-AWQ,141,Chat,3.406
40
+ 3.4,3.74,3.4,3.04,3.0,3.754,3.71,2.975,2.043,Qwen/Qwen1.5-7B-Chat,7.0,Chat,3.377
41
+ 3.2,3.63,3.557,3.24,3.207,3.609,3.55,2.85,1.9,NousResearch/Nous-Hermes-2-Yi-34B,34.0,Chat,3.355
42
+ 3.488,3.56,3.314,3.12,3.052,4.072,3.44,2.675,2.029,google/gemma-1.1-7b-it,7.0,Chat,3.34
43
+ 3.462,3.66,3.429,2.97,2.931,3.899,3.54,2.812,2.129,kaist-ai/mistral-orpo-beta,7.0,Chat,3.338
44
+ 3.588,3.7,3.343,2.71,2.862,4.319,3.66,2.512,2.343,meta-llama/Llama-2-13b-chat-hf,13.0,Chat,3.337
45
+ 3.388,3.56,3.443,2.86,3.103,4.029,3.45,2.825,2.114,kaist-ai/mistral-orpo-alpha,7.0,Chat,3.332
46
+ 3.438,3.58,3.629,3.05,3.172,3.319,3.46,2.925,2.214,NousResearch/Nous-Hermes-2-Mistral-7B-DPO,7.0,Chat,3.322
47
+ 3.388,3.4,3.414,3.01,3.138,3.725,3.43,3.075,2.014,allenai/codetulu-2-34b,34.0,Chat,3.322
48
+ 3.575,3.53,3.557,3.07,3.172,3.304,3.42,2.875,2.243,teknium/OpenHermes-2.5-Mistral-7B,7.0,Chat,3.313
49
+ 3.412,3.58,3.457,2.71,3.034,3.884,3.55,2.775,2.229,allenai/tulu-2-dpo-13b,13.0,Chat,3.3
50
+ 3.375,3.56,3.5,3.0,2.897,3.522,3.5,3.05,1.957,HuggingFaceH4/zephyr-7b-beta,7.0,Chat,3.3
51
+ 3.438,3.62,3.371,2.64,2.741,4.261,3.58,2.175,2.086,meta-llama/Llama-2-7b-chat-hf,7.0,Chat,3.228
52
  3.55,3.45,3.186,3.14,2.759,3.812,3.33,2.538,,mistralai/Mixtral-8x7B-v0.1,46.7,Base,3.22
53
+ 3.25,3.55,3.643,2.89,2.845,3.493,3.32,2.638,1.971,teknium/OpenHermes-2-Mistral-7B,7.0,Chat,3.203
54
  3.375,3.41,3.114,2.97,2.914,3.899,3.17,2.762,,Qwen/Qwen1.5-72B,72.0,Base,3.202
55
+ 3.35,3.39,3.286,2.85,2.724,4.101,3.37,2.5,2.186,codellama/CodeLlama-34b-Instruct-hf,34.0,Chat,3.196
56
+ 3.25,3.67,3.243,2.68,2.707,3.768,3.51,2.325,1.986,allenai/tulu-2-dpo-7b,7.0,Chat,3.144
 
 
 
 
 
 
57
  3.488,3.37,3.186,3.05,2.879,3.681,3.21,2.162,,01-ai/Yi-34B,34.0,Base,3.128
58
  3.288,3.49,3.1,2.78,2.759,3.855,3.17,2.45,,meta-llama/Llama-2-70b-hf,70.0,Base,3.111
59
  3.125,3.52,3.143,2.99,2.81,3.536,3.07,2.638,,Qwen/Qwen1.5-32B,32.0,Base,3.104
60
+ 3.088,3.37,3.057,2.62,2.793,3.42,3.22,2.988,1.8,allenai/codetulu-2-13b,13.0,Chat,3.069
61
+ 3.012,3.31,3.271,2.68,2.707,3.841,3.2,2.325,2.057,allenai/tulu-2-13b,13.0,Chat,3.043
 
62
  3.088,3.37,3.114,2.75,2.759,3.565,3.25,2.225,,upstage/SOLAR-10.7B-v1.0,10.7,Base,3.015
63
+ 3.15,3.34,2.814,2.91,2.828,3.652,3.17,2.2,1.657,google/gemma-7b-it,7.0,Chat,3.008
64
+ 3.038,3.2,3.157,2.59,2.483,3.971,3.21,2.312,2.157,codellama/CodeLlama-13b-Instruct-hf,13.0,Chat,2.995
65
+ 3.138,3.18,3.029,2.58,2.586,3.826,3.19,2.212,1.7,codellama/CodeLlama-7b-Instruct-hf,7.0,Chat,2.968
66
+ 3.0,3.45,3.129,2.49,2.603,3.507,3.56,1.888,1.529,01-ai/Yi-6B-Chat,6.0,Chat,2.953
67
+ 2.912,3.29,3.029,2.55,2.707,4.13,3.25,1.675,1.657,google/gemma-1.1-2b-it,2.0,Chat,2.943
68
  3.388,3.3,2.914,2.72,2.862,2.623,3.06,2.55,,Qwen/Qwen1.5-14B,14.0,Base,2.927
 
69
  3.25,3.22,2.786,2.76,2.69,3.261,2.92,2.312,,meta-llama/Meta-Llama-3-70B,70.0,Base,2.9
70
+ 2.8,3.18,3.0,2.49,2.724,3.348,3.12,2.525,1.829,allenai/codetulu-2-7b,7.0,Chat,2.898
71
+ 2.85,3.21,3.1,2.56,2.517,3.681,3.12,2.0,1.729,allenai/tulu-2-7b,7.0,Chat,2.88
72
+ 2.8,3.09,2.971,2.36,2.638,4.043,3.12,1.75,1.686,google/gemma-2b-it,2.0,Chat,2.847
73
  2.938,3.23,2.914,2.68,2.466,3.406,2.9,1.975,,mistralai/Mistral-7B-v0.1,7.0,Base,2.814
74
+ 2.8,3.1,2.871,2.53,2.862,3.348,3.0,1.938,1.471,Qwen/Qwen1.5-4B-Chat,4.0,Chat,2.806
75
+ 2.95,3.44,2.971,2.33,2.414,3.072,3.19,1.988,1.4,allenai/OLMo-7B-Instruct,7.0,Chat,2.794
76
  3.025,3.24,2.786,2.58,2.483,3.203,3.07,1.862,,mistral-community/Mistral-7B-v0.2,7.0,Base,2.781
 
 
77
  2.962,2.75,2.714,2.69,2.569,3.435,2.98,1.65,,microsoft/phi-2,2.7,Base,2.719
78
+ 2.862,3.13,2.886,2.33,2.259,3.507,2.95,1.725,1.229,allenai/OLMo-7B-SFT,7.0,Chat,2.706
79
+ 2.925,2.51,2.386,2.62,2.448,4.217,2.56,1.738,1.757,codellama/CodeLlama-70b-Instruct-hf,70.0,Chat,2.675
80
  2.938,3.0,2.843,2.37,2.414,3.072,2.58,2.175,,Qwen/Qwen1.5-7B,7.0,Base,2.674
81
+ 2.85,3.11,2.643,2.24,2.517,2.725,3.11,1.662,1.329,Qwen/Qwen1.5-1.8B-Chat,1.8,Chat,2.607
 
 
 
82
  2.838,2.8,2.5,2.53,2.276,2.884,2.61,1.775,,EleutherAI/llemma_34b,34.0,Base,2.527
83
  2.762,3.01,2.6,2.15,2.138,3.217,2.65,1.512,,meta-llama/Llama-2-13b-hf,13.0,Base,2.505
84
  2.788,2.89,2.443,2.23,2.155,3.275,2.51,1.675,,Qwen/Qwen1.5-4B,4.0,Base,2.496
85
  2.975,2.81,2.314,2.27,2.362,2.913,2.64,1.65,,meta-llama/Meta-Llama-3-8B,8.0,Base,2.492
 
86
  2.775,2.76,2.557,2.3,2.052,3.043,2.74,1.412,,01-ai/Yi-6B,6.0,Base,2.455
87
+ 2.888,2.47,1.629,2.13,2.017,2.826,2.8,2.05,1.971,microsoft/Orca-2-13b,13.0,Chat,2.351
88
  2.75,2.42,2.329,2.32,1.966,2.696,2.23,2.025,,codellama/CodeLlama-70b-hf,70.0,Base,2.342
89
  2.45,2.84,2.257,2.12,2.172,2.913,2.62,1.275,,microsoft/phi-1_5,1.3,Base,2.331
 
90
  2.462,2.87,2.257,2.05,1.793,3.159,2.4,1.262,,meta-llama/Llama-2-7b-hf,7.0,Base,2.282
91
  2.675,2.41,2.129,1.98,2.069,2.594,2.45,1.8,,codellama/CodeLlama-34b-hf,34.0,Base,2.263
92
  2.425,2.7,2.229,1.81,2.086,2.449,2.38,1.35,,Qwen/Qwen1.5-1.8B,1.8,Base,2.179
93
  2.25,2.65,2.086,1.94,1.862,2.638,2.31,1.288,,google/gemma-2b,2.0,Base,2.128
94
  2.238,2.46,1.829,1.97,1.897,2.522,2.03,1.612,,EleutherAI/llemma_7b,7.0,Base,2.07
95
+ 2.075,2.44,1.914,1.64,1.69,2.42,2.26,1.25,1.186,Qwen/Qwen1.5-0.5B-Chat,0.5,Chat,1.961
96
+ 2.288,2.26,1.314,1.72,1.81,2.623,2.25,1.338,1.843,microsoft/Orca-2-7b,7.0,Chat,1.95
97
  2.125,2.19,1.743,1.76,1.828,2.667,2.02,1.15,,allenai/OLMo-7B,7.0,Base,1.935
 
98
  2.1,2.06,1.757,1.71,1.621,2.275,1.89,1.588,,codellama/CodeLlama-13b-hf,13.0,Base,1.875
99
  1.75,2.05,1.471,1.59,1.534,2.261,1.79,1.375,,codellama/CodeLlama-7b-hf,7.0,Base,1.728
100
  1.925,2.04,1.6,1.51,1.5,1.957,1.72,1.188,,Qwen/Qwen1.5-0.5B,0.5,Base,1.68
101
  1.675,1.64,1.357,1.31,1.31,2.087,1.44,1.062,,allenai/OLMo-1B,1.0,Base,1.485
102
+ 1.25,1.4,1.357,1.34,1.362,1.667,1.4,1.15,1.157,CohereForAI/aya-101,13.0,Chat,1.366
103
  1.375,1.46,1.214,1.22,1.034,1.928,1.19,1.012,,google/gemma-7b,7.0,Base,1.304
104
  1.038,1.01,1.0,1.0,1.017,1.377,1.0,1.012,,microsoft/phi-1,1.3,Base,1.057
data/bgb-leaderboard-prometheus-bgb-8x7b-v2.0.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:54f03922163c1c21e91e3c3e11d29b02c7218be0b5547c6461e0a1d49f8bd68c
3
  size 13905
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5dbb80ce656870e6a890fdcb9c11384fe4dbd2b7a72b7683092ad0958566f1a
3
  size 13905
src/llm_perf.py CHANGED
@@ -181,7 +181,7 @@ def get_eval_df(eval_model_name: str):
181
  ]
182
 
183
  # Make the average of the capabilities
184
- eval_df["average"] = eval_df[capabilities].mean(axis=1)
185
 
186
  # Round to 3 decimal places for capabilities and average
187
  eval_df = eval_df.round(
 
181
  ]
182
 
183
  # Make the average of the capabilities
184
+ eval_df["average"] = eval_df[capabilities[:-1]].mean(axis=1)
185
 
186
  # Round to 3 decimal places for capabilities and average
187
  eval_df = eval_df.round(