Spaces:
Running
Running
update
Browse files
app.py
CHANGED
@@ -17,29 +17,17 @@ basic_component_values = [None] * 6
|
|
17 |
leader_component_values = [None] * 5
|
18 |
|
19 |
|
20 |
-
def make_default_md(
|
21 |
-
total_votes = sum(arena_df["num_battles"]) // 2
|
22 |
-
total_models = len(arena_df)
|
23 |
-
|
24 |
leaderboard_md = f"""
|
25 |
-
# 🏆
|
26 |
-
| [
|
27 |
-
|
28 |
-
LMSYS [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) is a crowdsourced open platform for LLM evals.
|
29 |
-
We've collected over **200,000** human preference votes to rank LLMs with the Elo ranking system.
|
30 |
"""
|
31 |
return leaderboard_md
|
32 |
|
33 |
|
34 |
-
def make_arena_leaderboard_md(
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
leaderboard_md = f"""
|
39 |
-
Total #models: **{total_models}**. Total #votes: **{total_votes}**. Last updated: Feb 15, 2024.
|
40 |
-
|
41 |
-
Contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)! Find more analysis in the [notebook]({notebook_url}).
|
42 |
-
"""
|
43 |
return leaderboard_md
|
44 |
|
45 |
|
@@ -201,171 +189,45 @@ def get_full_table(arena_df, model_table_df):
|
|
201 |
values.append(row)
|
202 |
values.sort(key=lambda x: -x[1] if not np.isnan(x[1]) else 1e9)
|
203 |
return values
|
|
|
204 |
|
205 |
-
|
206 |
-
|
207 |
-
# sort by rating
|
208 |
-
arena_df = arena_df.sort_values(by=["rating"], ascending=False)
|
209 |
-
values = []
|
210 |
-
for i in range(len(arena_df)):
|
211 |
-
row = []
|
212 |
-
model_key = arena_df.index[i]
|
213 |
-
model_name = model_table_df[model_table_df["key"] == model_key]["Model"].values[
|
214 |
-
0
|
215 |
-
]
|
216 |
-
|
217 |
-
# rank
|
218 |
-
row.append(i + 1)
|
219 |
-
# model display name
|
220 |
-
row.append(model_name)
|
221 |
-
# elo rating
|
222 |
-
row.append(round(arena_df.iloc[i]["rating"]))
|
223 |
-
upper_diff = round(
|
224 |
-
arena_df.iloc[i]["rating_q975"] - arena_df.iloc[i]["rating"]
|
225 |
-
)
|
226 |
-
lower_diff = round(
|
227 |
-
arena_df.iloc[i]["rating"] - arena_df.iloc[i]["rating_q025"]
|
228 |
-
)
|
229 |
-
row.append(f"+{upper_diff}/-{lower_diff}")
|
230 |
-
# num battles
|
231 |
-
row.append(round(arena_df.iloc[i]["num_battles"]))
|
232 |
-
# Organization
|
233 |
-
row.append(
|
234 |
-
model_table_df[model_table_df["key"] == model_key]["Organization"].values[0]
|
235 |
-
)
|
236 |
-
# license
|
237 |
-
row.append(
|
238 |
-
model_table_df[model_table_df["key"] == model_key]["License"].values[0]
|
239 |
-
)
|
240 |
-
|
241 |
-
cutoff_date = model_table_df[model_table_df["key"] == model_key]["Knowledge cutoff date"].values[0]
|
242 |
-
if cutoff_date == "-":
|
243 |
-
row.append("Unknown")
|
244 |
-
else:
|
245 |
-
row.append(cutoff_date)
|
246 |
-
values.append(row)
|
247 |
-
return values
|
248 |
-
|
249 |
-
def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
|
250 |
-
if elo_results_file is None: # Do live update
|
251 |
-
default_md = "Loading ..."
|
252 |
-
p1 = p2 = p3 = p4 = None
|
253 |
-
else:
|
254 |
-
with open(elo_results_file, "rb") as fin:
|
255 |
-
elo_results = pickle.load(fin)
|
256 |
-
|
257 |
-
p1 = elo_results["win_fraction_heatmap"]
|
258 |
-
p2 = elo_results["battle_count_heatmap"]
|
259 |
-
p3 = elo_results["bootstrap_elo_rating"]
|
260 |
-
p4 = elo_results["average_win_rate_bar"]
|
261 |
-
arena_df = elo_results["leaderboard_table_df"]
|
262 |
-
default_md = make_default_md(arena_df, elo_results)
|
263 |
-
|
264 |
md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
-
|
296 |
-
|
297 |
-
elem_id="arena_leaderboard_dataframe",
|
298 |
-
height=700,
|
299 |
-
column_widths=[50, 200, 120, 100, 100, 150, 150, 100],
|
300 |
-
wrap=True,
|
301 |
-
)
|
302 |
-
with gr.Tab("Full Leaderboard", id=1):
|
303 |
-
md = make_full_leaderboard_md(elo_results)
|
304 |
-
gr.Markdown(md, elem_id="leaderboard_markdown")
|
305 |
-
full_table_vals = get_full_table(arena_df, model_table_df)
|
306 |
-
gr.Dataframe(
|
307 |
-
headers=[
|
308 |
-
"🤖 Model",
|
309 |
-
"⭐ Arena Elo",
|
310 |
-
"📈 MT-bench",
|
311 |
-
"📚 MMLU",
|
312 |
-
"Organization",
|
313 |
-
"License",
|
314 |
-
],
|
315 |
-
datatype=["markdown", "number", "number", "number", "str", "str"],
|
316 |
-
value=full_table_vals,
|
317 |
-
elem_id="full_leaderboard_dataframe",
|
318 |
-
column_widths=[200, 100, 100, 100, 150, 150],
|
319 |
-
height=700,
|
320 |
-
wrap=True,
|
321 |
-
)
|
322 |
-
if not show_plot:
|
323 |
-
gr.Markdown(
|
324 |
-
""" ## Visit our [HF space](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard) for more analysis!
|
325 |
-
If you want to see more models, please help us [add them](https://github.com/lm-sys/FastChat/blob/main/docs/arena.md#how-to-add-a-new-model).
|
326 |
-
""",
|
327 |
-
elem_id="leaderboard_markdown",
|
328 |
)
|
329 |
-
|
330 |
-
pass
|
331 |
-
|
332 |
-
leader_component_values[:] = [default_md, p1, p2, p3, p4]
|
333 |
-
|
334 |
-
if show_plot:
|
335 |
-
gr.Markdown(
|
336 |
-
f"""## More Statistics for Chatbot Arena\n
|
337 |
-
Below are figures for more statistics. The code for generating them is also included in this [notebook]({notebook_url}).
|
338 |
-
You can find more discussions in this blog [post](https://lmsys.org/blog/2023-12-07-leaderboard/).
|
339 |
-
""",
|
340 |
-
elem_id="leaderboard_markdown"
|
341 |
-
)
|
342 |
-
with gr.Row():
|
343 |
-
with gr.Column():
|
344 |
-
gr.Markdown(
|
345 |
-
"#### Figure 1: Fraction of Model A Wins for All Non-tied A vs. B Battles"
|
346 |
-
)
|
347 |
-
plot_1 = gr.Plot(p1, show_label=False)
|
348 |
-
with gr.Column():
|
349 |
-
gr.Markdown(
|
350 |
-
"#### Figure 2: Battle Count for Each Combination of Models (without Ties)"
|
351 |
-
)
|
352 |
-
plot_2 = gr.Plot(p2, show_label=False)
|
353 |
-
with gr.Row():
|
354 |
-
with gr.Column():
|
355 |
-
gr.Markdown(
|
356 |
-
"#### Figure 3: Bootstrap of Elo Estimates (1000 Rounds of Random Sampling)"
|
357 |
-
)
|
358 |
-
plot_3 = gr.Plot(p3, show_label=False)
|
359 |
-
with gr.Column():
|
360 |
-
gr.Markdown(
|
361 |
-
"#### Figure 4: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)"
|
362 |
-
)
|
363 |
-
plot_4 = gr.Plot(p4, show_label=False)
|
364 |
-
|
365 |
-
gr.Markdown(acknowledgment_md)
|
366 |
-
|
367 |
-
if show_plot:
|
368 |
-
return [md_1, plot_1, plot_2, plot_3, plot_4]
|
369 |
return [md_1]
|
370 |
|
371 |
block_css = """
|
@@ -418,32 +280,26 @@ acknowledgment_md = """
|
|
418 |
</div>
|
419 |
"""
|
420 |
|
421 |
-
def build_demo(
|
422 |
text_size = gr.themes.sizes.text_lg
|
423 |
|
424 |
with gr.Blocks(
|
425 |
-
title="
|
426 |
theme=gr.themes.Base(text_size=text_size),
|
427 |
css=block_css,
|
428 |
) as demo:
|
429 |
leader_components = build_leaderboard_tab(
|
430 |
-
|
431 |
)
|
432 |
return demo
|
433 |
|
434 |
|
435 |
if __name__ == "__main__":
|
436 |
-
parser = argparse.ArgumentParser()
|
437 |
-
parser.add_argument("--share", action="store_true")
|
438 |
-
args = parser.parse_args()
|
439 |
-
|
440 |
elo_result_files = glob.glob("elo_results_*.pkl")
|
441 |
-
# elo_result_files.sort(key=lambda x: int(x[12:-4]))
|
442 |
-
# elo_result_file = elo_result_files[-1]
|
443 |
|
444 |
leaderboard_table_files = glob.glob("leaderboard_table_*.csv")
|
445 |
# leaderboard_table_files.sort(key=lambda x: int(x[18:-4]))
|
446 |
# leaderboard_table_file = leaderboard_table_files[-1]
|
447 |
|
448 |
demo = build_demo(None, None)
|
449 |
-
demo.launch(share=
|
|
|
17 |
leader_component_values = [None] * 5
|
18 |
|
19 |
|
20 |
+
def make_default_md():
    """Build the Markdown banner shown at the top of the leaderboard page.

    Returns:
        str: A Markdown snippet with the page heading followed by one
        table-style row of GitHub / Paper / Dataset links.
    """
    # The heading emoji is written as a real Unicode character (it had been
    # mangled into a lone Greek "pi" by an encoding round-trip). Nothing is
    # interpolated, so a plain string literal is used instead of an f-string.
    leaderboard_md = """
# 🏆 BabilongLeaderboard
| [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) |
"""
    return leaderboard_md
|
26 |
|
27 |
|
28 |
+
def make_arena_leaderboard_md():
    """Return the one-line summary shown above the arena results table.

    Returns:
        str: Markdown text reporting the model count and the last-updated
        date. The count is a fixed ``'UNK'`` placeholder.
    """
    summary_template = "Total #models: **{total_models}**. Last updated: Feb 28, 2024."
    # The model count is not computed here; a placeholder string is shown.
    return summary_template.format(total_models='UNK')
|
32 |
|
33 |
|
|
|
189 |
values.append(row)
|
190 |
values.sort(key=lambda x: -x[1] if not np.isnan(x[1]) else 1e9)
|
191 |
return values
|
192 |
+
|
193 |
|
194 |
+
def build_leaderboard_tab():
    """Lay out the leaderboard widgets.

    Creates the banner Markdown, then a tab group with a single "Arena Elo"
    tab holding a summary line and the results table. Meant to be called
    inside an open ``gr.Blocks`` context, as ``build_demo`` does.

    Returns:
        list: A one-element list containing the banner ``gr.Markdown``
        component.
    """
    default_md = make_default_md()
    md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")

    with gr.Tabs():
        # arena table
        with gr.Tab("Arena Elo", id=0):
            md = make_arena_leaderboard_md()
            gr.Markdown(md, elem_id="leaderboard_markdown")
            # NOTE(review): no ``value`` is passed, so no rows are supplied
            # to the table here; the data source still has to be wired in.
            gr.Dataframe(
                # Header emoji are real Unicode characters; they had been
                # mangled into Greek-letter mojibake.
                headers=[
                    "Rank",
                    "🤖 Model",
                    "⭐ Arena Elo",
                    "📊 95% CI",
                    "🗳️ Votes",
                    "Organization",
                    "License",
                    "Knowledge Cutoff",
                ],
                # One datatype per header, in the same order.
                datatype=[
                    "str",
                    "markdown",
                    "number",
                    "str",
                    "number",
                    "str",
                    "str",
                    "str",
                ],
                elem_id="arena_leaderboard_dataframe",
                height=700,
                column_widths=[50, 200, 120, 100, 100, 150, 150, 100],
                wrap=True,
            )

    return [md_1]
|
232 |
|
233 |
block_css = """
|
|
|
280 |
</div>
|
281 |
"""
|
282 |
|
283 |
+
def build_demo(leaderboard_table_file):
    """Assemble the top-level Gradio app for the leaderboard.

    Args:
        leaderboard_table_file: Kept so existing callers keep working; it is
            not consumed here because ``build_leaderboard_tab`` takes no
            arguments.

    Returns:
        The ``gr.Blocks`` application object; the caller is expected to
        ``launch()`` it.
    """
    text_size = gr.themes.sizes.text_lg

    with gr.Blocks(
        title="Babilong leaderboard",
        theme=gr.themes.Base(text_size=text_size),
        css=block_css,
    ) as demo:
        # build_leaderboard_tab() is declared without parameters; forwarding
        # the table file and show_plot=True raised TypeError. Its return value
        # was never used, so it is no longer bound to a name.
        build_leaderboard_tab()
    return demo
|
295 |
|
296 |
|
297 |
if __name__ == "__main__":
    # Result and table files are globbed, but nothing in this entry point
    # consumes the lists yet.
    elo_result_files = glob.glob("elo_results_*.pkl")
    leaderboard_table_files = glob.glob("leaderboard_table_*.csv")

    # build_demo() takes exactly one argument (the leaderboard table file);
    # the previous two-positional call raised TypeError at startup.
    demo = build_demo(None)
    # share=True asks Gradio to create a public share link.
    demo.launch(share=True)
|