dsorokin committed on
Commit 43b5eac
• 1 Parent(s): 16aecb3
Files changed (1)
  1. app.py +46 -190
app.py CHANGED
@@ -17,29 +17,17 @@ basic_component_values = [None] * 6
 leader_component_values = [None] * 5
 
 
-def make_default_md(arena_df, elo_results):
-    total_votes = sum(arena_df["num_battles"]) // 2
-    total_models = len(arena_df)
-
+def make_default_md():
     leaderboard_md = f"""
-# 🏆 LMSYS Chatbot Arena Leaderboard
-| [Vote](https://chat.lmsys.org) | [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) |
-
-LMSYS [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) is a crowdsourced open platform for LLM evals.
-We've collected over **200,000** human preference votes to rank LLMs with the Elo ranking system.
+# 🏆 BabilongLeaderboard
+| [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) |
 """
     return leaderboard_md
 
 
-def make_arena_leaderboard_md(arena_df):
-    total_votes = sum(arena_df["num_battles"]) // 2
-    total_models = len(arena_df)
-
-    leaderboard_md = f"""
-Total #models: **{total_models}**. Total #votes: **{total_votes}**. Last updated: Feb 15, 2024.
-
-Contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)! Find more analysis in the [notebook]({notebook_url}).
-"""
+def make_arena_leaderboard_md():
+    total_models = 'UNK'
+    leaderboard_md = f"""Total #models: **{total_models}**. Last updated: Feb 28, 2024."""
     return leaderboard_md
 
 
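The new `make_arena_leaderboard_md` hardcodes `total_models = 'UNK'`. If a leaderboard CSV is wired back in later, the count could be derived from the table instead; a minimal sketch, assuming a hypothetical `leaderboard_table_file` argument and pandas (neither is part of this commit):

```python
import pandas as pd

def make_arena_leaderboard_md(leaderboard_table_file=None):
    # Hypothetical parameter: the committed version takes no arguments
    # and always reports 'UNK'.
    if leaderboard_table_file:
        total_models = len(pd.read_csv(leaderboard_table_file))  # one row per model
    else:
        total_models = "UNK"  # fallback mirroring the committed behavior
    return f"""Total #models: **{total_models}**. Last updated: Feb 28, 2024."""
```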
@@ -201,171 +189,45 @@ def get_full_table(arena_df, model_table_df):
         values.append(row)
     values.sort(key=lambda x: -x[1] if not np.isnan(x[1]) else 1e9)
     return values
+
 
-
-def get_arena_table(arena_df, model_table_df):
-    # sort by rating
-    arena_df = arena_df.sort_values(by=["rating"], ascending=False)
-    values = []
-    for i in range(len(arena_df)):
-        row = []
-        model_key = arena_df.index[i]
-        model_name = model_table_df[model_table_df["key"] == model_key]["Model"].values[
-            0
-        ]
-
-        # rank
-        row.append(i + 1)
-        # model display name
-        row.append(model_name)
-        # elo rating
-        row.append(round(arena_df.iloc[i]["rating"]))
-        upper_diff = round(
-            arena_df.iloc[i]["rating_q975"] - arena_df.iloc[i]["rating"]
-        )
-        lower_diff = round(
-            arena_df.iloc[i]["rating"] - arena_df.iloc[i]["rating_q025"]
-        )
-        row.append(f"+{upper_diff}/-{lower_diff}")
-        # num battles
-        row.append(round(arena_df.iloc[i]["num_battles"]))
-        # Organization
-        row.append(
-            model_table_df[model_table_df["key"] == model_key]["Organization"].values[0]
-        )
-        # license
-        row.append(
-            model_table_df[model_table_df["key"] == model_key]["License"].values[0]
-        )
-
-        cutoff_date = model_table_df[model_table_df["key"] == model_key]["Knowledge cutoff date"].values[0]
-        if cutoff_date == "-":
-            row.append("Unknown")
-        else:
-            row.append(cutoff_date)
-        values.append(row)
-    return values
-
-def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
-    if elo_results_file is None:  # Do live update
-        default_md = "Loading ..."
-        p1 = p2 = p3 = p4 = None
-    else:
-        with open(elo_results_file, "rb") as fin:
-            elo_results = pickle.load(fin)
-
-        p1 = elo_results["win_fraction_heatmap"]
-        p2 = elo_results["battle_count_heatmap"]
-        p3 = elo_results["bootstrap_elo_rating"]
-        p4 = elo_results["average_win_rate_bar"]
-        arena_df = elo_results["leaderboard_table_df"]
-        default_md = make_default_md(arena_df, elo_results)
-
+def build_leaderboard_tab():
+    default_md = make_default_md()
     md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
-    if leaderboard_table_file:
-        data = load_leaderboard_table_csv(leaderboard_table_file)
-        model_table_df = pd.DataFrame(data)
-
-        with gr.Tabs() as tabs:
-            # arena table
-            arena_table_vals = get_arena_table(arena_df, model_table_df)
-            with gr.Tab("Arena Elo", id=0):
-                md = make_arena_leaderboard_md(arena_df)
-                gr.Markdown(md, elem_id="leaderboard_markdown")
-                gr.Dataframe(
-                    headers=[
-                        "Rank",
-                        "🤖 Model",
-                        "⭐ Arena Elo",
-                        "📊 95% CI",
-                        "🗳️ Votes",
-                        "Organization",
-                        "License",
-                        "Knowledge Cutoff",
-                    ],
-                    datatype=[
-                        "str",
-                        "markdown",
-                        "number",
-                        "str",
-                        "number",
-                        "str",
-                        "str",
-                        "str",
-                    ],
-                    value=arena_table_vals,
-                    elem_id="arena_leaderboard_dataframe",
-                    height=700,
-                    column_widths=[50, 200, 120, 100, 100, 150, 150, 100],
-                    wrap=True,
-                )
-            with gr.Tab("Full Leaderboard", id=1):
-                md = make_full_leaderboard_md(elo_results)
-                gr.Markdown(md, elem_id="leaderboard_markdown")
-                full_table_vals = get_full_table(arena_df, model_table_df)
-                gr.Dataframe(
-                    headers=[
-                        "🤖 Model",
-                        "⭐ Arena Elo",
-                        "📈 MT-bench",
-                        "📚 MMLU",
-                        "Organization",
-                        "License",
-                    ],
-                    datatype=["markdown", "number", "number", "number", "str", "str"],
-                    value=full_table_vals,
-                    elem_id="full_leaderboard_dataframe",
-                    column_widths=[200, 100, 100, 100, 150, 150],
-                    height=700,
-                    wrap=True,
-                )
-        if not show_plot:
-            gr.Markdown(
-                """ ## Visit our [HF space](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard) for more analysis!
-        If you want to see more models, please help us [add them](https://github.com/lm-sys/FastChat/blob/main/docs/arena.md#how-to-add-a-new-model).
-        """,
-                elem_id="leaderboard_markdown",
+
+    with gr.Tabs() as tabs:
+        # arena table
+        with gr.Tab("Arena Elo", id=0):
+            md = make_arena_leaderboard_md()
+            gr.Markdown(md, elem_id="leaderboard_markdown")
+            gr.Dataframe(
+                headers=[
+                    "Rank",
+                    "🤖 Model",
+                    "⭐ Arena Elo",
+                    "📊 95% CI",
+                    "🗳️ Votes",
+                    "Organization",
+                    "License",
+                    "Knowledge Cutoff",
+                ],
+                datatype=[
+                    "str",
+                    "markdown",
+                    "number",
+                    "str",
+                    "number",
+                    "str",
+                    "str",
+                    "str",
+                ],
+                # value=arena_table_vals,
+                elem_id="arena_leaderboard_dataframe",
+                height=700,
+                column_widths=[50, 200, 120, 100, 100, 150, 150, 100],
+                wrap=True,
             )
-    else:
-        pass
-
-    leader_component_values[:] = [default_md, p1, p2, p3, p4]
-
-    if show_plot:
-        gr.Markdown(
-            f"""## More Statistics for Chatbot Arena\n
-Below are figures for more statistics. The code for generating them is also included in this [notebook]({notebook_url}).
-You can find more discussions in this blog [post](https://lmsys.org/blog/2023-12-07-leaderboard/).
-""",
-            elem_id="leaderboard_markdown"
-        )
-        with gr.Row():
-            with gr.Column():
-                gr.Markdown(
-                    "#### Figure 1: Fraction of Model A Wins for All Non-tied A vs. B Battles"
-                )
-                plot_1 = gr.Plot(p1, show_label=False)
-            with gr.Column():
-                gr.Markdown(
-                    "#### Figure 2: Battle Count for Each Combination of Models (without Ties)"
-                )
-                plot_2 = gr.Plot(p2, show_label=False)
-        with gr.Row():
-            with gr.Column():
-                gr.Markdown(
-                    "#### Figure 3: Bootstrap of Elo Estimates (1000 Rounds of Random Sampling)"
-                )
-                plot_3 = gr.Plot(p3, show_label=False)
-            with gr.Column():
-                gr.Markdown(
-                    "#### Figure 4: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)"
-                )
-                plot_4 = gr.Plot(p4, show_label=False)
-
-    gr.Markdown(acknowledgment_md)
-
-    if show_plot:
-        return [md_1, plot_1, plot_2, plot_3, plot_4]
+
     return [md_1]
 
 block_css = """
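With `value=arena_table_vals` commented out, the Arena Elo `gr.Dataframe` now renders empty. For reference, rows can be supplied later as a list of lists aligned positionally with the eight declared headers; a minimal self-contained sketch with made-up data (none of it from this commit):

```python
import gradio as gr

# Hypothetical rows; each entry lines up positionally with the headers below.
example_rows = [
    [1, "model-a", 1200, "+5/-4", 1000, "SomeOrg", "MIT", "2023/10"],
    [2, "model-b", 1150, "+6/-5", 800, "OtherOrg", "Apache-2.0", "Unknown"],
]

with gr.Blocks() as demo:
    gr.Dataframe(
        headers=["Rank", "🤖 Model", "⭐ Arena Elo", "📊 95% CI",
                 "🗳️ Votes", "Organization", "License", "Knowledge Cutoff"],
        datatype=["str", "markdown", "number", "str",
                  "number", "str", "str", "str"],
        value=example_rows,  # restoring value= in app.py would work the same way
        wrap=True,
    )

demo.launch()
```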
@@ -418,32 +280,26 @@ acknowledgment_md = """
 </div>
 """
 
-def build_demo(elo_results_file, leaderboard_table_file):
+def build_demo(leaderboard_table_file):
     text_size = gr.themes.sizes.text_lg
 
     with gr.Blocks(
-        title="Chatbot Arena Leaderboard",
+        title="Babilong leaderboard",
         theme=gr.themes.Base(text_size=text_size),
         css=block_css,
     ) as demo:
-        leader_components = build_leaderboard_tab(
-            elo_results_file, leaderboard_table_file, show_plot=True
-        )
+        # build_leaderboard_tab() takes no arguments after this change
+        leader_components = build_leaderboard_tab()
     return demo
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--share", action="store_true")
-    args = parser.parse_args()
-
     elo_result_files = glob.glob("elo_results_*.pkl")
-    # elo_result_files.sort(key=lambda x: int(x[12:-4]))
-    # elo_result_file = elo_result_files[-1]
 
     leaderboard_table_files = glob.glob("leaderboard_table_*.csv")
     # leaderboard_table_files.sort(key=lambda x: int(x[18:-4]))
     # leaderboard_table_file = leaderboard_table_files[-1]
 
-    demo = build_demo(None, None)
-    demo.launch(share=args.share)
+    demo = build_demo(None)  # build_demo() now takes a single argument
+    demo.launch(share=True)
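Condensed, the post-commit control flow reduces to the following self-contained sketch (markdown bodies trimmed, `block_css` and `acknowledgment_md` omitted; an illustration, not the full app.py):

```python
import gradio as gr

def make_default_md():
    return "# 🏆 BabilongLeaderboard"

def make_arena_leaderboard_md():
    return "Total #models: **UNK**. Last updated: Feb 28, 2024."

def build_leaderboard_tab():
    md_1 = gr.Markdown(make_default_md(), elem_id="leaderboard_markdown")
    with gr.Tabs():
        with gr.Tab("Arena Elo", id=0):
            gr.Markdown(make_arena_leaderboard_md(), elem_id="leaderboard_markdown")
            gr.Dataframe(headers=["Rank", "🤖 Model"], datatype=["str", "markdown"])
    return [md_1]

def build_demo(leaderboard_table_file):
    # Accepted but currently unused; __main__ passes None.
    with gr.Blocks(title="Babilong leaderboard") as demo:
        build_leaderboard_tab()
    return demo

if __name__ == "__main__":
    build_demo(None).launch(share=True)
```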
 