human_eval_llm_leaderboard

Runtime error

App Files Files Community

human_eval_llm_leaderboard / app.py

Clémentine

init

728a44a over 1 year ago

raw

history blame contribute delete

No virus

5.5 kB

	import json
	import os
	from datetime import datetime, timezone


	import gradio as gr
	import numpy as np
	import pandas as pd
	from apscheduler.schedulers.background import BackgroundScheduler
	from huggingface_hub import HfApi

	from src.assets.text_content import *
	from src.elo_leaderboard.load_results import get_elo_plots, get_elo_results_dicts
	from src.assets.css_html_js import custom_css, get_window_url_params # left in case you need them
	from src.utils_display import EloEvalColumn, fields, styled_error, styled_warning, styled_message
	from src.init import load_all_info_from_hub

	# Runtime configuration: Hub token, source repositories and feature flags.
	def _env_flag(name, default):
	    """Read a boolean flag from the environment variable `name`.

	    Returns `default` when the variable is unset. Otherwise the value is
	    false for "", "0", "false", "no" and "off" (case-insensitive, surrounding
	    whitespace ignored) and true for anything else.
	    """
	    raw = os.environ.get(name)
	    if raw is None:
	        return default
	    return raw.strip().lower() not in {"", "0", "false", "no", "off"}


	H4_TOKEN = os.environ.get("H4_TOKEN")
	HUMAN_EVAL_REPO = "HuggingFaceH4/scale-human-eval"
	GPT_4_EVAL_REPO = "HuggingFaceH4/open_llm_leaderboard_oai_evals"
	# bool() on the raw string made IS_PUBLIC=False / IS_PUBLIC=0 evaluate to True,
	# because any non-empty string is truthy; parse the text instead.
	IS_PUBLIC = _env_flag("IS_PUBLIC", True)
	# When false, the plot section of the UI is not built.
	ADD_PLOTS = False

	EVAL_REQUESTS_PATH = "auto_evals/eval_requests"

	# Shared Hub client used by restart_space().
	api = HfApi()


	def restart_space():
	    """Request a restart of the Space named below, authenticating with H4_TOKEN."""
	    # NOTE(review): this targets open_llm_leaderboard, while the page header
	    # names this Space human_eval_llm_leaderboard -- confirm the intended repo id.
	    space_id = "HuggingFaceH4/open_llm_leaderboard"
	    api.restart_space(repo_id=space_id, token=H4_TOKEN)

	# Repo handles for the two evaluation sources; human_eval_repo is checked for
	# truthiness before being pulled in get_elo_leaderboard().
	human_eval_repo, gpt_4_eval_repo = load_all_info_from_hub(
	    HUMAN_EVAL_REPO, GPT_4_EVAL_REPO
	)

	# Column names and display types of the Elo tables, taken from EloEvalColumn.
	_ELO_FIELDS = list(fields(EloEvalColumn))
	ELO_COLS = [column.name for column in _ELO_FIELDS]
	ELO_TYPES = [column.type for column in _ELO_FIELDS]
	# The leaderboards are ordered by this column (descending).
	ELO_SORT_COL = EloEvalColumn.gpt4.name


	def has_no_nan_values(df, columns):
	    """Return a row-wise boolean mask: True where none of `columns` is missing."""
	    selected = df[columns]
	    is_present = selected.notna()
	    return is_present.all(axis=1)


	def has_nan_values(df, columns):
	    """Return a row-wise boolean mask: True where any of `columns` is missing."""
	    selected = df[columns]
	    is_missing = selected.isna()
	    return is_missing.any(axis=1)


	def get_elo_leaderboard(df_instruct, df_code_instruct, tie_allowed=False):
	    """Build the Elo leaderboard table from the two evaluation DataFrames.

	    Pulls the human-eval repo first when one is available, then turns the
	    records from get_elo_results_dicts() into a DataFrame sorted by
	    ELO_SORT_COL (highest first) and restricted to ELO_COLS.
	    """
	    if human_eval_repo:
	        print("Pulling human_eval_repo changes")
	        human_eval_repo.git_pull()

	    records = get_elo_results_dicts(df_instruct, df_code_instruct, tie_allowed)
	    ranked = pd.DataFrame.from_records(records).sort_values(
	        by=ELO_SORT_COL, ascending=False
	    )
	    return ranked[ELO_COLS]


	def get_elo_elements():
	    """Load the human-eval JSON files and build everything the UI displays.

	    Returns a 6-tuple: the leaderboard without ties, the leaderboard with
	    ties allowed, and the four plots from get_elo_plots().
	    """
	    instruct = pd.read_json("human_evals/without_code.json")
	    code_instruct = pd.read_json("human_evals/with_code.json")

	    # Same order as before: the no-tie table is built first, then the tie one.
	    strict_board = get_elo_leaderboard(instruct, code_instruct, tie_allowed=False)
	    tie_board = get_elo_leaderboard(instruct, code_instruct, tie_allowed=True)

	    # Plots are always computed with ties disallowed.
	    fig_1, fig_2, fig_3, fig_4 = get_elo_plots(
	        instruct, code_instruct, tie_allowed=False
	    )

	    return (strict_board, tie_board, fig_1, fig_2, fig_3, fig_4)

	(
	elo_leaderboard,
	elo_leaderboard_with_tie_allowed,
	plot_1,
	plot_2,
	plot_3,
	plot_4,
	) = get_elo_elements()


	demo = gr.Blocks(css=custom_css)
	with demo:
	gr.HTML(TITLE)
	with gr.Row():
	gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

	with gr.Column():
	with gr.Row():
	with gr.Column(scale=2):
	gr.Markdown(HUMAN_GPT_EVAL_TEXT, elem_classes="markdown-text")
	with gr.Column(scale=1):
	gr.Image(
	"src/assets/scale-hf-logo.png", elem_id="scale-logo", show_label=False
	)
	gr.Markdown("## No tie allowed")
	elo_leaderboard_table = gr.components.Dataframe(
	value=elo_leaderboard,
	headers=ELO_COLS,
	datatype=ELO_TYPES,
	max_rows=5,
	)

	gr.Markdown("## Tie allowed*")
	elo_leaderboard_table_with_tie_allowed = gr.components.Dataframe(
	value=elo_leaderboard_with_tie_allowed,
	headers=ELO_COLS,
	datatype=ELO_TYPES,
	max_rows=5,
	)

	gr.Markdown(
	"\* Results when the scores of 4 and 5 were treated as ties.",
	elem_classes="markdown-text",
	)

	gr.Markdown(
	"Let us know in [this discussion](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/65) which models we should add!",
	elem_id="models-to-add-text",
	)

	if ADD_PLOTS:
	with gr.Box():
	visualization_title = gr.HTML(VISUALIZATION_TITLE)
	with gr.Row():
	with gr.Column():
	gr.Markdown(f"#### Figure 1: {PLOT_1_TITLE}")
	plot_1 = gr.Plot(plot_1, show_label=False)
	with gr.Column():
	gr.Markdown(f"#### Figure 2: {PLOT_2_TITLE}")
	plot_2 = gr.Plot(plot_2, show_label=False)
	with gr.Row():
	with gr.Column():
	gr.Markdown(f"#### Figure 3: {PLOT_3_TITLE}")
	plot_3 = gr.Plot(plot_3, show_label=False)
	with gr.Column():
	gr.Markdown(f"#### Figure 4: {PLOT_4_TITLE}")
	plot_4 = gr.Plot(plot_4, show_label=False)

	with gr.Row():
	with gr.Column():
	with gr.Accordion("📙 Citation", open=False):
	citation_button = gr.Textbox(
	value=CITATION_BUTTON_TEXT,
	label=CITATION_BUTTON_LABEL,
	elem_id="citation-button",
	).style(show_copy_button=True)
	with gr.Column():
	with gr.Accordion("✨ CHANGELOG", open=False):
	changelog = gr.Markdown(CHANGELOG_TEXT, elem_id="changelog-text")



	# Call restart_space() every 3600 seconds from a background scheduler.
	scheduler = BackgroundScheduler()
	scheduler.add_job(restart_space, trigger="interval", seconds=60 * 60)
	scheduler.start()

	# Turn on the request queue (concurrency_count=40) and start serving the app.
	demo.queue(concurrency_count=40).launch()