import os
from datetime import datetime, timedelta
from functools import lru_cache
from typing import Any, List

import gradio as gr
import httpx
import pandas as pd
import plotly.express as px
import polars as pl
from cachetools import TTLCache, cached
from datasets import Dataset, load_dataset
from dotenv import load_dotenv
from httpx import Client
from toolz import concat, frequencies
from tqdm.auto import tqdm

load_dotenv()

token = os.environ["HUGGINGFACE_TOKEN"]
user_agent = os.environ["USER_AGENT"]
user = os.environ["USER_TO_TRACK"]
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

assert token
assert user_agent
assert user

headers = {"user-agent": user_agent, "authorization": f"Bearer {token}"}
limits = httpx.Limits(max_keepalive_connections=10, max_connections=20)
client = Client(headers=headers, http2=True, limits=limits, timeout=60.0)


@lru_cache(maxsize=None)
def get_hub_community_activity(user: str, max_items: int = 200_000) -> List[Any]:
    """Page through the Hub's recent-activity API, 100 discussions at a time."""
    all_data = []
    with tqdm() as pbar:
        i = 1
        while i <= max_items:
            try:
                r = client.get(
                    f"https://huggingface.co/api/recent-activity?limit=100&type=discussion&skip={i}&user={user}",
                )
                activity = r.json()["recentActivity"]
                if not activity:
                    break
                # all_data holds one page (a list of up to 100 items) per append
                all_data.append(activity)
                if len(all_data) % 1000 == 0:
                    pbar.write(f"Length of all_data: {len(all_data)}")
                i += 100
                pbar.update(100)
            except Exception as e:
                print(e)
                i += 100  # skip past the failing page instead of retrying it forever
    return list(concat(all_data))


# Earlier, simpler implementation kept for reference:
# def get_hub_community_activity(user: str) -> List[Any]:
#     all_data = []
#     for i in range(1, 2000, 100):
#         r = httpx.get(
#             f"https://huggingface.co/api/recent-activity?limit=100&type=discussion&skip={i}&user={user}"
#         )
#         activity = r.json()["recentActivity"]
#         all_data.append(activity)
#     return list(concat(all_data))


def parse_date_time(date_time: str) -> datetime:
    return datetime.strptime(date_time, "%Y-%m-%dT%H:%M:%S.%fZ")


def parse_pr_data(data):
    """Flatten one recent-activity item into a tabular record."""
    data = data["discussionData"]
    createdAt = parse_date_time(data["createdAt"])
    pr_number = data["num"]
    status = data["status"]
    repo_id = data["repo"]["name"]
    repo_type = data["repo"]["type"]
    isPullRequest = data["isPullRequest"]
    return {
        "createdAt": createdAt,
        "pr_number": pr_number,
        "status": status,
        "repo_id": repo_id,
        "type": repo_type,
        "isPullRequest": isPullRequest,
    }

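# For reference, parse_pr_data turns one activity item into a flat record
# shaped like the following (field values here are illustrative, not real
# API output):
#
#     {
#         "createdAt": datetime(2023, 3, 1, 12, 0),
#         "pr_number": 1,
#         "status": "merged",
#         "repo_id": "some-user/some-repo",
#         "type": "model",
#         "isPullRequest": True,
#     }
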
@cached(cache=TTLCache(maxsize=1000, ttl=timedelta(minutes=30), timer=datetime.now))
def update_data():
    try:
        previous_df = pl.DataFrame(
            load_dataset(f"librarian-bot/{user}-stats", split="train").data.table
        )
    except FileNotFoundError:
        previous_df = pl.DataFrame()
    data = get_hub_community_activity(user)
    data = [parse_pr_data(d) for d in data]
    update_df = pl.DataFrame(data)
    # pl.concat raises on mismatched schemas, so skip it when there is no previous data
    if previous_df.is_empty():
        df = update_df.unique()
    else:
        df = pl.concat([previous_df, update_df]).unique()
    # only push to the Hub when the update added new rows
    if len(df) != len(previous_df):
        Dataset(df.to_arrow()).push_to_hub(f"{user}-stats", token=token)
    return df


# Earlier polars-based implementation kept for reference:
# def get_pr_status():
#     df = update_data()
#     df = df.filter(pl.col("isPullRequest"))
#     return df.select(pl.col("status").value_counts())


@lru_cache(maxsize=512)
def get_pr_status(user: str):
    all_data = get_hub_community_activity(user)
    pr_data = (
        x["discussionData"] for x in all_data if x["discussionData"]["isPullRequest"]
    )
    return frequencies(x["status"] for x in pr_data)


def create_pie():
    # local name avoids shadowing toolz.frequencies
    status_counts = get_pr_status(user)
    df = pd.DataFrame(
        {"status": status_counts.keys(), "number": status_counts.values()}
    )
    return px.pie(df, values="number", names="status", template="seaborn")


def group_status_by_pr_number():
    """Average PR number for each status (a rough proxy for recency)."""
    all_data = get_hub_community_activity(user)
    all_data = [parse_pr_data(d) for d in all_data]
    return (
        pl.DataFrame(all_data).groupby("status").agg(pl.mean("pr_number")).to_pandas()
    )


def plot_over_time():
    all_data = get_hub_community_activity(user)
    all_data = [parse_pr_data(d) for d in all_data]
    df = pl.DataFrame(all_data).with_columns(pl.col("createdAt").cast(pl.Date))
    # pivot to one column per status value, counting PRs created on each day
    df = df.pivot(
        values=["status"],
        index=["createdAt"],
        columns=["status"],
        aggregate_function="count",
    )
    df = df.fill_null(0)
    df = df.with_columns(pl.sum(["open", "closed", "merged"])).sort("createdAt")
    # cumulative counts over time; the horizontal "sum" column is excluded from the plot
    df = df.to_pandas().set_index("createdAt").cumsum()
    return px.line(df, x=df.index, y=[c for c in df.columns if c != "sum"])


# build the pie chart once at startup so the activity cache is warm before the UI loads
create_pie()

with gr.Blocks() as demo:
    gr.Markdown(f"# {user} PR Stats")
    gr.Markdown(f"Total PRs and issues opened by {user}: {len(update_data()):,}")
    with gr.Column():
        gr.Markdown("## Pull request status")
        gr.Markdown(
            "The pie chart below shows the share of pull requests opened by"
            f" {user} that are open, closed, or merged."
        )
        gr.Plot(create_pie())
    with gr.Column():
        gr.Markdown("## Pull requests opened, closed and merged over time (cumulative)")
        gr.Plot(plot_over_time())
    with gr.Column():
        gr.Markdown("## Mean PR number by status")
        gr.DataFrame(group_status_by_pr_number())

demo.launch(debug=True)
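# Local usage sketch (the filename app.py is an assumption; adjust as needed):
# put HUGGINGFACE_TOKEN, USER_AGENT and USER_TO_TRACK in a .env file next to
# the script (load_dotenv picks it up), then run:
#
#     python app.py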