{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "res_path = '../results'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "p = \"/home/jovyan/rmt/babilong-leaderboard/data/BABILong NeurIPS24 Figs - leaderboard.csv\"\n",
    "res_df = pd.read_csv(p)\n",
    "# res_df = res_df[res_df.task.isin(['qa1', 'qa2', 'qa3', 'qa4', 'qa5'])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "lens = [0, 1000, 2000, 4000, 8000, 16000, 32000, 64000, 128000, 500000, 1000000, 10000000]\n",
    "len_names = ['0K', '1K', '2K', '4K', '8K', '16K', '32K', '64K', '128K', '512K', '1M', '10M']\n",
    "\n",
    "for model_name in res_df.Model.unique():\n",
    "    model_df = res_df[res_df.Model == model_name]\n",
    "    model_name = re.sub('/', ' ', model_name)\n",
    "    for i, row in model_df.iterrows():\n",
    "        for l, ln in zip(lens, len_names):\n",
    "            score =  row[ln]\n",
    "            # print(score)\n",
    "            if not pd.isna(score):\n",
    "                score = re.sub(',', '.', score)\n",
    "                score = float(score) / 100\n",
    "                os.makedirs(os.path.join(res_path, model_name), exist_ok=True)\n",
    "                os.makedirs(os.path.join(res_path, model_name,  row.task), exist_ok=True)\n",
    "                path = os.path.join(res_path, model_name, row.task, f'{l}.csv')\n",
    "                df = pd.DataFrame([{'result': score}])\n",
    "                df.to_csv(path, index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Calculate average results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_names = next(os.walk(res_path))[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   1\n",
       "0  2"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.DataFrame([{1: 2}])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'../results/GPT-3.5 fine-tuned (trained on 100 samples)/qa2'"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "task_path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "GPT-4\n",
      "GPT-3.5 fine-tuned (trained on 100 samples)\n",
      "GPT-3.5 fine-tuned (trained on 1000 samples)\n",
      "GPT-3.5\n",
      "GPT4 + RAG by segments\n",
      "GPT4 + RAG by sentences\n",
      "GPT4 + Retrieve sentences (new 100 samples)\n",
      "Mistral medium (xxB)\n",
      "Mistral\n",
      "GPT-2 (137M)\n",
      "mamba-2.8b-hf\n",
      "rwkv-6-world-7b\n",
      "v5-Eagle-7B-HF\n",
      "Meta-Llama-3-8B-Instruct\n",
      "LLaMA-2-7B-32K\n",
      "longchat-7b-v1.5-32k\n",
      "LongAlpaca-13B\n",
      "Llama-2-7B-32K-Instruct\n",
      "Mistral-7b-Instruct-v0.2\n",
      "Mixtral-8x7B-Instruct-v0.1\n",
      "Mixtral-8x22B-Instruct-v0.1\n",
      "activation-beacon-llama2-7b-chat\n",
      "Yarn-Mistral-7b-128k\n",
      "chatglm3-6b-128k\n",
      "activation-beacon-mistral-7b\n",
      "Phi-3-mini-128k-instruct\n",
      "c4ai-command-r-v01\n",
      "Phi-3-medium-128k-instruct\n",
      "~ Mamba (130M) fine-tune\n",
      "Llama3-ChatQA-1.5-8B + RAG\n",
      "~ RMT (137M) fine-tune\n",
      "~ ARMT (137M) fine-tune\n",
      "01-ai Yi-34B\n",
      "01-ai Yi-34B-200k\n",
      "01-ai Yi-9B-200k\n",
      "ai21labs Jamba-v0.1\n",
      "~ RMT-Retrieval (137M) fine-tune\n",
      "GPT-4 (gpt-4-0125-preview)\n",
      "Meta-Llama-3.1-8B-Instruct\n",
      "Meta-Llama-3.1-70B-Instruct\n"
     ]
    }
   ],
   "source": [
    "for mn in model_names:\n",
    "    print(mn)\n",
    "    avg_path = os.path.join(res_path, mn, 'avg')\n",
    "    if os.path.exists(avg_path):\n",
    "        continue\n",
    "    \n",
    "    scores = {}\n",
    "    for task_name in [f'qa{i}' for i in range(1, 6)]:\n",
    "        task_path = os.path.join(res_path, mn, task_name)\n",
    "        if not os.path.exists(task_path):\n",
    "            continue\n",
    "\n",
    "        filenames = next(os.walk(task_path))[2]\n",
    "        for fn in filenames:\n",
    "            len_name = fn.split('.')[0]\n",
    "            df = pd.read_csv(os.path.join(task_path, fn))\n",
    "            \n",
    "            score = df.result.mean()\n",
    "            if len_name not in scores:\n",
    "                scores[len_name] = [score]\n",
    "            else:\n",
    "                scores[len_name].append(score)\n",
    "\n",
    "    for k,v in scores.items():\n",
    "        sc = np.mean(v)\n",
    "        out_path = os.path.join(avg_path, k + '.csv')\n",
    "        df = pd.DataFrame([{'result': sc}])\n",
    "        if len(v) < 5:\n",
    "            continue\n",
    "        os.makedirs(avg_path, exist_ok=True)\n",
    "        df.to_csv(out_path, index=False)\n",
    "        print(out_path)\n",
    "        # 1/0\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'16000': [0.58], '32000': [0.33], '4000': [0.73], '8000': [0.75]}"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scores"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}