{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import os\n", "\n", "import re" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "res_path = '../results'" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "p = \"/home/jovyan/rmt/babilong-leaderboard/data/BABILong NeurIPS24 Figs - leaderboard.csv\"\n", "res_df = pd.read_csv(p)\n", "# res_df = res_df[res_df.task.isin(['qa1', 'qa2', 'qa3', 'qa4', 'qa5'])]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "lens = [0, 1000, 2000, 4000, 8000, 16000, 32000, 64000, 128000, 500000, 1000000, 10000000]\n", "len_names = ['0K', '1K', '2K', '4K', '8K', '16K', '32K', '64K', '128K', '512K', '1M', '10M']\n", "\n", "for model_name in res_df.Model.unique():\n", " model_df = res_df[res_df.Model == model_name]\n", " model_name = re.sub('/', ' ', model_name)\n", " for i, row in model_df.iterrows():\n", " for l, ln in zip(lens, len_names):\n", " score = row[ln]\n", " # print(score)\n", " if not pd.isna(score):\n", " score = re.sub(',', '.', score)\n", " score = float(score) / 100\n", " os.makedirs(os.path.join(res_path, model_name), exist_ok=True)\n", " os.makedirs(os.path.join(res_path, model_name, row.task), exist_ok=True)\n", " path = os.path.join(res_path, model_name, row.task, f'{l}.csv')\n", " df = pd.DataFrame([{'result': score}])\n", " df.to_csv(path, index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Calculate average results" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "model_names = next(os.walk(res_path))[1]" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "import numpy as np" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
1
02
\n", "
" ], "text/plain": [ " 1\n", "0 2" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame([{1: 2}])" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'../results/GPT-3.5 fine-tuned (trained on 100 samples)/qa2'" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "task_path" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "GPT-4\n", "GPT-3.5 fine-tuned (trained on 100 samples)\n", "GPT-3.5 fine-tuned (trained on 1000 samples)\n", "GPT-3.5\n", "GPT4 + RAG by segments\n", "GPT4 + RAG by sentences\n", "GPT4 + Retrieve sentences (new 100 samples)\n", "Mistral medium (xxB)\n", "Mistral\n", "GPT-2 (137M)\n", "mamba-2.8b-hf\n", "rwkv-6-world-7b\n", "v5-Eagle-7B-HF\n", "Meta-Llama-3-8B-Instruct\n", "LLaMA-2-7B-32K\n", "longchat-7b-v1.5-32k\n", "LongAlpaca-13B\n", "Llama-2-7B-32K-Instruct\n", "Mistral-7b-Instruct-v0.2\n", "Mixtral-8x7B-Instruct-v0.1\n", "Mixtral-8x22B-Instruct-v0.1\n", "activation-beacon-llama2-7b-chat\n", "Yarn-Mistral-7b-128k\n", "chatglm3-6b-128k\n", "activation-beacon-mistral-7b\n", "Phi-3-mini-128k-instruct\n", "c4ai-command-r-v01\n", "Phi-3-medium-128k-instruct\n", "~ Mamba (130M) fine-tune\n", "Llama3-ChatQA-1.5-8B + RAG\n", "~ RMT (137M) fine-tune\n", "~ ARMT (137M) fine-tune\n", "01-ai Yi-34B\n", "01-ai Yi-34B-200k\n", "01-ai Yi-9B-200k\n", "ai21labs Jamba-v0.1\n", "~ RMT-Retrieval (137M) fine-tune\n", "GPT-4 (gpt-4-0125-preview)\n", "Meta-Llama-3.1-8B-Instruct\n", "Meta-Llama-3.1-70B-Instruct\n" ] } ], "source": [ "for mn in model_names:\n", " print(mn)\n", " avg_path = os.path.join(res_path, mn, 'avg')\n", " if os.path.exists(avg_path):\n", " continue\n", " \n", " scores = {}\n", " for task_name in [f'qa{i}' for i in range(1, 6)]:\n", " task_path = os.path.join(res_path, mn, task_name)\n", " if not os.path.exists(task_path):\n", " continue\n", "\n", " filenames = next(os.walk(task_path))[2]\n", " for fn in filenames:\n", " len_name = fn.split('.')[0]\n", " df = pd.read_csv(os.path.join(task_path, fn))\n", " \n", " score = df.result.mean()\n", " if len_name not in scores:\n", " scores[len_name] = [score]\n", " else:\n", " scores[len_name].append(score)\n", "\n", " for k,v in scores.items():\n", " sc = np.mean(v)\n", " out_path = os.path.join(avg_path, k + '.csv')\n", " df = pd.DataFrame([{'result': sc}])\n", " if len(v) < 5:\n", " continue\n", " os.makedirs(avg_path, exist_ok=True)\n", " df.to_csv(out_path, index=False)\n", " print(out_path)\n", " # 1/0\n", "\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'16000': [0.58], '32000': [0.33], '4000': [0.73], '8000': [0.75]}" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "scores" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 2 }