{ "cells": [ { "attachments": {}, "cell_type": "markdown", "id": "215f6d68", "metadata": { "papermill": { "duration": 0.00298, "end_time": "2023-09-29T06:34:37.429814", "exception": false, "start_time": "2023-09-29T06:34:37.426834", "status": "completed" }, "tags": [] }, "source": [ "# RWKV v5 multi-size training experiment\n", "\n", "**Note:** This project assumes you have the rwkv-infctx conda env set up" ] }, { "attachments": {}, "cell_type": "markdown", "id": "bf0cf97c", "metadata": { "papermill": { "duration": 0.002312, "end_time": "2023-09-29T06:34:37.435725", "exception": false, "start_time": "2023-09-29T06:34:37.433413", "status": "completed" }, "tags": [] }, "source": [ "# Basic Setup" ] }, { "cell_type": "code", "execution_count": 1, "id": "98d95606", "metadata": { "execution": { "iopub.execute_input": "2023-09-29T06:34:37.440037Z", "iopub.status.busy": "2023-09-29T06:34:37.439753Z", "iopub.status.idle": "2023-09-29T06:34:38.107502Z", "shell.execute_reply": "2023-09-29T06:34:38.106666Z" }, "papermill": { "duration": 0.672001, "end_time": "2023-09-29T06:34:38.109558", "exception": false, "start_time": "2023-09-29T06:34:37.437557", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# First, let's set up the working directories (the model itself is initialized in a later cell)\n", "!mkdir -p ../../../../model/\n", "!mkdir -p ../../../../datapath/\n", "!mkdir -p ../../../../checkpoint/" ] }, { "cell_type": "code", "execution_count": 2, "id": "d799a503", "metadata": { "execution": { "iopub.execute_input": "2023-09-29T06:34:38.116182Z", "iopub.status.busy": "2023-09-29T06:34:38.115939Z", "iopub.status.idle": "2023-09-29T06:34:38.124075Z", "shell.execute_reply": "2023-09-29T06:34:38.123428Z" }, "papermill": { "duration": 0.012774, "end_time": "2023-09-29T06:34:38.125078", "exception": false, "start_time": "2023-09-29T06:34:38.112304", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DEEPSPEED_STRAT: deepspeed_stage_1\n", 
"ENABLE_WANDB: True\n", "GPU_DEVICES: auto\n", "NOTEBOOK_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train\n", "INFERENCE_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5\n", "TRAINER_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5\n", "PROJECT_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer\n" ] } ], "source": [ "import os\n", "\n", "# Training strategy, device selection and logging switch\n", "DEEPSPEED_STRAT=\"deepspeed_stage_1\"\n", "GPU_DEVICES=\"auto\"\n", "ENABLE_WANDB=True\n", "\n", "# Embedding init scale (passed to init_model.py as --emb-scale),\n", "# plus a filename-safe label for it (dots replaced by underscores)\n", "EMBED_SCALE=0.01\n", "EMBED_SCALE_LABEL=str(EMBED_SCALE).replace(\".\", \"_\")\n", "\n", "# Model size (passed to init_model.py as --n_layer / --n_embd)\n", "LAYER_COUNT=6\n", "EMBED_SIZE=2048\n", "\n", "WANDB_PREFIX=f\"[Multi-size] v5-L{LAYER_COUNT}-D{EMBED_SIZE}-E{EMBED_SCALE}\"\n", "FILENAME_PREFIX=f\"v5-L{LAYER_COUNT}-D{EMBED_SIZE}-E{EMBED_SCALE_LABEL}\"\n", "\n", "print(\"DEEPSPEED_STRAT:\", DEEPSPEED_STRAT)\n", "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n", "print(\"GPU_DEVICES:\", GPU_DEVICES)\n", "\n", "# Exported as the WANDB_MODE environment variable by the training cell\n", "WANDB_MODE=\"online\" if ENABLE_WANDB else \"disabled\"\n", "\n", "# Compute the notebook directory and the project paths derived from it.\n", "# `__file__` is not defined inside a notebook, so the kernel's working directory is used;\n", "# this assumes the notebook is executed from its own directory.\n", "NOTEBOOK_DIR=os.getcwd()\n", "PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"../../../../\"))\n", "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", "INFERENCE_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", "\n", "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n", "print(\"INFERENCE_DIR:\", INFERENCE_DIR)\n", "print(\"TRAINER_DIR:\", TRAINER_DIR)\n", "print(\"PROJECT_DIR:\", PROJECT_DIR)" ] }, { "cell_type": "code", "execution_count": 3, "id": "e4204bbd", "metadata": { "execution": { "iopub.execute_input": "2023-09-29T06:34:38.128341Z", "iopub.status.busy": "2023-09-29T06:34:38.128199Z", "iopub.status.idle": "2023-09-29T06:34:57.534218Z", "shell.execute_reply": "2023-09-29T06:34:57.533501Z" }, "papermill": { "duration": 19.409767, "end_time": 
"2023-09-29T06:34:57.536209", "exception": false, "start_time": "2023-09-29T06:34:38.126442", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2023-09-29 06:34:40,927] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n", "---- Initializing model ----\r\n", "No of layers: 6\r\n", "Embedding size: 2048\r\n", "Output model path: ../model/v5-L6-D2048-E0_01-neox-v5base-init.pth\r\n", "Vocab size: 50277\r\n", "Emb scale: 0.01\r\n", "Note: this process takes a significant time (and ram) for large models\r\n", "---- ----- ----\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "50277 2048 -0.01 emb.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.0.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.0.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.0.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.0.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.0.att.output.weight\r\n", "7168 2048 1.0 blocks.0.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.0.ffn.receptance.weight\r\n", "2048 7168 0 blocks.0.ffn.value.weight\r\n", "2048 2048 1.0 blocks.1.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.1.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.1.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.1.att.value.weight\r\n" ] }, { "name": "stdout", 
"output_type": "stream", "text": [ "2048 2048 0 blocks.1.att.output.weight\r\n", "7168 2048 1.0 blocks.1.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.1.ffn.receptance.weight\r\n", "2048 7168 0 blocks.1.ffn.value.weight\r\n", "2048 2048 1.0 blocks.2.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.2.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.2.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.2.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.2.att.output.weight\r\n", "7168 2048 1.0 blocks.2.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.2.ffn.receptance.weight\r\n", "2048 7168 0 blocks.2.ffn.value.weight\r\n", "2048 2048 1.0 blocks.3.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.3.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.3.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.3.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.3.att.output.weight\r\n", "7168 2048 1.0 blocks.3.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.3.ffn.receptance.weight\r\n", "2048 7168 0 blocks.3.ffn.value.weight\r\n", "2048 2048 1.0 blocks.4.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.4.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.4.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.4.att.value.weight\r\n" ] }, { 
"name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.4.att.output.weight\r\n", "7168 2048 1.0 blocks.4.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.4.ffn.receptance.weight\r\n", "2048 7168 0 blocks.4.ffn.value.weight\r\n", "2048 2048 1.0 blocks.5.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.5.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.5.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.5.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.5.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "7168 2048 1.0 blocks.5.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.5.ffn.receptance.weight\r\n", "2048 7168 0 blocks.5.ffn.value.weight\r\n", "50277 2048 0.5 head.weight\r\n" ] } ], "source": [ "# Init the model\n", "!cd \"{TRAINER_DIR}\" && \\\n", " python3 ./init_model.py \\\n", " --n_layer {LAYER_COUNT} --n_embd {EMBED_SIZE} \\\n", " --emb-scale \"{EMBED_SCALE}\" \\\n", " --vocab_size neox --skip-if-exists \\\n", " \"../model/{FILENAME_PREFIX}-neox-v5base-init.pth\"" ] }, { "cell_type": "markdown", "id": "25aa156c", "metadata": { "papermill": { "duration": 0.004287, "end_time": "2023-09-29T06:34:57.545982", "exception": false, "start_time": "2023-09-29T06:34:57.541695", "status": "completed" }, "tags": [] }, "source": [ "## Enwiki Stage 1 : Foundation 4k model training" ] }, { "cell_type": "code", "execution_count": 4, "id": "dfb884c6", "metadata": { "execution": { "iopub.execute_input": "2023-09-29T06:34:57.552286Z", "iopub.status.busy": "2023-09-29T06:34:57.552023Z", "iopub.status.idle": "2023-09-29T06:35:06.302944Z", "shell.execute_reply": "2023-09-29T06:35:06.302095Z" }, "papermill": { 
"duration": 8.756445, "end_time": "2023-09-29T06:35:06.305010", "exception": false, "start_time": "2023-09-29T06:34:57.548565", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\r", "Saving the dataset (0/3 shards): 0%| | 0/54405 [00:00\r\n", " cli_main()\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 253, in cli_main\r\n", " LightningCLI(\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 353, in __init__\r\n", " self._run_subcommand(self.subcommand)\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 642, in _run_subcommand\r\n", " fn(**fn_kwargs)\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py\", line 529, in fit\r\n", " call._call_and_handle_interrupt(\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/call.py\", line 41, in _call_and_handle_interrupt\r\n", " return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/strategies/launchers/subprocess_script.py\", line 91, in launch\r\n", " return function(*args, **kwargs)\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py\", line 568, in _fit_impl\r\n", " self._run(model, ckpt_path=ckpt_path)\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py\", line 925, in _run\r\n", " self._data_connector.prepare_data()\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/connectors/data_connector.py\", line 94, in prepare_data\r\n", " call._call_lightning_datamodule_hook(trainer, \"prepare_data\")\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/call.py\", line 164, in _call_lightning_datamodule_hook\r\n", " return 
fn(*args, **kwargs)\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/data.py\", line 549, in prepare_data\r\n", " prepare_data_static(**self._init_locals)\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/data.py\", line 464, in prepare_data_static\r\n", " src_dataset[\"train\"] = src_dataset[\"train\"].select(range(offset_val, offset_val + length_val))\r\n", "TypeError: 'float' object cannot be interpreted as an integer\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Waiting for W&B process to finish... \u001b[31m(failed 1).\u001b[0m Press Control-C to abort syncing.\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run \u001b[33m[Multi-size] v5-L6-D2048-E0.01 - Enwiki-4k Part 1 (train-ctx=4k, deepspeed_stage_1)\u001b[0m at: \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/runs/nua9z0t5\u001b[0m\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: ️⚡ View job at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjk0OTk4MDcy/version_details/v2\u001b[0m\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Synced 5 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Find logs at: \u001b[35m\u001b[1m./wandb/run-20230929_063511-nua9z0t5/logs\u001b[0m\r\n" ] } ], "source": [ "# Start the foundation model training\n", "!cd \"{TRAINER_DIR}\" && \\\n", " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", " python3 lightning_trainer.py fit \\\n", " -c \"{NOTEBOOK_DIR}/enwiki-4k-part1.yaml\" \\\n", " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Enwiki-4k Part 1 (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n", " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", " --trainer.devices=\"{GPU_DEVICES}\" \\\n", " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-enwiki-4k-p1/\" \\\n", " 
--model.load_model=\"../model/{FILENAME_PREFIX}-neox-v5base-init.pth\" \\\n", " --model.ctx_len=4096 \\\n", " --model.bptt_learning_range=1" ] }, { "cell_type": "code", "execution_count": 6, "id": "e03ffed6", "metadata": { "execution": { "iopub.execute_input": "2023-09-29T06:40:32.715392Z", "iopub.status.busy": "2023-09-29T06:40:32.715097Z", "iopub.status.idle": "2023-09-29T06:40:35.440958Z", "shell.execute_reply": "2023-09-29T06:40:35.440200Z" }, "papermill": { "duration": 2.797683, "end_time": "2023-09-29T06:40:35.442702", "exception": false, "start_time": "2023-09-29T06:40:32.645019", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2023-09-29 06:40:34,511] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Traceback (most recent call last):\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 651, in <module>\r\n", " convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, output_file, save_dtype=args.dtype)\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 542, in convert_zero_checkpoint_to_fp32_state_dict\r\n", " state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 516, in get_fp32_state_dict_from_zero_checkpoint\r\n", " raise ValueError(f\"Unable to find 'latest' file at {latest_path}\")\r\n", "ValueError: Unable to find 'latest' file at ../checkpoint/v5-L6-D2048-E0_01-enwiki-4k-p1/last.ckpt/latest\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "ls: cannot access '../model/v5-L6-D2048-E0_01-enwiki-4k-p1.pth': No such file or directory\r\n" ] } ], "source": [ "# Let's export the model from the checkpoint\n", "!cd 
\"{TRAINER_DIR}\" && \\\n", " python3 export_checkpoint.py \"../checkpoint/{FILENAME_PREFIX}-enwiki-4k-p1/last.ckpt\" \"../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth\" \"bf16\"\n", "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth\"" ] }, { "cell_type": "code", "execution_count": 7, "id": "b2d5fe57", "metadata": { "execution": { "iopub.execute_input": "2023-09-29T06:40:35.588027Z", "iopub.status.busy": "2023-09-29T06:40:35.587772Z", "iopub.status.idle": "2023-09-29T06:40:39.405717Z", "shell.execute_reply": "2023-09-29T06:40:39.404915Z" }, "papermill": { "duration": 3.890278, "end_time": "2023-09-29T06:40:39.407623", "exception": false, "start_time": "2023-09-29T06:40:35.517345", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2023-09-29 06:40:38,394] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n", "Traceback (most recent call last):\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/dragon_test.py\", line 52, in <module>\r\n", " model = SimpleRWKV(MODEL_PATH, device=DEVICE)\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 1420, in __init__\r\n", " self.model = RWKV(**model_config)\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 566, in __init__\r\n", " raise ValueError(f\"load_model file '{load_model}' does not exist\")\r\n", "ValueError: load_model file '../model/v5-L6-D2048-E0_01-enwiki-4k-p1.pth' does not exist\r\n" ] } ], "source": [ "# Let's do a quick dragon prompt validation\n", "!cd \"{INFERENCE_DIR}\" && \\\n", " python3 dragon_test.py \"../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth\" \"cuda fp32\"" ] } ], "metadata": { "kernelspec": { 
"display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" }, "papermill": { "default_parameters": {}, "duration": 363.056705, "end_time": "2023-09-29T06:40:39.597218", "environment_variables": {}, "exception": null, "input_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-part1.ipynb", "output_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-part1.ipynb", "parameters": {}, "start_time": "2023-09-29T06:34:36.540513", "version": "2.4.0" } }, "nbformat": 4, "nbformat_minor": 5 }