{ "cells": [ { "attachments": {}, "cell_type": "markdown", "id": "b33301f1", "metadata": { "papermill": { "duration": 0.002639, "end_time": "2023-09-29T06:40:35.355878", "exception": false, "start_time": "2023-09-29T06:40:35.353239", "status": "completed" }, "tags": [] }, "source": [ "# RWKV v5 multi-size training experiment\n", "\n", "**Note:** This project assumes you have the rwkv-infctx conda env set up" ] }, { "attachments": {}, "cell_type": "markdown", "id": "7f0c7442", "metadata": { "papermill": { "duration": 0.001897, "end_time": "2023-09-29T06:40:35.361675", "exception": false, "start_time": "2023-09-29T06:40:35.359778", "status": "completed" }, "tags": [] }, "source": [ "# Basic Setup" ] }, { "cell_type": "code", "execution_count": 1, "id": "5697b559", "metadata": { "execution": { "iopub.execute_input": "2023-09-29T06:40:35.367571Z", "iopub.status.busy": "2023-09-29T06:40:35.367056Z", "iopub.status.idle": "2023-09-29T06:40:36.120936Z", "shell.execute_reply": "2023-09-29T06:40:36.120020Z" }, "papermill": { "duration": 0.75911, "end_time": "2023-09-29T06:40:36.122941", "exception": false, "start_time": "2023-09-29T06:40:35.363831", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# First, let's set up the various directories (the model itself is initialized in a later cell)\n", "!mkdir -p ../../../../model/\n", "!mkdir -p ../../../../datapath/\n", "!mkdir -p ../../../../checkpoint/" ] }, { "cell_type": "code", "execution_count": 2, "id": "a6de4e7e", "metadata": { "execution": { "iopub.execute_input": "2023-09-29T06:40:36.129456Z", "iopub.status.busy": "2023-09-29T06:40:36.128952Z", "iopub.status.idle": "2023-09-29T06:40:36.137224Z", "shell.execute_reply": "2023-09-29T06:40:36.136398Z" }, "papermill": { "duration": 0.013484, "end_time": "2023-09-29T06:40:36.138880", "exception": false, "start_time": "2023-09-29T06:40:36.125396", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DEEPSPEED_STRAT: deepspeed_stage_1\n", 
"ENABLE_WANDB: True\n", "GPU_DEVICES: auto\n", "NOTEBOOK_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train\n", "INFERENCE_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5\n", "TRAINER_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5\n", "PROJECT_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer\n" ] } ], "source": [ "DEEPSPEED_STRAT=\"deepspeed_stage_1\"\n", "GPU_DEVICES=\"auto\"\n", "ENABLE_WANDB=True\n", "\n", "EMBED_SCALE=0.01\n", "EMBED_SCALE_LABEL=str(EMBED_SCALE).replace(\".\", \"_\")\n", "\n", "LAYER_COUNT=12\n", "EMBED_SIZE=2048\n", "\n", "WANDB_PREFIX=f\"[Multi-size] v5-L{LAYER_COUNT}-D{EMBED_SIZE}-E{EMBED_SCALE}\"\n", "FILENAME_PREFIX=f\"v5-L{LAYER_COUNT}-D{EMBED_SIZE}-E{EMBED_SCALE_LABEL}\"\n", "\n", "print(\"DEEPSPEED_STRAT:\", DEEPSPEED_STRAT)\n", "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n", "print(\"GPU_DEVICES:\", GPU_DEVICES)\n", "\n", "if ENABLE_WANDB:\n", " WANDB_MODE=\"online\"\n", "else:\n", " WANDB_MODE=\"disabled\"\n", "\n", "# Compute the notebook directory and the various project paths (\"__file__\" is a literal string here, so NOTEBOOK_DIR resolves to the current working directory)\n", "import os\n", "NOTEBOOK_DIR=os.path.dirname(os.path.abspath(\"__file__\"))\n", "PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"../../../../\"))\n", "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", "INFERENCE_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", "\n", "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n", "print(\"INFERENCE_DIR:\", INFERENCE_DIR)\n", "print(\"TRAINER_DIR:\", TRAINER_DIR)\n", "print(\"PROJECT_DIR:\", PROJECT_DIR)" ] }, { "cell_type": "code", "execution_count": 3, "id": "22f457d9", "metadata": { "execution": { "iopub.execute_input": "2023-09-29T06:40:36.145747Z", "iopub.status.busy": "2023-09-29T06:40:36.145217Z", "iopub.status.idle": "2023-09-29T06:41:21.917242Z", "shell.execute_reply": "2023-09-29T06:41:21.916099Z" }, "papermill": { "duration": 45.777858, "end_time": 
"2023-09-29T06:41:21.919315", "exception": false, "start_time": "2023-09-29T06:40:36.141457", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2023-09-29 06:40:40,584] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n", "---- Initializing model ----\r\n", "No of layers: 12\r\n", "Embedding size: 2048\r\n", "Output model path: ../model/v5-L12-D2048-E0_01-neox-v5base-init.pth\r\n", "Vocab size: 50277\r\n", "Emb scale: 0.01\r\n", "Note: this process takes a significant time (and ram) for large models\r\n", "---- ----- ----\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "50277 2048 -0.01 emb.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.0.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.0.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.0.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.0.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.0.att.output.weight\r\n", "7168 2048 1.0 blocks.0.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.0.ffn.receptance.weight\r\n", "2048 7168 0 blocks.0.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.1.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.1.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.1.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 
blocks.1.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.1.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "7168 2048 1.0 blocks.1.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.1.ffn.receptance.weight\r\n", "2048 7168 0 blocks.1.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.2.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.2.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.2.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.2.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.2.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "7168 2048 1.0 blocks.2.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.2.ffn.receptance.weight\r\n", "2048 7168 0 blocks.2.ffn.value.weight\r\n", "2048 2048 1.0 blocks.3.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.3.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.3.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.3.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.3.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "7168 2048 1.0 blocks.3.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.3.ffn.receptance.weight\r\n", "2048 7168 0 blocks.3.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 
blocks.4.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.4.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.4.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.4.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.4.att.output.weight\r\n", "7168 2048 1.0 blocks.4.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.4.ffn.receptance.weight\r\n", "2048 7168 0 blocks.4.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.5.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.5.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.5.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.5.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.5.att.output.weight\r\n", "7168 2048 1.0 blocks.5.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.5.ffn.receptance.weight\r\n", "2048 7168 0 blocks.5.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.6.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.6.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.6.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.6.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.6.att.output.weight\r\n", "7168 2048 1.0 blocks.6.ffn.key.weight\r\n" ] }, { "name": "stdout", 
"output_type": "stream", "text": [ "2048 2048 0 blocks.6.ffn.receptance.weight\r\n", "2048 7168 0 blocks.6.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.7.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.7.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.7.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.7.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.7.att.output.weight\r\n", "7168 2048 1.0 blocks.7.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.7.ffn.receptance.weight\r\n", "2048 7168 0 blocks.7.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.8.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.8.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.8.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.8.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.8.att.output.weight\r\n", "7168 2048 1.0 blocks.8.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.8.ffn.receptance.weight\r\n", "2048 7168 0 blocks.8.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.9.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.9.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.9.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 
blocks.9.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.9.att.output.weight\r\n", "7168 2048 1.0 blocks.9.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.9.ffn.receptance.weight\r\n", "2048 7168 0 blocks.9.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.10.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.10.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.10.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.10.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.10.att.output.weight\r\n", "7168 2048 1.0 blocks.10.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.10.ffn.receptance.weight\r\n", "2048 7168 0 blocks.10.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.11.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.11.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.11.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.11.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.11.att.output.weight\r\n", "7168 2048 1.0 blocks.11.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.11.ffn.receptance.weight\r\n", "2048 7168 0 blocks.11.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "50277 2048 0.5 head.weight\r\n" ] } ], "source": [ "# Init the model\n", "!cd \"{TRAINER_DIR}\" && \\\n", " python3 
./init_model.py \\\n", " --n_layer {LAYER_COUNT} --n_embd {EMBED_SIZE} \\\n", " --emb-scale \"{EMBED_SCALE}\" \\\n", " --vocab_size neox --skip-if-exists \\\n", " \"../model/{FILENAME_PREFIX}-neox-v5base-init.pth\"" ] }, { "cell_type": "markdown", "id": "fde86502", "metadata": { "papermill": { "duration": 0.006163, "end_time": "2023-09-29T06:41:21.931885", "exception": false, "start_time": "2023-09-29T06:41:21.925722", "status": "completed" }, "tags": [] }, "source": [ "## Enwiki Stage 1 : Foundation 4k model training" ] }, { "cell_type": "code", "execution_count": 4, "id": "a862f05f", "metadata": { "execution": { "iopub.execute_input": "2023-09-29T06:41:21.946957Z", "iopub.status.busy": "2023-09-29T06:41:21.946097Z", "iopub.status.idle": "2023-09-29T06:41:33.126385Z", "shell.execute_reply": "2023-09-29T06:41:33.125478Z" }, "papermill": { "duration": 11.190379, "end_time": "2023-09-29T06:41:33.128643", "exception": false, "start_time": "2023-09-29T06:41:21.938264", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\r", "Saving the dataset (0/3 shards): 0%| | 0/54401 [00:00\r\n", " cli_main()\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 253, in cli_main\r\n", " LightningCLI(\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 353, in __init__\r\n", " self._run_subcommand(self.subcommand)\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 642, in _run_subcommand\r\n", " fn(**fn_kwargs)\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py\", line 529, in fit\r\n", " call._call_and_handle_interrupt(\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/call.py\", line 41, in _call_and_handle_interrupt\r\n", " return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)\r\n", " File 
\"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/strategies/launchers/subprocess_script.py\", line 91, in launch\r\n", " return function(*args, **kwargs)\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py\", line 568, in _fit_impl\r\n", " self._run(model, ckpt_path=ckpt_path)\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py\", line 925, in _run\r\n", " self._data_connector.prepare_data()\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/connectors/data_connector.py\", line 94, in prepare_data\r\n", " call._call_lightning_datamodule_hook(trainer, \"prepare_data\")\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/call.py\", line 164, in _call_lightning_datamodule_hook\r\n", " return fn(*args, **kwargs)\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/data.py\", line 549, in prepare_data\r\n", " prepare_data_static(**self._init_locals)\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/data.py\", line 464, in prepare_data_static\r\n", " src_dataset[\"train\"] = src_dataset[\"train\"].select(range(offset_val, offset_val + length_val))\r\n", "TypeError: 'float' object cannot be interpreted as an integer\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Waiting for W&B process to finish... 
\u001b[31m(failed 1).\u001b[0m Press Control-C to abort syncing.\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run \u001b[33m[Multi-size] v5-L12-D2048-E0.01 - Enwiki-4k Part 1 (train-ctx=4k, deepspeed_stage_1)\u001b[0m at: \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/runs/udijamu6\u001b[0m\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: ️⚡ View job at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjk0OTk4MDcy/version_details/v3\u001b[0m\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Synced 5 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Find logs at: \u001b[35m\u001b[1m./wandb/run-20230929_064140-udijamu6/logs\u001b[0m\r\n" ] } ], "source": [ "# Start the foundation model training\n", "!cd \"{TRAINER_DIR}\" && \\\n", " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", " python3 lightning_trainer.py fit \\\n", " -c \"{NOTEBOOK_DIR}/enwiki-4k-part1.yaml\" \\\n", " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Enwiki-4k Part 1 (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n", " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", " --trainer.devices=\"{GPU_DEVICES}\" \\\n", " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-enwiki-4k-p1/\" \\\n", " --model.load_model=\"../model/{FILENAME_PREFIX}-neox-v5base-init.pth\" \\\n", " --model.ctx_len=4096 \\\n", " --model.bptt_learning_range=1" ] }, { "cell_type": "code", "execution_count": 6, "id": "9dcc8aa0", "metadata": { "execution": { "iopub.execute_input": "2023-09-29T06:46:53.049378Z", "iopub.status.busy": "2023-09-29T06:46:53.048820Z", "iopub.status.idle": "2023-09-29T06:46:56.789000Z", "shell.execute_reply": "2023-09-29T06:46:56.787968Z" }, "papermill": { "duration": 3.854596, "end_time": "2023-09-29T06:46:56.791829", "exception": false, "start_time": "2023-09-29T06:46:52.937233", "status": "completed" }, "tags": [] }, 
"outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2023-09-29 06:46:55,390] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Traceback (most recent call last):\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 651, in <module>\r\n", " convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, output_file, save_dtype=args.dtype)\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 542, in convert_zero_checkpoint_to_fp32_state_dict\r\n", " state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 516, in get_fp32_state_dict_from_zero_checkpoint\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " raise ValueError(f\"Unable to find 'latest' file at {latest_path}\")\r\n", "ValueError: Unable to find 'latest' file at ../checkpoint/v5-L12-D2048-E0_01-enwiki-4k-p1/last.ckpt/latest\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "ls: cannot access '../model/v5-L12-D2048-E0_01-enwiki-4k-p1.pth': No such file or directory\r\n" ] } ], "source": [ "# Let's export the model from the checkpoint\n", "!cd \"{TRAINER_DIR}\" && \\\n", " python3 export_checkpoint.py \"../checkpoint/{FILENAME_PREFIX}-enwiki-4k-p1/last.ckpt\" \"../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth\" \"bf16\"\n", "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth\"" ] }, { "cell_type": "code", "execution_count": 7, "id": "fbac8551", "metadata": { "execution": { "iopub.execute_input": "2023-09-29T06:46:57.029599Z", "iopub.status.busy": "2023-09-29T06:46:57.029240Z", "iopub.status.idle": "2023-09-29T06:47:03.411064Z", "shell.execute_reply": "2023-09-29T06:47:03.409916Z" }, 
"papermill": { "duration": 6.495854, "end_time": "2023-09-29T06:47:03.413417", "exception": false, "start_time": "2023-09-29T06:46:56.917563", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2023-09-29 06:47:01,373] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Traceback (most recent call last):\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/dragon_test.py\", line 52, in <module>\r\n", " model = SimpleRWKV(MODEL_PATH, device=DEVICE)\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 1420, in __init__\r\n", " self.model = RWKV(**model_config)\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 566, in __init__\r\n", " raise ValueError(f\"load_model file '{load_model}' does not exist\")\r\n", "ValueError: load_model file '../model/v5-L12-D2048-E0_01-enwiki-4k-p1.pth' does not exist\r\n" ] } ], "source": [ "# Let's do a quick dragon prompt validation\n", "!cd \"{INFERENCE_DIR}\" && \\\n", " python3 dragon_test.py \"../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth\" \"cuda fp32\"" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" }, "papermill": { "default_parameters": {}, "duration": 389.827856, "end_time": "2023-09-29T06:47:03.946497", "environment_variables": {}, "exception": null, "input_path": 
"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-part1.ipynb", "output_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-part1.ipynb", "parameters": {}, "start_time": "2023-09-29T06:40:34.118641", "version": "2.4.0" } }, "nbformat": 4, "nbformat_minor": 5 }