{ "cells": [ { "attachments": {}, "cell_type": "markdown", "id": "d16d19a7", "metadata": { "papermill": { "duration": 0.004251, "end_time": "2023-09-06T01:53:47.272181", "exception": false, "start_time": "2023-09-06T01:53:47.267930", "status": "completed" }, "tags": [] }, "source": [ "# RWKV v5\n", "\n", "Simple memory training for a small model\n", "\n", "**Note:** This project assumes you have the rwkv-infctx conda env setup" ] }, { "attachments": {}, "cell_type": "markdown", "id": "e7eaa6b5", "metadata": { "papermill": { "duration": 0.002635, "end_time": "2023-09-06T01:53:47.277736", "exception": false, "start_time": "2023-09-06T01:53:47.275101", "status": "completed" }, "tags": [] }, "source": [ "# Basic Setup" ] }, { "cell_type": "code", "execution_count": 1, "id": "43c42c12", "metadata": { "execution": { "iopub.execute_input": "2023-09-06T01:53:47.281975Z", "iopub.status.busy": "2023-09-06T01:53:47.281689Z", "iopub.status.idle": "2023-09-06T01:53:48.160757Z", "shell.execute_reply": "2023-09-06T01:53:48.159854Z" }, "papermill": { "duration": 0.883492, "end_time": "2023-09-06T01:53:48.162633", "exception": false, "start_time": "2023-09-06T01:53:47.279141", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CITATION.cff RWKV-v4wavenet\t RWKV-v5headsize32 checkpoint\tnotebook\r\n", "LICENSE RWKV-v5\t\t RWKV-v5r2\t datapath\toutput\r\n", "README.md RWKV-v5altwavenet RWKV-v5rstack\t docker\r\n", "RWKV-v4neo RWKV-v5headsize2x RWKV-v5wavenet model\r\n" ] } ], "source": [ "# First lets setup the various directories, and init the model\n", "!ls ../../../../../\n", "!mkdir -p ../../../../../model/\n", "!mkdir -p ../../../../../datapath/\n", "!mkdir -p ../../../../../checkpoint/" ] }, { "cell_type": "code", "execution_count": 2, "id": "4f145f3c", "metadata": { "execution": { "iopub.execute_input": "2023-09-06T01:53:48.169937Z", "iopub.status.busy": "2023-09-06T01:53:48.169610Z", "iopub.status.idle": "2023-09-06T01:53:50.281603Z", "shell.execute_reply": "2023-09-06T01:53:50.280695Z" }, "papermill": { "duration": 2.117825, "end_time": "2023-09-06T01:53:50.283651", "exception": false, "start_time": "2023-09-06T01:53:48.165826", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\r\n", "\u001b[0m" ] } ], "source": [ "# Additional dependencies for eval stuff\n", "!pip install -q aiocsv aiofiles" ] }, { "cell_type": "code", "execution_count": 3, "id": "e478c7e4", "metadata": { "execution": { "iopub.execute_input": "2023-09-06T01:53:50.291201Z", "iopub.status.busy": "2023-09-06T01:53:50.290951Z", "iopub.status.idle": "2023-09-06T01:53:50.300137Z", "shell.execute_reply": "2023-09-06T01:53:50.299422Z" }, "papermill": { "duration": 0.014857, "end_time": "2023-09-06T01:53:50.301805", "exception": false, "start_time": "2023-09-06T01:53:50.286948", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DEEPSPEED_STRAT: deepspeed_stage_1\n", "ENABLE_WANDB: True\n", "GPU_DEVICES: auto\n", "DIR_NAME: L12-D2048-E1e-1-ctx4k\n", "NOTEBOOK_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k\n", "INFERENCE_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5\n", "TRAINER_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5\n", "PROJECT_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer\n" ] } ], "source": [ "DEEPSPEED_STRAT=\"deepspeed_stage_1\"\n", "GPU_DEVICES=\"auto\"\n", "ENABLE_WANDB=True\n", "\n", "# Layer count and embed dim to start with\n", "LAYER_COUNT=12\n", "EMBED_DIM=2048\n", "\n", "EMBED_SCALE=0.1\n", "EMBED_SCALE_LABEL=str(EMBED_SCALE).replace(\".\", \"_\")\n", "\n", "WANDB_PREFIX=f\"v5r3-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE}\"\n", "FILENAME_PREFIX=f\"v5r3-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE_LABEL}\"\n", "\n", "print(\"DEEPSPEED_STRAT:\", DEEPSPEED_STRAT)\n", "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n", "print(\"GPU_DEVICES:\", GPU_DEVICES)\n", "\n", "if ENABLE_WANDB:\n", " WANDB_MODE=\"online\"\n", "else:\n", " WANDB_MODE=\"disabled\"\n", "\n", "# Computing the notebook, and various paths\n", "import os\n", "NOTEBOOK_DIR=os.path.dirname(os.path.abspath(\"__file__\"))\n", "CONFIG_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"../\"))\n", "PROJECT_DIR=os.path.abspath(os.path.join(CONFIG_DIR, \"../../../../\"))\n", "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", "INFERENCE_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", "\n", "# Get the notebook dir name\n", "DIR_NAME=os.path.basename(NOTEBOOK_DIR)\n", "\n", "# Log names and dir\n", "print(\"DIR_NAME:\", DIR_NAME)\n", "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n", "print(\"INFERENCE_DIR:\", INFERENCE_DIR)\n", "print(\"TRAINER_DIR:\", TRAINER_DIR)\n", "print(\"PROJECT_DIR:\", PROJECT_DIR)" ] }, { "cell_type": "code", "execution_count": 4, "id": "c0d1081e", "metadata": { "execution": { "iopub.execute_input": "2023-09-06T01:53:50.309388Z", "iopub.status.busy": "2023-09-06T01:53:50.309130Z", "iopub.status.idle": "2023-09-06T01:53:50.530630Z", "shell.execute_reply": "2023-09-06T01:53:50.529804Z" }, "papermill": { "duration": 0.227251, "end_time": "2023-09-06T01:53:50.532377", "exception": false, "start_time": "2023-09-06T01:53:50.305126", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/usr/bin/sh: 1: cd: can't cd to {TRAINER_DIR}\r\n" ] } ], "source": [ "# Init the model\n", "!cd \"{TRAINER_DIR}\" && \\\n", " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", " python3 ./init_model.py \\\n", " --n_layer \"{LAYER_COUNT}\" --n_embd \"{EMBED_DIM}\" \\\n", " --emb-scale \"{EMBED_SCALE}\" \\\n", " --vocab_size neox --skip-if-exists \\\n", " \"../model/{FILENAME_PREFIX}-neox-init.pth\"" ] }, { "cell_type": "markdown", "id": "ddc1086b", "metadata": { "papermill": { "duration": 0.003123, "end_time": "2023-09-06T01:53:50.538962", "exception": false, "start_time": "2023-09-06T01:53:50.535839", "status": "completed" }, "tags": [] }, "source": [ "## Enwiki Stage 1 : Foundation 4k model training" ] }, { "cell_type": "code", "execution_count": 5, "id": "636df5aa", "metadata": { "execution": { "iopub.execute_input": "2023-09-06T01:53:50.544775Z", "iopub.status.busy": "2023-09-06T01:53:50.544524Z", "iopub.status.idle": "2023-09-06T01:53:59.821234Z", "shell.execute_reply": "2023-09-06T01:53:59.820428Z" }, "papermill": { "duration": 9.281152, "end_time": "2023-09-06T01:53:59.823158", "exception": false, "start_time": "2023-09-06T01:53:50.542006", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\r", "Saving the dataset (0/5 shards): 0%| | 0/81505 [00:00