{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Change audio by detecting onset \n", "This notebook contains a method that could change the target video sound with a given audio." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load packages" ] }, { "cell_type": "code", "execution_count": 118, "metadata": {}, "outputs": [], "source": [ "import IPython\n", "import os\n", "import numpy as np\n", "from moviepy.editor import *\n", "import librosa\n", "from IPython.display import Audio\n", "from IPython.display import Video" ] }, { "cell_type": "code", "execution_count": 119, "metadata": {}, "outputs": [], "source": [ "# Read videos\n", "origin_video_path = 'data/target.mp4'\n", "conditional_video_path = 'data/conditional.mp4'\n", "# conditional_video_path = 'data/dog_bark.mp4'\n", "\n", "ori_videoclip = VideoFileClip(origin_video_path)\n", "con_videoclip = VideoFileClip(conditional_video_path)\n" ] }, { "cell_type": "code", "execution_count": 120, "metadata": {}, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "execution_count": 120, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Video(origin_video_path, width=640)" ] }, { "cell_type": "code", "execution_count": 121, "metadata": {}, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "execution_count": 121, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Video(conditional_video_path, width=640)" ] }, { "cell_type": "code", "execution_count": 122, "metadata": {}, "outputs": [], "source": [ "# get the audio track from video\n", "ori_audioclip = ori_videoclip.audio\n", "ori_audio, ori_sr = ori_audioclip.to_soundarray(), ori_audioclip.fps\n", "con_audioclip = con_videoclip.audio\n", "con_audio, con_sr = con_audioclip.to_soundarray(), con_audioclip.fps\n", "\n", "ori_audio = ori_audio.mean(-1)\n", "con_audio = con_audio.mean(-1)\n", "\n", "target_sr = 22050\n", "ori_audio = librosa.resample(ori_audio, orig_sr=ori_sr, target_sr=target_sr)\n", "con_audio = librosa.resample(con_audio, orig_sr=con_sr, target_sr=target_sr)\n", "\n", "ori_sr, con_sr = target_sr, target_sr" ] }, { "cell_type": "code", "execution_count": 123, "metadata": {}, "outputs": [], "source": [ "def detect_onset_of_audio(audio, sample_rate):\n", " onsets = librosa.onset.onset_detect(\n", " y=audio, sr=sample_rate, units='samples', delta=0.3)\n", " return onsets\n" ] }, { "cell_type": "code", "execution_count": 124, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "from matplotlib import pyplot as plt\n", "onsets = detect_onset_of_audio(ori_audio, ori_sr)\n", "plt.figure(dpi=100)\n", "\n", "time = np.arange(ori_audio.shape[0])\n", "plt.plot(time, ori_audio)\n", "plt.vlines(onsets, 0, ymax=0.5, colors='r')\n", "plt.show()\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Method\n", "The baseline is quite simple, and it has several steps:\n", "- Take the original waveform (encoded and decoded by our codebook) and detect the onsets to determine the timestamp of sound events\n", "- (Optional) Assume we don't have original waveform, we can use Andrew's great hit model to predict sound from frames and detect onsets from it.\n", "- Detect onsets of conditional waveform (encoded and decoded by our codebook) and clip single onset event from them as sound candicates\n", "- For each onset of original waveform, replace with conditional onset event randomly and then generate sound" ] }, { "cell_type": "code", "execution_count": 125, "metadata": {}, "outputs": [], "source": [ "def get_onset_audio_range(audio, onsets, i):\n", " if i == 0:\n", " prev_offset = int(onsets[i] // 3)\n", " else:\n", " prev_offset = int((onsets[i] - onsets[i - 1]) // 3)\n", "\n", " if i == onsets.shape[0] - 1:\n", " post_offset = int((audio.shape[0] - onsets[i]) // 4 * 2)\n", " else:\n", " post_offset = int((onsets[i + 1] - onsets[i]) // 4 * 2)\n", " return prev_offset, post_offset\n" ] }, { "cell_type": "code", "execution_count": 126, "metadata": {}, "outputs": [], "source": [ "ori_onsets = detect_onset_of_audio(ori_audio, ori_sr)\n", "con_onsets = detect_onset_of_audio(con_audio, con_sr)\n", "\n", "np.random.seed(2022)\n", "gen_audio = np.zeros_like(ori_audio)\n", "for i in range(ori_onsets.shape[0]):\n", " prev_offset, post_offset = get_onset_audio_range(ori_audio, ori_onsets, i)\n", " j = np.random.choice(con_onsets.shape[0])\n", " prev_offset_con, post_offset_con = get_onset_audio_range(con_audio, con_onsets, j)\n", " prev_offset = min(prev_offset, prev_offset_con)\n", " post_offset = min(post_offset, post_offset_con)\n", " gen_audio[ori_onsets[i] - prev_offset: ori_onsets[i] + post_offset] = con_audio[con_onsets[j] - prev_offset: con_onsets[j] + post_offset]\n", "\n" ] }, { "cell_type": "code", "execution_count": 127, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "from matplotlib import pyplot as plt\n", "plt.figure(dpi=100)\n", "time = np.arange(gen_audio.shape[0])\n", "plt.plot(time, gen_audio)\n", "plt.show()\n" ] }, { "cell_type": "code", "execution_count": 128, "metadata": {}, "outputs": [], "source": [ "# save audio\n", "import soundfile as sf\n", "sf.write('data/gen_audio.wav', gen_audio, ori_sr)\n" ] }, { "cell_type": "code", "execution_count": 129, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "t: 0%| | 0/49 [00:00\n", " Your browser does not support the video element.\n", " " ], "text/plain": [ "" ] }, "execution_count": 130, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Video('data/generate.mp4', width=640)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "interpreter": { "hash": "ce61937b7f7dfb4402f1892711bcd3e4a6b6f6d238d7280e2db39bcb9fe9525c" }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.13" } }, "nbformat": 4, "nbformat_minor": 2 }