sovits-new / so_vits_svc_fork /preprocessing /preprocess_resample.py
Vexa's picture
Duplicate from pivich/sovits-new
d5d7329
raw
history blame contribute delete
No virus
4.45 kB
from __future__ import annotations
import warnings
from logging import getLogger
from pathlib import Path
from typing import Iterable
import librosa
import soundfile
from joblib import Parallel, delayed
from tqdm_joblib import tqdm_joblib
from .preprocess_utils import check_hubert_min_duration
LOG = getLogger(__name__)
# input_dir and output_dir exist.
# Write code to convert input dir audio files to output dir audio files,
# without changing folder structure. Use joblib to parallelize.
# Converting audio files includes:
# - resampling to the specified sampling rate
# - trimming silence
# - adjusting volume in a smart way
# - saving as a 16-bit wav file
def _get_unique_filename(path: Path, existing_paths: Iterable[Path]) -> Path:
    """Return ``path``, or a numbered variant of it that is not already taken.

    Args:
        path: The desired path.
        existing_paths: Paths that are already in use. Any iterable is
            accepted, including one-shot iterators and generators.

    Returns:
        ``path`` itself when it is not in ``existing_paths``; otherwise the
        first of ``<stem>_1<suffix>``, ``<stem>_2<suffix>``, ... (in the same
        parent directory) that is not in ``existing_paths``.
    """
    # Snapshot the iterable once. Probing a generator repeatedly with ``in``
    # would consume it and miss entries. Sets are used as-is to avoid a
    # needless copy.
    if isinstance(existing_paths, (set, frozenset)):
        taken = existing_paths
    else:
        taken = set(existing_paths)

    if path not in taken:
        return path

    i = 1
    while True:
        candidate = path.parent / f"{path.stem}_{i}{path.suffix}"
        if candidate not in taken:
            return candidate
        i += 1
def is_relative_to(path: Path, *other):
    """Return True if the path is relative to another path or False.

    Python 3.9+ has Path.is_relative_to() method, but we need to support Python 3.8.

    Args:
        path: The path to test.
        *other: One or more path segments. Several segments are joined into
            a single base path before the comparison.

    Returns:
        True when ``path.relative_to(<joined other>)`` succeeds, False when
        it raises ``ValueError``.

    Raises:
        TypeError: If no ``other`` segment is given.
    """
    if not other:
        raise TypeError("need at least one argument")
    # Join the segments ourselves: passing several positional arguments to
    # ``relative_to`` is deprecated since Python 3.12.
    base = type(path)(*other)
    try:
        path.relative_to(base)
    except ValueError:
        return False
    return True
def _preprocess_one(
    input_path: Path,
    output_path: Path,
    sr: int,
    *,
    top_db: int,
    frame_seconds: float,
    hop_seconds: float,
) -> None:
    """Preprocess one audio file.

    The file is loaded as mono at the requested sampling rate,
    peak-normalized, trimmed with ``librosa.effects.trim`` and written to
    ``output_path`` as a 16-bit PCM file. Nothing is written when the file
    cannot be loaded, is silent, or fails ``check_hubert_min_duration``
    before or after trimming.

    Args:
        input_path: Audio file to read.
        output_path: Destination file.
        sr: Target sampling rate passed to ``librosa.load``.
        top_db: ``top_db`` threshold passed to ``librosa.effects.trim``.
        frame_seconds: Trim analysis frame length, in seconds.
        hop_seconds: Trim hop length, in seconds.
    """
    try:
        audio, sr = librosa.load(input_path, sr=sr, mono=True)
    # Audioread is the last backend it will attempt, so this is the exception thrown on failure
    except Exception as e:
        # Failure due to attempting to load a file that is not audio, so return early
        LOG.warning(f"Failed to load {input_path} due to {e}")
        return

    if not check_hubert_min_duration(audio, sr):
        LOG.info(f"Skip {input_path} because it is too short.")
        return

    # Adjust volume (peak normalization).
    # NOTE(review): presumably the duration check above already rejected
    # empty arrays, for which ``max()`` would raise - confirm.
    peak = max(audio.max(), -audio.min())
    # ``not peak > 0`` also catches a NaN peak. Dividing by a zero peak
    # would turn every sample into NaN, so skip silent clips instead.
    if not peak > 0:
        LOG.warning(f"Skip {input_path} because it is silent.")
        return
    audio /= peak

    # Trim silence
    audio, _ = librosa.effects.trim(
        audio,
        top_db=top_db,
        frame_length=int(frame_seconds * sr),
        hop_length=int(hop_seconds * sr),
    )

    # Trimming may have shortened the clip below the minimum duration.
    if not check_hubert_min_duration(audio, sr):
        LOG.info(f"Skip {input_path} because it is too short.")
        return

    soundfile.write(output_path, audio, samplerate=sr, subtype="PCM_16")
def preprocess_resample(
    input_dir: Path | str,
    output_dir: Path | str,
    sampling_rate: int,
    n_jobs: int = -1,
    *,
    top_db: int = 30,
    frame_seconds: float = 0.1,
    hop_seconds: float = 0.05,
) -> None:
    """Preprocess audio files in input_dir and save them to output_dir.

    Every file found recursively under ``input_dir/<speaker>/`` is converted
    by ``_preprocess_one`` and written to ``output_dir/<speaker>/<name>.wav``.
    Sub-folders below the speaker folder are flattened, so clashing output
    names get a numeric suffix. Files directly under ``input_dir`` (with no
    speaker folder) are ignored.

    Args:
        input_dir: Root folder of the raw dataset.
        output_dir: Root folder for the converted files. Speaker folders are
            created as needed.
        sampling_rate: Target sampling rate.
        n_jobs: Number of joblib workers (``-1`` is passed through to
            ``joblib.Parallel``).
        top_db: Silence threshold forwarded to ``_preprocess_one``.
        frame_seconds: Trim frame length in seconds.
        hop_seconds: Trim hop length in seconds.

    Raises:
        ValueError: If no file is found under ``input_dir``.
    """
    input_dir = Path(input_dir)
    output_dir = Path(output_dir)

    # "*.*" also matches directories whose names contain a dot; keep files only.
    in_paths = [p for p in input_dir.rglob("*.*") if p.is_file()]
    if not in_paths:
        raise ValueError(f"No audio files found in {input_dir}")

    # Build (input, output) pairs together. Skipped inputs must not shift
    # the pairing, which zipping two separately built lists would do.
    in_and_out_paths = []
    used_out_paths = set()
    for in_path in in_paths:
        in_path_relative = in_path.relative_to(input_dir)
        if (
            not in_path.is_absolute()
            and is_relative_to(in_path, Path("dataset_raw") / "44k")
            # When input_dir is dataset_raw/44k itself, the relative path has
            # no "44k" prefix to strip and relative_to("44k") would raise.
            and is_relative_to(in_path_relative, "44k")
        ):
            new_in_path_relative = in_path_relative.relative_to("44k")
            warnings.warn(
                f"Recommended folder structure has changed since v1.0.0. "
                "Please move your dataset directly under dataset_raw folder. "
                f"Recoginzed {in_path_relative} as {new_in_path_relative}"
            )
            in_path_relative = new_in_path_relative

        # A speaker folder is required: <speaker>/<...>/<file>.
        if len(in_path_relative.parts) < 2:
            continue

        speaker_name = in_path_relative.parts[0]
        file_name = in_path_relative.with_suffix(".wav").name
        out_path = _get_unique_filename(
            output_dir / speaker_name / file_name, used_out_paths
        )
        out_path.parent.mkdir(parents=True, exist_ok=True)
        used_out_paths.add(out_path)
        in_and_out_paths.append((in_path, out_path))

    with tqdm_joblib(desc="Preprocessing", total=len(in_and_out_paths)):
        Parallel(n_jobs=n_jobs)(
            delayed(_preprocess_one)(
                in_path,
                out_path,
                sr=sampling_rate,
                top_db=top_db,
                frame_seconds=frame_seconds,
                hop_seconds=hop_seconds,
            )
            for in_path, out_path in in_and_out_paths
        )