From ff2e7392292a09436f31d1e12a5e7cb1aa6519dc Mon Sep 17 00:00:00 2001 From: zhzhongshi <39423408+zhzhongshi@users.noreply.github.com> Date: Fri, 15 Mar 2024 09:48:29 +0800 Subject: [PATCH] Add support for MDXC models (#50) * Add support for MDXC models * Updated poetry lockfile to match dependencies * fix err: CLI does not work * Fixed MDXC config YAML download, formatted mdxc separator class, bumped version ready for release * Added progress bar for file downloads * Added error handling for failed model load due to incomplete/corrupt download * Fixed outstanding issues with YAML config loading and file download, added todo list for integration tests to write * Moved load model into own method for consistency with mdxc class * Refactored MDXC class to use more descriptive variable names, removed dead code, added debug logging and clearer parameters etc. * Fixed and tested pitch shift logic for MDXC, added CLI params for other MDXC config parameters and tested these * Added MDXC to readme * Added thanks! --------- Co-authored-by: Andrew Beveridge --- README.md | 28 +- .../separator/architectures/__init__.py | 1 + .../separator/architectures/mdx_separator.py | 35 ++- .../separator/architectures/mdxc_separator.py | 257 ++++++++++++++++++ audio_separator/separator/separator.py | 112 ++++---- .../separator/uvr_lib_v5/__init__.py | 0 audio_separator/utils/cli.py | 157 ++++++----- poetry.lock | 68 ++++- pyproject.toml | 4 +- tests/TODO.txt | 9 + 10 files changed, 528 insertions(+), 143 deletions(-) create mode 100644 audio_separator/separator/architectures/mdxc_separator.py create mode 100644 audio_separator/separator/uvr_lib_v5/__init__.py create mode 100644 tests/TODO.txt diff --git a/README.md b/README.md index c999caa..acd8c0d 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![Docker pulls](https://img.shields.io/docker/pulls/beveradb/audio-separator.svg)](https://hub.docker.com/r/beveradb/audio-separator/tags) [![codecov](https://codecov.io/gh/karaokenerds/python-audio-separator/graph/badge.svg?token=N7YK4ET5JP)](https://codecov.io/gh/karaokenerds/python-audio-separator) -Summary: Easy to use audio stem separation from the command line or as a dependency in your own Python project, using the amazing MDX-Net and VR Arch models available in UVR by @Anjok07 & @aufr33. +Summary: Easy to use audio stem separation from the command line or as a dependency in your own Python project, using the amazing MDX-Net, VR Arch, Demucs and MDXC models available in UVR by @Anjok07 & @aufr33. Audio Separator is a Python package that allows you to separate an audio file into various stems, using models trained by @Anjok07 for use with UVR (https://github.com/Anjok07/ultimatevocalremovergui). 
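For orientation, the Python API this patch extends can be exercised roughly as in the sketch below. The `mdxc_params` keys mirror the defaults added in this patch; the `.ckpt` model filename and input path are illustrative, and the `load_model()`/`separate()` call pattern assumes the package's existing two-step API.

```python
from audio_separator.separator import Separator

# Sketch only: pick a real MDXC model filename from `audio-separator --list_models`.
separator = Separator(mdxc_params={"segment_size": 256, "batch_size": 1, "overlap": 8})
separator.load_model(model_filename="MDX23C-8KFFT-InstVoc_HQ.ckpt")

# separate() returns the paths of the stem files it wrote.
output_files = separator.separate("audio.wav")
print(output_files)
```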
@@ -136,8 +136,9 @@ Any file listed in the list models output can be specified (with file extension)
 usage: audio-separator [-h] [-v] [-d] [-e] [-l] [--log_level LOG_LEVEL] [-m MODEL_FILENAME] [--output_format OUTPUT_FORMAT] [--output_dir OUTPUT_DIR] [--model_file_dir MODEL_FILE_DIR]
                        [--invert_spect] [--normalization NORMALIZATION] [--single_stem SINGLE_STEM] [--sample_rate SAMPLE_RATE] [--mdx_segment_size MDX_SEGMENT_SIZE] [--mdx_overlap MDX_OVERLAP]
                        [--mdx_batch_size MDX_BATCH_SIZE] [--mdx_hop_length MDX_HOP_LENGTH] [--mdx_enable_denoise] [--vr_batch_size VR_BATCH_SIZE] [--vr_window_size VR_WINDOW_SIZE] [--vr_aggression VR_AGGRESSION] [--vr_enable_tta]
-                       [--vr_high_end_process] [--vr_enable_post_process] [--vr_post_process_threshold VR_POST_PROCESS_THRESHOLD] [--demucs_stem DEMUCS_STEM] [--demucs_segment_size DEMUCS_SEGMENT_SIZE]
-                       [--demucs_shifts DEMUCS_SHIFTS] [--demucs_overlap DEMUCS_OVERLAP] [--demucs_segments_enabled DEMUCS_SEGMENTS_ENABLED]
+                       [--vr_high_end_process] [--vr_enable_post_process] [--vr_post_process_threshold VR_POST_PROCESS_THRESHOLD] [--demucs_segment_size DEMUCS_SEGMENT_SIZE] [--demucs_shifts DEMUCS_SHIFTS]
+                       [--demucs_overlap DEMUCS_OVERLAP] [--demucs_segments_enabled DEMUCS_SEGMENTS_ENABLED] [--mdxc_segment_size MDXC_SEGMENT_SIZE] [--mdxc_use_model_segment_size] [--mdxc_overlap MDXC_OVERLAP]
+                       [--mdxc_batch_size MDXC_BATCH_SIZE] [--mdxc_pitch_shift MDXC_PITCH_SHIFT]
                        [audio_file]
 
 Separate audio file into different stems.
@@ -149,11 +150,11 @@ options:
   -h, --help            show this help message and exit
 
 Info and Debugging:
-  -v, --version         show program's version number and exit
-  -d, --debug           enable debug logging, equivalent to --log_level=debug
-  -e, --env_info        print environment information and exit.
-  -l, --list_models     list all supported models and exit.
-  --log_level LOG_LEVEL log level, e.g. info, debug, warning (default: info)
+  -v, --version         Show the program's version number and exit.
+  -d, --debug           Enable debug logging, equivalent to --log_level=debug.
+  -e, --env_info        Print environment information and exit.
+  -l, --list_models     List all supported models and exit.
+  --log_level LOG_LEVEL Log level, e.g. info, debug, warning (default: info).
 
 Separation I/O Params:
   -m MODEL_FILENAME, --model_filename MODEL_FILENAME model to use for separation (default: UVR-MDX-NET-Inst_HQ_3.onnx). Example: -m 2_HP-UVR.pth
@@ -164,7 +165,7 @@ Separation I/O Params:
 Common Separation Parameters:
   --invert_spect        invert secondary stem using spectrogram (default: False). Example: --invert_spect
   --normalization NORMALIZATION max peak amplitude to normalize input and output audio to (default: 0.9). Example: --normalization=0.7
-  --single_stem SINGLE_STEM output only single stem, either instrumental or vocals. Example: --single_stem=instrumental
+  --single_stem SINGLE_STEM output only single stem, e.g. Instrumental, Vocals, Drums, Bass, Guitar, Piano, Other. Example: --single_stem=Instrumental
   --sample_rate SAMPLE_RATE modify the sample rate of the output audio (default: 44100). Example: --sample_rate=44100
 
 MDX Architecture Parameters:
@@ -184,11 +185,17 @@ VR Architecture Parameters:
   --vr_post_process_threshold VR_POST_PROCESS_THRESHOLD threshold for post_process feature: 0.1-0.3 (default: 0.2). Example: --vr_post_process_threshold=0.1
 
 Demucs Architecture Parameters:
-  --demucs_stem DEMUCS_STEM stem to extract from audio file, e.g. Vocals, Drums, Bass, Other (default: All Stems).
Example: --demucs_stem=vocals --demucs_segment_size DEMUCS_SEGMENT_SIZE size of segments into which the audio is split, 1-100. higher = slower but better quality (default: Default). Example: --demucs_segment_size=256 --demucs_shifts DEMUCS_SHIFTS number of predictions with random shifts, higher = slower but better quality (default: 2). Example: --demucs_shifts=4 --demucs_overlap DEMUCS_OVERLAP overlap between prediction windows, 0.001-0.999. higher = slower but better quality (default: 0.25). Example: --demucs_overlap=0.25 --demucs_segments_enabled DEMUCS_SEGMENTS_ENABLED enable segment-wise processing (default: True). Example: --demucs_segments_enabled=False + +MDXC Architecture Parameters: + --mdxc_segment_size MDXC_SEGMENT_SIZE larger consumes more resources, but may give better results (default: 256). Example: --mdxc_segment_size=256 + --mdxc_use_model_segment_size use model default segment size instead of the value from the config file. Example: --mdxc_use_model_segment_size + --mdxc_overlap MDXC_OVERLAP amount of overlap between prediction windows, 2-50. higher is better but slower (default: 8). Example: --mdxc_overlap=8 + --mdxc_batch_size MDXC_BATCH_SIZE larger consumes more RAM but may process slightly faster (default: 1). Example: --mdxc_batch_size=4 + --mdxc_pitch_shift MDXC_PITCH_SHIFT shift audio pitch by a number of semitones while processing. may improve output for deep/high vocals. (default: 0). Example: --mdxc_pitch_shift=2 ``` ### As a Dependency in a Python Project @@ -348,6 +355,7 @@ This project is licensed under the MIT [License](LICENSE). - [Kuielab & Woosung Choi](https://github.com/kuielab) - Developed the original MDX-Net AI code. - [KimberleyJSN](https://github.com/KimberleyJensen) - Advised and aided the implementation of the training scripts for MDX-Net and Demucs. Thank you! - [Hv](https://github.com/NaJeongMo/Colab-for-MDX_B) - Helped implement chunks into the MDX-Net AI code. Thank you! +- [zhzhongshi](https://github.com/zhzhongshi) - Helped add support for the MDXC models in `audio-separator`. Thank you! ## Contact 💌 diff --git a/audio_separator/separator/architectures/__init__.py b/audio_separator/separator/architectures/__init__.py index 9139bab..5d1de84 100644 --- a/audio_separator/separator/architectures/__init__.py +++ b/audio_separator/separator/architectures/__init__.py @@ -1,3 +1,4 @@ from .mdx_separator import MDXSeparator from .vr_separator import VRSeparator from .demucs_separator import DemucsSeparator +from .mdxc_separator import MDXCSeparator \ No newline at end of file diff --git a/audio_separator/separator/architectures/mdx_separator.py b/audio_separator/separator/architectures/mdx_separator.py index 4d2e687..e74ec48 100644 --- a/audio_separator/separator/architectures/mdx_separator.py +++ b/audio_separator/separator/architectures/mdx_separator.py @@ -90,8 +90,28 @@ def __init__(self, common_config, arch_config): # We haven't implemented support for the checkpoint models here, so we're not using it. 
# self.dim_c = 4 - # Loading the model for inference + self.load_model() + + self.n_bins = 0 + self.trim = 0 + self.chunk_size = 0 + self.gen_size = 0 + self.stft = None + + self.primary_source = None + self.secondary_source = None + self.audio_file_path = None + self.audio_file_base = None + self.secondary_source_map = None + self.primary_source_map = None + + def load_model(self): + """ + Load the model into memory from file on disk, initialize it with config from the model data, + and prepare for inferencing using hardware accelerated Torch device. + """ self.logger.debug("Loading ONNX model for inference...") + if self.segment_size == self.dim_t: ort_session_options = ort.SessionOptions() if self.log_level > 10: @@ -107,19 +127,6 @@ def __init__(self, common_config, arch_config): self.model_run.to(self.torch_device).eval() self.logger.warning("Model converted from onnx to pytorch due to segment size not matching dim_t, processing may be slower.") - self.n_bins = 0 - self.trim = 0 - self.chunk_size = 0 - self.gen_size = 0 - self.stft = None - - self.primary_source = None - self.secondary_source = None - self.audio_file_path = None - self.audio_file_base = None - self.secondary_source_map = None - self.primary_source_map = None - def separate(self, audio_file_path): """ Separates the audio file into primary and secondary sources based on the model's configuration. diff --git a/audio_separator/separator/architectures/mdxc_separator.py b/audio_separator/separator/architectures/mdxc_separator.py new file mode 100644 index 0000000..115b1fa --- /dev/null +++ b/audio_separator/separator/architectures/mdxc_separator.py @@ -0,0 +1,257 @@ +import os +import sys + +import torch +import numpy as np +from tqdm import tqdm +from ml_collections import ConfigDict + +from audio_separator.separator.common_separator import CommonSeparator +from audio_separator.separator.uvr_lib_v5.tfc_tdf_v3 import TFC_TDF_net +from audio_separator.separator.uvr_lib_v5 import spec_utils + + +class MDXCSeparator(CommonSeparator): + """ + MDXCSeparator is responsible for separating audio sources using MDXC models. + It initializes with configuration parameters and prepares the model for separation tasks. + """ + + def __init__(self, common_config, arch_config): + # Any configuration values which can be shared between architectures should be set already in CommonSeparator, + # e.g. user-specified functionality choices (self.output_single_stem) or common model parameters (self.primary_stem_name) + super().__init__(config=common_config) + + # Model data is basic overview metadata about the model, e.g. which stem is primary and whether it's a karaoke model + # It's loaded in from model_data_new.json in Separator.load_model and there are JSON examples in that method + # The instance variable self.model_data is passed through from Separator and set in CommonSeparator + self.logger.debug(f"Model data: {self.model_data}") + + # Arch Config is the MDXC architecture specific user configuration options, which should all be configurable by the user + # either by their Separator class instantiation or by passing in a CLI parameter. + # While there are similarities between architectures for some of these (e.g. batch_size), they are deliberately configured + # this way as they have architecture-specific default values. 
+ self.segment_size = arch_config.get("segment_size", 256) + self.use_model_segment_size = arch_config.get("use_model_segment_size", False) + + self.overlap = arch_config.get("overlap", 8) + self.batch_size = arch_config.get("batch_size", 1) + + # Amount of pitch shift to apply during processing (this does NOT affect the pitch of the output audio): + # • Whole numbers indicate semitones. + # • Using higher pitches may cut the upper bandwidth, even in high-quality models. + # • Upping the pitch can be better for tracks with deeper vocals. + # • Dropping the pitch may take more processing time but works well for tracks with high-pitched vocals. + self.pitch_shift = arch_config.get("pitch_shift", 0) + + self.logger.debug(f"MDXC arch params: batch_size={self.batch_size}, segment_size={self.segment_size}, overlap={self.overlap}") + self.logger.debug(f"MDXC arch params: use_model_segment_size={self.use_model_segment_size}, pitch_shift={self.pitch_shift}") + + self.load_model() + + self.primary_source = None + self.secondary_source = None + self.audio_file_path = None + self.audio_file_base = None + self.primary_source_map = None + self.secondary_source_map = None + + self.logger.info("MDXC Separator initialisation complete") + + def load_model(self): + """ + Load the model into memory from file on disk, initialize it with config from the model data, + and prepare for inferencing using hardware accelerated Torch device. + """ + self.logger.debug("Loading checkpoint model for inference...") + + self.model_data_cfgdict = ConfigDict(self.model_data) + + try: + self.model_run = TFC_TDF_net(self.model_data_cfgdict, device=self.torch_device) + self.model_run.load_state_dict(torch.load(self.model_path, map_location="cpu")) + self.model_run.to(self.torch_device).eval() + except RuntimeError as e: + self.logger.error(f"Error: {e}") + self.logger.error("An error occurred while loading the model file. This often occurs when the model file is corrupt or incomplete.") + self.logger.error(f"Please try deleting the model file from {self.model_path} and run audio-separator again to re-download it.") + sys.exit(1) + + def separate(self, audio_file_path): + """ + Separates the audio file into primary and secondary sources based on the model's configuration. + It processes the mix, demixes it into sources, normalizes the sources, and saves the output files. + + Args: + audio_file_path (str): The path to the audio file to be processed. + + Returns: + list: A list of paths to the output files generated by the separation process. 
+ """ + self.primary_source = None + self.secondary_source = None + + self.audio_file_path = audio_file_path + self.audio_file_base = os.path.splitext(os.path.basename(audio_file_path))[0] + + self.logger.debug("Preparing mix...") + mix = self.prepare_mix(self.audio_file_path) + + self.logger.debug("Normalizing mix before demixing...") + mix = spec_utils.normalize(wave=mix, max_peak=self.normalization_threshold) + + source = self.demix(mix=mix) + self.logger.debug("Demixing completed.") + + output_files = [] + self.logger.debug("Processing output files...") + + if not isinstance(self.primary_source, np.ndarray): + self.logger.debug("Normalizing primary source...") + self.primary_source = spec_utils.normalize(wave=source[self.primary_stem_name], max_peak=self.normalization_threshold).T + + if not isinstance(self.secondary_source, np.ndarray): + self.logger.debug("Normalizing secondary source...") + self.secondary_source = spec_utils.normalize(wave=source[self.secondary_stem_name], max_peak=self.normalization_threshold).T + + if not self.output_single_stem or self.output_single_stem.lower() == self.secondary_stem_name.lower(): + self.logger.info(f"Saving {self.secondary_stem_name} stem...") + if not self.secondary_stem_output_path: + self.secondary_stem_output_path = os.path.join(f"{self.audio_file_base}_({self.secondary_stem_name})_{self.model_name}.{self.output_format.lower()}") + self.secondary_source_map = self.final_process(self.secondary_stem_output_path, self.secondary_source, self.secondary_stem_name) + output_files.append(self.secondary_stem_output_path) + + if not self.output_single_stem or self.output_single_stem.lower() == self.primary_stem_name.lower(): + self.logger.info(f"Saving {self.primary_stem_name} stem...") + if not self.primary_stem_output_path: + self.primary_stem_output_path = os.path.join(f"{self.audio_file_base}_({self.primary_stem_name})_{self.model_name}.{self.output_format.lower()}") + if not isinstance(self.primary_source, np.ndarray): + self.primary_source = source.T + self.primary_source_map = self.final_process(self.primary_stem_output_path, self.primary_source, self.primary_stem_name) + output_files.append(self.primary_stem_output_path) + return output_files + + def pitch_fix(self, source, sr_pitched, org_mix): + """ + Change the pitch of the source audio by a number of semitones. + + Args: + source (np.ndarray): The source audio to be pitch-shifted. + sr_pitched (int): The sample rate of the pitch-shifted audio. + org_mix (np.ndarray): The original mix, used to match the shape of the pitch-shifted audio. + + Returns: + np.ndarray: The pitch-shifted source audio. + """ + source = spec_utils.change_pitch_semitones(source, sr_pitched, semitone_shift=self.pitch_shift)[0] + source = spec_utils.match_array_shapes(source, org_mix) + return source + + def demix(self, mix: np.ndarray) -> dict: + """ + Demixes the input mix into primary and secondary sources using the model and model data. + + Args: + mix (np.ndarray): The mix to be demixed. + Returns: + dict: A dictionary containing the demixed sources. 
+ """ + orig_mix = mix + + if self.pitch_shift != 0: + self.logger.debug(f"Shifting pitch by -{self.pitch_shift} semitones...") + mix, sample_rate = spec_utils.change_pitch_semitones(mix, self.sample_rate, semitone_shift=-self.pitch_shift) + + mix = torch.tensor(mix, dtype=torch.float32) + + try: + num_stems = self.model_run.num_target_instruments + except AttributeError: + num_stems = self.model_run.module.num_target_instruments + self.logger.debug(f"Number of stems: {num_stems}") + + if self.use_model_segment_size: + mdx_segment_size = self.model_data_cfgdict.inference.dim_t + self.logger.debug(f"Using model default segment size: {mdx_segment_size}") + else: + mdx_segment_size = self.segment_size + self.logger.debug(f"Using configured segment size: {mdx_segment_size}") + + chunk_size = self.model_data_cfgdict.audio.hop_length * (mdx_segment_size - 1) + self.logger.debug(f"Chunk size: {chunk_size}") + + hop_size = chunk_size // self.overlap + self.logger.debug(f"Hop size: {hop_size}") + + mix_shape = mix.shape[1] + pad_size = hop_size - (mix_shape - chunk_size) % hop_size + self.logger.debug(f"Pad size: {pad_size}") + + mix = torch.cat([torch.zeros(2, chunk_size - hop_size), mix, torch.zeros(2, pad_size + chunk_size - hop_size)], 1) + self.logger.debug(f"Mix shape: {mix.shape}") + + chunks = mix.unfold(1, chunk_size, hop_size).transpose(0, 1) + self.logger.debug(f"Chunks length: {len(chunks)} and shape: {chunks.shape}") + + batches = [chunks[i : i + self.batch_size] for i in range(0, len(chunks), self.batch_size)] + self.logger.debug(f"Batch size: {self.batch_size}, number of batches: {len(batches)}") + + # accumulated_outputs is used to accumulate the output from processing each batch of chunks through the model. + # It starts as a tensor of zeros and is updated in-place as the model processes each batch. + # The variable holds the combined result of all processed batches, which, after post-processing, represents the separated audio sources. + accumulated_outputs = torch.zeros(num_stems, *mix.shape) if num_stems > 1 else torch.zeros_like(mix) + accumulated_outputs = accumulated_outputs.to(self.torch_device) + + with torch.no_grad(): + count = 0 + for batch in tqdm(batches): + # Since the model processes the audio data in batches, single_batch_result temporarily holds the model's output + # for each batch before it is accumulated into accumulated_outputs. + single_batch_result = self.model_run(batch.to(self.torch_device)) + + # Each individual output tensor from the current batch's processing result. + # Since single_batch_result can contain multiple output tensors (one for each piece of audio in the batch), + # individual_output is used to iterate through these tensors and accumulate them into accumulated_outputs. + for individual_output in single_batch_result: + accumulated_outputs[..., count * hop_size : count * hop_size + chunk_size] += individual_output + count += 1 + + self.logger.debug("Calculating inferenced outputs based on accumulated outputs and overlap") + inferenced_outputs = accumulated_outputs[..., chunk_size - hop_size : -(pad_size + chunk_size - hop_size)] / self.overlap + self.logger.debug("Deleting accumulated outputs to free up memory") + del accumulated_outputs + + if num_stems > 1: + self.logger.debug("Number of stems is greater than 1, detaching individual sources and correcting pitch if necessary...") + + sources = {} + + # Iterates over each instrument specified in the model's configuration and its corresponding separated audio source. 
+            # self.model_data_cfgdict.training.instruments provides the list of stems.
+            # inferenced_outputs.cpu().detach().numpy() converts the separated sources tensor to a NumPy array for processing.
+            # Each iteration provides an instrument name ('key') and its separated audio ('value') for further processing.
+            for key, value in zip(self.model_data_cfgdict.training.instruments, inferenced_outputs.cpu().detach().numpy()):
+                self.logger.debug(f"Processing instrument: {key}")
+                if self.pitch_shift != 0:
+                    self.logger.debug(f"Applying pitch correction for {key}")
+                    sources[key] = self.pitch_fix(value, sample_rate, orig_mix)
+                else:
+                    sources[key] = value
+
+            self.logger.debug("Deleting inferenced outputs to free up memory")
+            del inferenced_outputs
+
+            self.logger.debug("Returning separated sources")
+            return sources
+
+        self.logger.debug("Detaching inferenced output for single instrument scenario")
+        inferenced_output = inferenced_outputs.cpu().detach().numpy()
+        self.logger.debug("Deleting inferenced outputs to free up memory")
+        del inferenced_outputs
+
+        if self.pitch_shift != 0:
+            self.logger.debug("Applying pitch correction for single instrument")
+            return self.pitch_fix(inferenced_output, sample_rate, orig_mix)
+
+        self.logger.debug("Returning inferenced output for single instrument")
+        return inferenced_output
diff --git a/audio_separator/separator/separator.py b/audio_separator/separator/separator.py
index 2fe9345..13e1db5 100644
--- a/audio_separator/separator/separator.py
+++ b/audio_separator/separator/separator.py
@@ -4,18 +4,17 @@
 import os
 import platform
 import subprocess
+import time
+import logging
+import warnings
 import hashlib
 import json
 import yaml
-import time
-import logging
-import warnings
 import requests
 import torch
 import onnxruntime as ort
-
-from audio_separator.separator.common_separator import CommonSeparator
+from tqdm import tqdm
 
 
 class Separator:
@@ -78,6 +77,7 @@ def __init__(
         mdx_params={"hop_length": 1024, "segment_size": 256, "overlap": 0.25, "batch_size": 1, "enable_denoise": False},
         vr_params={"batch_size": 16, "window_size": 512, "aggression": 5, "enable_tta": False, "enable_post_process": False, "post_process_threshold": 0.2, "high_end_process": False},
         demucs_params={"segment_size": "Default", "shifts": 2, "overlap": 0.25, "segments_enabled": True},
+        mdxc_params={"segment_size": 256, "batch_size": 1, "overlap": 8},
     ):
         self.logger = logging.getLogger(__name__)
         self.logger.setLevel(log_level)
@@ -133,7 +133,7 @@ def __init__(
 
         # These are parameters which users may want to configure so we expose them to the top-level Separator class,
         # even though they are specific to a single model architecture
-        self.arch_specific_params = {"MDX": mdx_params, "VR": vr_params, "Demucs": demucs_params}
+        self.arch_specific_params = {"MDX": mdx_params, "VR": vr_params, "Demucs": demucs_params, "MDXC": mdxc_params}
 
         self.torch_device = None
         self.torch_device_cpu = None
@@ -293,9 +293,14 @@
         response = requests.get(url, stream=True, timeout=300)
         if response.status_code == 200:
+            total_size_in_bytes = int(response.headers.get("content-length", 0))
+            progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
+
             with open(output_path, "wb") as f:
                 for chunk in response.iter_content(chunk_size=8192):
+                    progress_bar.update(len(chunk))
                     f.write(chunk)
+            progress_bar.close()
         else:
             raise RuntimeError(f"Failed to download file from {url}, response code: {response.status_code}")
 
@@ -358,8 +363,8 @@ def
list_supported_model_files(self): "VR": model_downloads_list["vr_download_list"], "MDX": model_downloads_list["mdx_download_list"], "Demucs": filtered_demucs_v4, + "MDXC": model_downloads_list["mdx23c_download_list"], # "MDX23": model_downloads_list["mdx23_download_list"], - # "MDX23C": model_downloads_list["mdx23c_download_list"], } return model_files_grouped_by_type @@ -372,6 +377,8 @@ def download_model_files(self, model_filename): supported_model_files_grouped = self.list_supported_model_files() model_repo_url_prefix = "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models" + yaml_config_filename = None + self.logger.debug(f"Searching for model_filename {model_filename} in supported_model_files_grouped") for model_type, model_list in supported_model_files_grouped.items(): for model_friendly_name, model_download_list in model_list.items(): @@ -382,7 +389,7 @@ def download_model_files(self, model_filename): self.download_file_if_not_exists(f"{model_repo_url_prefix}/{model_filename}", model_path) self.logger.debug(f"Returning path for single model file: {model_path}") - return model_type, model_friendly_name, model_path + return model_type, model_friendly_name, model_path, yaml_config_filename # If it's a dict, iterate through each entry check if any of them match model_filename # If the value is a full URL, download it from that URL. @@ -397,24 +404,51 @@ def download_model_files(self, model_filename): if this_model_matches_input_filename: self.logger.debug(f"Multi-file model identified: {model_friendly_name}, iterating through files to download") - for file_name, file_url in model_download_list.items(): - # Demucs models have full URLs to download from Facebook repos - if file_url.startswith("http"): - full_file_url = file_url - # Checkpoint models apparently use the key as the filename, the value is a filename of a YAML config file... - elif file_name.endswith(".ckpt"): - full_file_url = f"{model_repo_url_prefix}/{file_name}" - # MDX and VR models have the model filename as the value mapped to the model name - else: - full_file_url = f"{model_repo_url_prefix}/{file_url}" + for config_key, config_value in model_download_list.items(): + self.logger.debug(f"Attempting to identify download URL for config pair: {config_key} -> {config_value}") - self.download_file_if_not_exists(full_file_url, os.path.join(self.model_file_dir, file_name)) + # Demucs models have full URLs to download from Facebook repos, and config_key is set to the file name + if config_value.startswith("http"): + self.download_file_if_not_exists(config_value, os.path.join(self.model_file_dir, config_key)) + + # Checkpoint models apparently use config_key as the model filename, but the value is a YAML config file name... + # Both need to be downloaded, but the model data YAML file actually comes from the application data repo... 
+ elif config_key.endswith(".ckpt"): + download_url = f"{model_repo_url_prefix}/{config_key}" + self.download_file_if_not_exists(download_url, os.path.join(self.model_file_dir, config_key)) + + # For MDXC models, the config_value is the YAML file which needs to be downloaded separately from the application_data repo + yaml_config_filename = config_value + yaml_config_filepath = os.path.join(self.model_file_dir, yaml_config_filename) + + # Repo for model data and configuration sources from UVR + model_data_url_prefix = "https://raw.githubusercontent.com/TRvlvr/application_data/main" + yaml_config_url = f"{model_data_url_prefix}/mdx_model_data/mdx_c_configs/{yaml_config_filename}" + + self.download_file_if_not_exists(f"{yaml_config_url}", yaml_config_filepath) + + # MDX and VR models have config_value set to the model filename + else: + download_url = f"{model_repo_url_prefix}/{config_value}" + self.download_file_if_not_exists(download_url, os.path.join(self.model_file_dir, config_value)) self.logger.debug(f"All files downloaded for model {model_friendly_name}, returning initial path {model_path}") - return model_type, model_friendly_name, model_path + return model_type, model_friendly_name, model_path, yaml_config_filename raise ValueError(f"Model file {model_filename} not found in supported model files") + def load_model_data_from_yaml(self, yaml_config_filename): + """ + This method loads model-specific parameters from the YAML file for that model. + The parameters in the YAML are critical to inferencing, as they need to match whatever was used during training. + """ + model_data_yaml_filepath = os.path.join(self.model_file_dir, yaml_config_filename) + self.logger.debug(f"Loading model data from YAML at path {model_data_yaml_filepath}") + + model_data = yaml.load(open(model_data_yaml_filepath, encoding="utf-8"), Loader=yaml.FullLoader) + self.logger.debug(f"Model data loaded from YAML file: {model_data}") + return model_data + def load_model_data_using_hash(self, model_path): """ This method loads model-specific parameters from UVR model data files. 
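To make the new YAML path concrete: the keys that `MDXCSeparator` actually reads from the loaded config are `audio.hop_length`, `inference.dim_t` and `training.instruments` (see `demix` above). A minimal sketch of that access pattern, with illustrative values rather than any real model's config:

```python
from ml_collections import ConfigDict

# Illustrative values only -- the real ones come from the downloaded YAML config.
model_data = {
    "audio": {"hop_length": 1024},
    "inference": {"dim_t": 256},
    "training": {"instruments": ["Vocals", "Instrumental"]},
}
cfg = ConfigDict(model_data)

# Mirrors the chunking arithmetic in MDXCSeparator.demix:
segment_size = cfg.inference.dim_t                       # model default segment size
chunk_size = cfg.audio.hop_length * (segment_size - 1)   # 1024 * 255 = 261120 samples
hop_size = chunk_size // 8                               # with the default overlap of 8
```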
@@ -426,7 +460,6 @@ def load_model_data_using_hash(self, model_path): vr_model_data_url = f"{model_data_url_prefix}/vr_model_data/model_data_new.json" mdx_model_data_url = f"{model_data_url_prefix}/mdx_model_data/model_data_new.json" - mdx_c_configs_url_prefix = f"{model_data_url_prefix}/mdx_model_data/mdx_c_configs" # Calculate hash for the downloaded model self.logger.debug("Calculating MD5 hash for model file to identify model parameters from UVR data...") @@ -527,35 +560,16 @@ def load_model_data_using_hash(self, model_path): elif model_hash in vr_model_data_object: model_data = vr_model_data_object[model_hash] else: - raise ValueError(f"Unsupported Model File: parameters for MD5 hash {model_hash} could not be found in the UVR model data file.") + raise ValueError(f"Unsupported Model File: parameters for MD5 hash {model_hash} could not be found in UVR model data file for MDX or VR arch.") self.logger.debug(f"Model data loaded from UVR JSON using hash {model_hash}: {model_data}") - # MDX_C model data is actually stored in a YAML file which has to be downloaded separately - if "config_yaml" in model_data: - self.logger.debug(f"Model data had config_yaml key, fetching actual model data YAML from MDX_C config URL") - - mdxc_model_data_path = os.path.join(self.model_file_dir, model_data["config_yaml"]) - self.logger.debug(f"MDX_C model data path set to {mdxc_model_data_path}") - self.download_file_if_not_exists(f"{mdx_c_configs_url_prefix}/{model_data['config_yaml']}", mdxc_model_data_path) - - model_data = self.load_model_data_from_yaml(mdxc_model_data_path) - - return model_data - - def load_model_data_from_yaml(self, model_path): - """ - This method loads model-specific parameters from the YAML file for that model. - The parameters in the YAML are critical to inferencing, as they need to match whatever was used during training. - """ - model_data = yaml.load(open(model_path, encoding="utf-8"), Loader=yaml.FullLoader) - - self.logger.debug(f"Model data loaded from YAML file: {model_data}") return model_data def load_model(self, model_filename="UVR-MDX-NET-Inst_HQ_3.onnx"): """ - This method loads the separation model into memory, downloading it first if necessary. + This method instantiates the architecture-specific separation class, + loading the separation model into memory, downloading it first if necessary. 
""" self.logger.info(f"Loading model {model_filename}...") @@ -563,11 +577,14 @@ def load_model(self, model_filename="UVR-MDX-NET-Inst_HQ_3.onnx"): # Setting up the model path model_name = model_filename.split(".")[0] - model_type, model_friendly_name, model_path = self.download_model_files(model_filename) - self.logger.debug(f"Model downloaded, friendly name: {model_friendly_name}") + model_type, model_friendly_name, model_path, yaml_config_filename = self.download_model_files(model_filename) + self.logger.debug(f"Model downloaded, friendly name: {model_friendly_name}, model_path: {model_path}") if model_path.lower().endswith(".yaml"): - model_data = self.load_model_data_from_yaml(model_path) + yaml_config_filename = model_path + + if yaml_config_filename is not None: + model_data = self.load_model_data_from_yaml(yaml_config_filename) else: model_data = self.load_model_data_using_hash(model_path) @@ -595,12 +612,13 @@ def load_model(self, model_filename="UVR-MDX-NET-Inst_HQ_3.onnx"): raise ValueError(f"Model type not supported (yet): {model_type}") # Instantiate the appropriate separator class depending on the model type - separator_classes = {"MDX": "MDXSeparator", "VR": "VRSeparator", "Demucs": "DemucsSeparator"} + separator_classes = {"MDX": "MDXSeparator", "VR": "VRSeparator", "Demucs": "DemucsSeparator", "MDXC": "MDXCSeparator"} if model_type not in separator_classes: raise ValueError(f"Model type not supported (yet): {model_type}") module = __import__("audio_separator.separator.architectures", fromlist=[separator_classes[model_type]]) + separator_class = getattr(module, separator_classes[model_type]) self.model_instance = separator_class(common_config=common_params, arch_config=self.arch_specific_params[model_type]) diff --git a/audio_separator/separator/uvr_lib_v5/__init__.py b/audio_separator/separator/uvr_lib_v5/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/audio_separator/utils/cli.py b/audio_separator/utils/cli.py index 3600e8e..23afd17 100755 --- a/audio_separator/utils/cli.py +++ b/audio_separator/utils/cli.py @@ -2,8 +2,11 @@ import argparse import logging import json +import sys from importlib import metadata +from audio_separator.separator import Separator + def main(): """Main entry point for the CLI.""" @@ -20,69 +23,96 @@ def main(): package_version = metadata.distribution("audio-separator").version + version_help = "Show the program's version number and exit." + debug_help = "Enable debug logging, equivalent to --log_level=debug." + env_info_help = "Print environment information and exit." + list_models_help = "List all supported models and exit." + log_level_help = "Log level, e.g. info, debug, warning (default: %(default)s)." + info_params = parser.add_argument_group("Info and Debugging") - info_params.add_argument("-v", "--version", action="version", version=f"%(prog)s {package_version}") - info_params.add_argument("-d", "--debug", action="store_true", help="enable debug logging, equivalent to --log_level=debug") - info_params.add_argument("-e", "--env_info", action="store_true", help="print environment information and exit.") - info_params.add_argument("-l", "--list_models", action="store_true", help="list all supported models and exit.") - info_params.add_argument("--log_level", default="info", help="log level, e.g. 
info, debug, warning (default: %(default)s)")
+    info_params.add_argument("-v", "--version", action="version", version=f"%(prog)s {package_version}", help=version_help)
+    info_params.add_argument("-d", "--debug", action="store_true", help=debug_help)
+    info_params.add_argument("-e", "--env_info", action="store_true", help=env_info_help)
+    info_params.add_argument("-l", "--list_models", action="store_true", help=list_models_help)
+    info_params.add_argument("--log_level", default="info", help=log_level_help)
+
+    model_filename_help = "model to use for separation (default: %(default)s). Example: -m 2_HP-UVR.pth"
+    output_format_help = "output format for separated files, any common format (default: %(default)s). Example: --output_format=MP3"
+    output_dir_help = "directory to write output files (default: <current dir>). Example: --output_dir=/app/separated"
+    model_file_dir_help = "model files directory (default: %(default)s). Example: --model_file_dir=/app/models"
 
     io_params = parser.add_argument_group("Separation I/O Params")
-    io_params.add_argument("-m", "--model_filename", default="UVR-MDX-NET-Inst_HQ_3.onnx", help="model to use for separation (default: %(default)s). Example: -m 2_HP-UVR.pth")
-    io_params.add_argument("--output_format", default="FLAC", help="output format for separated files, any common format (default: %(default)s). Example: --output_format=MP3")
-    io_params.add_argument("--output_dir", default=None, help="directory to write output files (default: <current dir>). Example: --output_dir=/app/separated")
-    io_params.add_argument("--model_file_dir", default="/tmp/audio-separator-models/", help="model files directory (default: %(default)s). Example: --model_file_dir=/app/models")
+    io_params.add_argument("-m", "--model_filename", default="UVR-MDX-NET-Inst_HQ_3.onnx", help=model_filename_help)
+    io_params.add_argument("--output_format", default="FLAC", help=output_format_help)
+    io_params.add_argument("--output_dir", default=None, help=output_dir_help)
+    io_params.add_argument("--model_file_dir", default="/tmp/audio-separator-models/", help=model_file_dir_help)
+
+    invert_spect_help = "invert secondary stem using spectrogram (default: %(default)s). Example: --invert_spect"
+    normalization_help = "max peak amplitude to normalize input and output audio to (default: %(default)s). Example: --normalization=0.7"
+    single_stem_help = "output only single stem, e.g. Instrumental, Vocals, Drums, Bass, Guitar, Piano, Other. Example: --single_stem=Instrumental"
+    sample_rate_help = "modify the sample rate of the output audio (default: %(default)s). Example: --sample_rate=44100"
 
     common_params = parser.add_argument_group("Common Separation Parameters")
-    common_params.add_argument("--invert_spect", action="store_true", help="invert secondary stem using spectogram (default: %(default)s). Example: --invert_spect")
-    common_params.add_argument("--normalization", type=float, default=0.9, help="max peak amplitude to normalize input and output audio to (default: %(default)s). Example: --normalization=0.7")
-    common_params.add_argument("--single_stem", default=None, help="output only single stem, e.g. Instrumental, Vocals, Drums, Bass, Guitar, Piano, Other. Example: --single_stem=Instrumental")
-    common_params.add_argument("--sample_rate", type=int, default=44100, help="modify the sample rate of the output audio (default: %(default)s).
Example: --sample_rate=44100") + common_params.add_argument("--invert_spect", action="store_true", help=invert_spect_help) + common_params.add_argument("--normalization", type=float, default=0.9, help=normalization_help) + common_params.add_argument("--single_stem", default=None, help=single_stem_help) + common_params.add_argument("--sample_rate", type=int, default=44100, help=sample_rate_help) + + mdx_segment_size_help = "larger consumes more resources, but may give better results (default: %(default)s). Example: --mdx_segment_size=256" + mdx_overlap_help = "amount of overlap between prediction windows, 0.001-0.999. higher is better but slower (default: %(default)s). Example: --mdx_overlap=0.25" + mdx_batch_size_help = "larger consumes more RAM but may process slightly faster (default: %(default)s). Example: --mdx_batch_size=4" + mdx_hop_length_help = "usually called stride in neural networks, only change if you know what you're doing (default: %(default)s). Example: --mdx_hop_length=1024" + mdx_enable_denoise_help = "enable denoising during separation (default: %(default)s). Example: --mdx_enable_denoise" mdx_params = parser.add_argument_group("MDX Architecture Parameters") - mdx_params.add_argument("--mdx_segment_size", type=int, default=256, help="larger consumes more resources, but may give better results (default: %(default)s). Example: --mdx_segment_size=256") - mdx_params.add_argument( - "--mdx_overlap", type=float, default=0.25, help="amount of overlap between prediction windows, 0.001-0.999. higher is better but slower (default: %(default)s). Example: --mdx_overlap=0.25" - ) - mdx_params.add_argument("--mdx_batch_size", type=int, default=1, help="larger consumes more RAM but may process slightly faster (default: %(default)s). Example: --mdx_batch_size=4") - mdx_params.add_argument( - "--mdx_hop_length", type=int, default=1024, help="usually called stride in neural networks, only change if you know what you're doing (default: %(default)s). Example: --mdx_hop_length=1024" - ) - mdx_params.add_argument("--mdx_enable_denoise", action="store_true", help="enable denoising during separation (default: %(default)s). Example: --mdx_enable_denoise") + mdx_params.add_argument("--mdx_segment_size", type=int, default=256, help=mdx_segment_size_help) + mdx_params.add_argument("--mdx_overlap", type=float, default=0.25, help=mdx_overlap_help) + mdx_params.add_argument("--mdx_batch_size", type=int, default=1, help=mdx_batch_size_help) + mdx_params.add_argument("--mdx_hop_length", type=int, default=1024, help=mdx_hop_length_help) + mdx_params.add_argument("--mdx_enable_denoise", action="store_true", help=mdx_enable_denoise_help) + + vr_batch_size_help = "number of batches to process at a time. higher = more RAM, slightly faster processing (default: %(default)s). Example: --vr_batch_size=16" + vr_window_size_help = "balance quality and speed. 1024 = fast but lower, 320 = slower but better quality. (default: %(default)s). Example: --vr_window_size=320" + vr_aggression_help = "intensity of primary stem extraction, -100 - 100. typically 5 for vocals & instrumentals (default: %(default)s). Example: --vr_aggression=2" + vr_enable_tta_help = "enable Test-Time-Augmentation; slow but improves quality (default: %(default)s). Example: --vr_enable_tta" + vr_high_end_process_help = "mirror the missing frequency range of the output (default: %(default)s). 
Example: --vr_high_end_process" + vr_enable_post_process_help = "identify leftover artifacts within vocal output; may improve separation for some songs (default: %(default)s). Example: --vr_enable_post_process" + vr_post_process_threshold_help = "threshold for post_process feature: 0.1-0.3 (default: %(default)s). Example: --vr_post_process_threshold=0.1" vr_params = parser.add_argument_group("VR Architecture Parameters") - vr_params.add_argument( - "--vr_batch_size", type=int, default=4, help="number of batches to process at a time. higher = more RAM, slightly faster processing (default: %(default)s). Example: --vr_batch_size=16" - ) - vr_params.add_argument( - "--vr_window_size", type=int, default=512, help="balance quality and speed. 1024 = fast but lower, 320 = slower but better quality. (default: %(default)s). Example: --vr_window_size=320" - ) - vr_params.add_argument( - "--vr_aggression", type=int, default=5, help="intensity of primary stem extraction, -100 - 100. typically 5 for vocals & instrumentals (default: %(default)s). Example: --vr_aggression=2" - ) - vr_params.add_argument("--vr_enable_tta", action="store_true", help="enable Test-Time-Augmentation; slow but improves quality (default: %(default)s). Example: --vr_enable_tta") - vr_params.add_argument("--vr_high_end_process", action="store_true", help="mirror the missing frequency range of the output (default: %(default)s). Example: --vr_high_end_process") - vr_params.add_argument( - "--vr_enable_post_process", - action="store_true", - help="identify leftover artifacts within vocal output; may improve separation for some songs (default: %(default)s). Example: --vr_enable_post_process", - ) - vr_params.add_argument("--vr_post_process_threshold", type=float, default=0.2, help="threshold for post_process feature: 0.1-0.3 (default: %(default)s). Example: --vr_post_process_threshold=0.1") + vr_params.add_argument("--vr_batch_size", type=int, default=4, help=vr_batch_size_help) + vr_params.add_argument("--vr_window_size", type=int, default=512, help=vr_window_size_help) + vr_params.add_argument("--vr_aggression", type=int, default=5, help=vr_aggression_help) + vr_params.add_argument("--vr_enable_tta", action="store_true", help=vr_enable_tta_help) + vr_params.add_argument("--vr_high_end_process", action="store_true", help=vr_high_end_process_help) + vr_params.add_argument("--vr_enable_post_process", action="store_true", help=vr_enable_post_process_help) + vr_params.add_argument("--vr_post_process_threshold", type=float, default=0.2, help=vr_post_process_threshold_help) + + demucs_segment_size_help = "size of segments into which the audio is split, 1-100. higher = slower but better quality (default: %(default)s). Example: --demucs_segment_size=256" + demucs_shifts_help = "number of predictions with random shifts, higher = slower but better quality (default: %(default)s). Example: --demucs_shifts=4" + demucs_overlap_help = "overlap between prediction windows, 0.001-0.999. higher = slower but better quality (default: %(default)s). Example: --demucs_overlap=0.25" + demucs_segments_enabled_help = "enable segment-wise processing (default: %(default)s). Example: --demucs_segments_enabled=False" demucs_params = parser.add_argument_group("Demucs Architecture Parameters") - demucs_params.add_argument( - "--demucs_segment_size", - type=str, - default="Default", - help="size of segments into which the audio is split, 1-100. higher = slower but better quality (default: %(default)s). 
Example: --demucs_segment_size=256", - ) - demucs_params.add_argument( - "--demucs_shifts", type=int, default=2, help="number of predictions with random shifts, higher = slower but better quality (default: %(default)s). Example: --demucs_shifts=4" - ) - demucs_params.add_argument( - "--demucs_overlap", type=float, default=0.25, help="overlap between prediction windows, 0.001-0.999. higher = slower but better quality (default: %(default)s). Example: --demucs_overlap=0.25" + demucs_params.add_argument("--demucs_segment_size", type=str, default="Default", help=demucs_segment_size_help) + demucs_params.add_argument("--demucs_shifts", type=int, default=2, help=demucs_shifts_help) + demucs_params.add_argument("--demucs_overlap", type=float, default=0.25, help=demucs_overlap_help) + demucs_params.add_argument("--demucs_segments_enabled", type=bool, default=True, help=demucs_segments_enabled_help) + + mdxc_segment_size_help = "larger consumes more resources, but may give better results (default: %(default)s). Example: --mdxc_segment_size=256" + mdxc_use_model_segment_size_help = "use model default segment size instead of the value from the config file. Example: --mdxc_use_model_segment_size" + mdxc_overlap_help = "amount of overlap between prediction windows, 2-50. higher is better but slower (default: %(default)s). Example: --mdxc_overlap=8" + mdxc_batch_size_help = "larger consumes more RAM but may process slightly faster (default: %(default)s). Example: --mdxc_batch_size=4" + mdxc_pitch_shift_help = ( + "shift audio pitch by a number of semitones while processing. may improve output for deep/high vocals. (default: %(default)s). Example: --mdxc_pitch_shift=2" ) - demucs_params.add_argument("--demucs_segments_enabled", type=bool, default=True, help="enable segment-wise processing (default: %(default)s). 
Example: --demucs_segments_enabled=False") + + mdxc_params = parser.add_argument_group("MDXC Architecture Parameters") + mdxc_params.add_argument("--mdxc_segment_size", type=int, default=256, help=mdxc_segment_size_help) + mdxc_params.add_argument("--mdxc_use_model_segment_size", action="store_true", help=mdxc_use_model_segment_size_help) + mdxc_params.add_argument("--mdxc_overlap", type=int, default=8, help=mdxc_overlap_help) + mdxc_params.add_argument("--mdxc_batch_size", type=int, default=1, help=mdxc_batch_size_help) + mdxc_params.add_argument("--mdxc_pitch_shift", type=int, default=0, help=mdxc_pitch_shift_help) args = parser.parse_args() @@ -94,27 +124,20 @@ def main(): logger.setLevel(log_level) if args.env_info: - from audio_separator.separator import Separator - separator = Separator() - exit(0) + sys.exit(0) if args.list_models: - from audio_separator.separator import Separator - separator = Separator() print(json.dumps(separator.list_supported_model_files(), indent=4, sort_keys=True)) - exit(0) + sys.exit(0) if not hasattr(args, "audio_file"): parser.print_help() - exit(1) + sys.exit(1) logger.info(f"Separator version {package_version} beginning with input file: {args.audio_file}") - # Deliberately import here to avoid loading slow dependencies when just running --help - from audio_separator.separator import Separator - separator = Separator( log_formatter=log_formatter, log_level=log_level, @@ -141,11 +164,13 @@ def main(): "post_process_threshold": args.vr_post_process_threshold, "high_end_process": args.vr_high_end_process, }, - demucs_params={ - "segment_size": args.demucs_segment_size, - "shifts": args.demucs_shifts, - "overlap": args.demucs_overlap, - "segments_enabled": args.demucs_segments_enabled, + demucs_params={"segment_size": args.demucs_segment_size, "shifts": args.demucs_shifts, "overlap": args.demucs_overlap, "segments_enabled": args.demucs_segments_enabled}, + mdxc_params={ + "segment_size": args.mdxc_segment_size, + "batch_size": args.mdxc_batch_size, + "overlap": args.mdxc_overlap, + "use_model_segment_size": args.mdxc_use_model_segment_size, + "pitch_shift": args.mdxc_pitch_shift, }, ) diff --git a/poetry.lock b/poetry.lock index dd0aec5..89ae82a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,15 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. + +[[package]] +name = "absl-py" +version = "2.1.0" +description = "Abseil Python Common Libraries, see https://github.com/abseil/abseil-py." 
+optional = false +python-versions = ">=3.7" +files = [ + {file = "absl-py-2.1.0.tar.gz", hash = "sha256:7820790efbb316739cde8b4e19357243fc3608a152024288513dd968d7d959ff"}, + {file = "absl_py-2.1.0-py3-none-any.whl", hash = "sha256:526a04eadab8b4ee719ce68f204172ead1027549089702d99b9059f129ff1308"}, +] [[package]] name = "audioread" @@ -276,6 +287,17 @@ humanfriendly = ">=9.1" [package.extras] cron = ["capturer (>=2.4)"] +[[package]] +name = "contextlib2" +version = "21.6.0" +description = "Backports and enhancements for the contextlib module" +optional = false +python-versions = ">=3.6" +files = [ + {file = "contextlib2-21.6.0-py2.py3-none-any.whl", hash = "sha256:3fbdb64466afd23abaf6c977627b75b6139a5a3e8ce38405c5b413aed7a0471f"}, + {file = "contextlib2-21.6.0.tar.gz", hash = "sha256:ab1e2bfe1d01d968e1b7e8d9023bc51ef3509bba217bb730cee3827e1ee82869"}, +] + [[package]] name = "coverage" version = "7.4.3" @@ -777,6 +799,22 @@ files = [ {file = "MarkupSafe-2.1.5.tar.gz", hash = "sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b"}, ] +[[package]] +name = "ml-collections" +version = "0.1.1" +description = "ML Collections is a library of Python collections designed for ML usecases." +optional = false +python-versions = ">=2.6" +files = [ + {file = "ml_collections-0.1.1.tar.gz", hash = "sha256:3fefcc72ec433aa1e5d32307a3e474bbb67f405be814ea52a2166bfc9dbe68cc"}, +] + +[package.dependencies] +absl-py = "*" +contextlib2 = "*" +PyYAML = "*" +six = "*" + [[package]] name = "mpmath" version = "1.3.0" @@ -1241,13 +1279,13 @@ sympy = "*" [[package]] name = "packaging" -version = "23.2" +version = "24.0" description = "Core utilities for Python packages" optional = false python-versions = ">=3.7" files = [ - {file = "packaging-23.2-py3-none-any.whl", hash = "sha256:8c491190033a9af7e1d931d0b5dacc2ef47509b34dd0de67ed209b5203fc88c7"}, - {file = "packaging-23.2.tar.gz", hash = "sha256:048fb0e9405036518eaaf48a55953c750c11e1a1b68e0dd1a9d62ed0c092cfc5"}, + {file = "packaging-24.0-py3-none-any.whl", hash = "sha256:2ddfb553fdf02fb784c234c7ba6ccc288296ceabec964ad2eae3777778130bc5"}, + {file = "packaging-24.0.tar.gz", hash = "sha256:eb82c5e3e56209074766e6885bb04b8c38a0c015d0a30036ebe7ece34c9989e9"}, ] [[package]] @@ -1571,6 +1609,26 @@ urllib3 = ">=1.21.1,<3" socks = ["PySocks (>=1.5.6,!=1.5.7)"] use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] +[[package]] +name = "resampy" +version = "0.4.3" +description = "Efficient signal resampling" +optional = false +python-versions = "*" +files = [ + {file = "resampy-0.4.3-py3-none-any.whl", hash = "sha256:ad2ed64516b140a122d96704e32bc0f92b23f45419e8b8f478e5a05f83edcebd"}, + {file = "resampy-0.4.3.tar.gz", hash = "sha256:a0d1c28398f0e55994b739650afef4e3974115edbe96cd4bb81968425e916e47"}, +] + +[package.dependencies] +numba = ">=0.53" +numpy = ">=1.17" + +[package.extras] +design = ["optuna (>=2.10.0)"] +docs = ["numpydoc", "sphinx (!=1.3.1)"] +tests = ["pytest (<8)", "pytest-cov", "scipy (>=1.1)"] + [[package]] name = "samplerate" version = "0.1.0" @@ -1964,4 +2022,4 @@ gpu = ["onnxruntime-gpu"] [metadata] lock-version = "2.0" python-versions = ">=3.9" -content-hash = "a82cdc2ee334a15ec0e67d681be9dece0842917834e900252a00acca120ecbcf" +content-hash = "60846f7b9c5d3909a982b949406ea9b9953ee132e3b77c20710acac20f503ebb" diff --git a/pyproject.toml b/pyproject.toml index 9581b93..6db1478 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "audio-separator" -version = "0.15.3" +version = 
"0.16.0" description = "Easy to use audio stem separation, using various models from UVR trained primarily by @Anjok07" authors = ["Andrew Beveridge "] license = "MIT" @@ -45,6 +45,8 @@ julius = ">=0.2" diffq = ">=0.2" einops = ">=0.7" pyyaml = "*" +ml_collections = "*" +resampy = ">=0.4" [tool.poetry.extras] cpu = ["onnxruntime"] diff --git a/tests/TODO.txt b/tests/TODO.txt new file mode 100644 index 0000000..6375b48 --- /dev/null +++ b/tests/TODO.txt @@ -0,0 +1,9 @@ +- Test running CLI with minimal input file for each major supported model (e.g. at least 1 from each architecture) outputs expected files +- Test running CLI with pre-warmed cache directory containing model files does not repeat download +- Test running CLI with corrupt model files in cache directory throws expected error +- Test loading the separation class rather than using the CLI works as expected with all major supported models +- Test processing multiple files in a row outputs separate expected files +- Test processing file with multiple different models outputs separate expected files +- Test each of the architecure specific parameters works as expected in both CLI and class mode +- Generate oscillogram and spectrogram of model output for a short test file for each major supported model and compare to expected output to ensure separation is actually separating stems +- Add a few different test files with different properties, e.g. background noise, stems present, or genre of music and ensure separation works as expected for each \ No newline at end of file