From ff2e7392292a09436f31d1e12a5e7cb1aa6519dc Mon Sep 17 00:00:00 2001 From: zhzhongshi <39423408+zhzhongshi@users.noreply.github.com> Date: Fri, 15 Mar 2024 09:48:29 +0800 Subject: [PATCH] Add support for MDXC models (#50) * Add support for MDXC models * Updated poetry lockfile to match dependencies * fix err: CLI does not work * Fixed MDXC config YAML download, formatted mdxc separator class, bumped version ready for release * Added progress bar for file downloads * Added error handling for failed model load due to incomplete/corrupt download * Fixed outstanding issues with YAML config loading and file download, added todo list for integration tests to write * Moved load model into own method for consistency with mdxc class * Refactored MDXC class to use more descriptive variable names, removed dead code, added debug logging and clearer parameters etc. * Fixed and tested pitch shift logic for MDXC, added CLI params for other MDXC config parameters and tested these * Added MDXC to readme * Added thanks! --------- Co-authored-by: Andrew Beveridge --- README.md | 28 +- .../separator/architectures/__init__.py | 1 + .../separator/architectures/mdx_separator.py | 35 ++- .../separator/architectures/mdxc_separator.py | 257 ++++++++++++++++++ audio_separator/separator/separator.py | 112 ++++---- .../separator/uvr_lib_v5/__init__.py | 0 audio_separator/utils/cli.py | 157 ++++++----- poetry.lock | 68 ++++- pyproject.toml | 4 +- tests/TODO.txt | 9 + 10 files changed, 528 insertions(+), 143 deletions(-) create mode 100644 audio_separator/separator/architectures/mdxc_separator.py create mode 100644 audio_separator/separator/uvr_lib_v5/__init__.py create mode 100644 tests/TODO.txt diff --git a/README.md b/README.md index c999caa..acd8c0d 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![Docker pulls](https://img.shields.io/docker/pulls/beveradb/audio-separator.svg)](https://hub.docker.com/r/beveradb/audio-separator/tags) [![codecov](https://codecov.io/gh/karaokenerds/python-audio-separator/graph/badge.svg?token=N7YK4ET5JP)](https://codecov.io/gh/karaokenerds/python-audio-separator) -Summary: Easy to use audio stem separation from the command line or as a dependency in your own Python project, using the amazing MDX-Net and VR Arch models available in UVR by @Anjok07 & @aufr33. +Summary: Easy to use audio stem separation from the command line or as a dependency in your own Python project, using the amazing MDX-Net, VR Arch, Demucs and MDXC models available in UVR by @Anjok07 & @aufr33. Audio Separator is a Python package that allows you to separate an audio file into various stems, using models trained by @Anjok07 for use with UVR (https://github.com/Anjok07/ultimatevocalremovergui). 
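For orientation, the Python API this patch extends can be exercised roughly as in the sketch below. The `mdxc_params` keys mirror the defaults added in this patch; the `.ckpt` model filename and input path are illustrative, and the `load_model()`/`separate()` call pattern assumes the package's existing two-step API.

```python
from audio_separator.separator import Separator

# Sketch only: pick a real MDXC model filename from `audio-separator --list_models`.
separator = Separator(mdxc_params={"segment_size": 256, "batch_size": 1, "overlap": 8})
separator.load_model(model_filename="MDX23C-8KFFT-InstVoc_HQ.ckpt")

# separate() returns the paths of the stem files it wrote.
output_files = separator.separate("audio.wav")
print(output_files)
```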
@@ -136,8 +136,9 @@ Any file listed in the list models output can be specified (with file extension)
 usage: audio-separator [-h] [-v] [-d] [-e] [-l] [--log_level LOG_LEVEL] [-m MODEL_FILENAME] [--output_format OUTPUT_FORMAT] [--output_dir OUTPUT_DIR] [--model_file_dir MODEL_FILE_DIR]
                        [--invert_spect] [--normalization NORMALIZATION] [--single_stem SINGLE_STEM] [--sample_rate SAMPLE_RATE] [--mdx_segment_size MDX_SEGMENT_SIZE] [--mdx_overlap MDX_OVERLAP]
                        [--mdx_batch_size MDX_BATCH_SIZE] [--mdx_hop_length MDX_HOP_LENGTH] [--mdx_enable_denoise] [--vr_batch_size VR_BATCH_SIZE] [--vr_window_size VR_WINDOW_SIZE] [--vr_aggression VR_AGGRESSION] [--vr_enable_tta]
-                       [--vr_high_end_process] [--vr_enable_post_process] [--vr_post_process_threshold VR_POST_PROCESS_THRESHOLD] [--demucs_stem DEMUCS_STEM] [--demucs_segment_size DEMUCS_SEGMENT_SIZE]
-                       [--demucs_shifts DEMUCS_SHIFTS] [--demucs_overlap DEMUCS_OVERLAP] [--demucs_segments_enabled DEMUCS_SEGMENTS_ENABLED]
+                       [--vr_high_end_process] [--vr_enable_post_process] [--vr_post_process_threshold VR_POST_PROCESS_THRESHOLD] [--demucs_segment_size DEMUCS_SEGMENT_SIZE] [--demucs_shifts DEMUCS_SHIFTS]
+                       [--demucs_overlap DEMUCS_OVERLAP] [--demucs_segments_enabled DEMUCS_SEGMENTS_ENABLED] [--mdxc_segment_size MDXC_SEGMENT_SIZE] [--mdxc_use_model_segment_size] [--mdxc_overlap MDXC_OVERLAP]
+                       [--mdxc_batch_size MDXC_BATCH_SIZE] [--mdxc_pitch_shift MDXC_PITCH_SHIFT]
                        [audio_file]
 
 Separate audio file into different stems.
@@ -149,11 +150,11 @@ options:
   -h, --help            show this help message and exit
 
 Info and Debugging:
-  -v, --version         show program's version number and exit
-  -d, --debug           enable debug logging, equivalent to --log_level=debug
-  -e, --env_info        print environment information and exit.
-  -l, --list_models     list all supported models and exit.
-  --log_level LOG_LEVEL log level, e.g. info, debug, warning (default: info)
+  -v, --version         Show the program's version number and exit.
+  -d, --debug           Enable debug logging, equivalent to --log_level=debug.
+  -e, --env_info        Print environment information and exit.
+  -l, --list_models     List all supported models and exit.
+  --log_level LOG_LEVEL Log level, e.g. info, debug, warning (default: info).
 
 Separation I/O Params:
   -m MODEL_FILENAME, --model_filename MODEL_FILENAME model to use for separation (default: UVR-MDX-NET-Inst_HQ_3.onnx). Example: -m 2_HP-UVR.pth
@@ -164,7 +165,7 @@ Separation I/O Params:
 Common Separation Parameters:
   --invert_spect        invert secondary stem using spectrogram (default: False). Example: --invert_spect
   --normalization NORMALIZATION max peak amplitude to normalize input and output audio to (default: 0.9). Example: --normalization=0.7
-  --single_stem SINGLE_STEM output only single stem, either instrumental or vocals. Example: --single_stem=instrumental
+  --single_stem SINGLE_STEM output only single stem, e.g. Instrumental, Vocals, Drums, Bass, Guitar, Piano, Other. Example: --single_stem=Instrumental
   --sample_rate SAMPLE_RATE modify the sample rate of the output audio (default: 44100). Example: --sample_rate=44100
 
 MDX Architecture Parameters:
@@ -184,11 +185,17 @@ VR Architecture Parameters:
   --vr_post_process_threshold VR_POST_PROCESS_THRESHOLD threshold for post_process feature: 0.1-0.3 (default: 0.2). Example: --vr_post_process_threshold=0.1
 
 Demucs Architecture Parameters:
-  --demucs_stem DEMUCS_STEM stem to extract from audio file, e.g. Vocals, Drums, Bass, Other (default: All Stems).
Example: --demucs_stem=vocals --demucs_segment_size DEMUCS_SEGMENT_SIZE size of segments into which the audio is split, 1-100. higher = slower but better quality (default: Default). Example: --demucs_segment_size=256 --demucs_shifts DEMUCS_SHIFTS number of predictions with random shifts, higher = slower but better quality (default: 2). Example: --demucs_shifts=4 --demucs_overlap DEMUCS_OVERLAP overlap between prediction windows, 0.001-0.999. higher = slower but better quality (default: 0.25). Example: --demucs_overlap=0.25 --demucs_segments_enabled DEMUCS_SEGMENTS_ENABLED enable segment-wise processing (default: True). Example: --demucs_segments_enabled=False + +MDXC Architecture Parameters: + --mdxc_segment_size MDXC_SEGMENT_SIZE larger consumes more resources, but may give better results (default: 256). Example: --mdxc_segment_size=256 + --mdxc_use_model_segment_size use model default segment size instead of the value from the config file. Example: --mdxc_use_model_segment_size + --mdxc_overlap MDXC_OVERLAP amount of overlap between prediction windows, 2-50. higher is better but slower (default: 8). Example: --mdxc_overlap=8 + --mdxc_batch_size MDXC_BATCH_SIZE larger consumes more RAM but may process slightly faster (default: 1). Example: --mdxc_batch_size=4 + --mdxc_pitch_shift MDXC_PITCH_SHIFT shift audio pitch by a number of semitones while processing. may improve output for deep/high vocals. (default: 0). Example: --mdxc_pitch_shift=2 ``` ### As a Dependency in a Python Project @@ -348,6 +355,7 @@ This project is licensed under the MIT [License](LICENSE). - [Kuielab & Woosung Choi](https://github.com/kuielab) - Developed the original MDX-Net AI code. - [KimberleyJSN](https://github.com/KimberleyJensen) - Advised and aided the implementation of the training scripts for MDX-Net and Demucs. Thank you! - [Hv](https://github.com/NaJeongMo/Colab-for-MDX_B) - Helped implement chunks into the MDX-Net AI code. Thank you! +- [zhzhongshi](https://github.com/zhzhongshi) - Helped add support for the MDXC models in `audio-separator`. Thank you! ## Contact 💌 diff --git a/audio_separator/separator/architectures/__init__.py b/audio_separator/separator/architectures/__init__.py index 9139bab..5d1de84 100644 --- a/audio_separator/separator/architectures/__init__.py +++ b/audio_separator/separator/architectures/__init__.py @@ -1,3 +1,4 @@ from .mdx_separator import MDXSeparator from .vr_separator import VRSeparator from .demucs_separator import DemucsSeparator +from .mdxc_separator import MDXCSeparator \ No newline at end of file diff --git a/audio_separator/separator/architectures/mdx_separator.py b/audio_separator/separator/architectures/mdx_separator.py index 4d2e687..e74ec48 100644 --- a/audio_separator/separator/architectures/mdx_separator.py +++ b/audio_separator/separator/architectures/mdx_separator.py @@ -90,8 +90,28 @@ def __init__(self, common_config, arch_config): # We haven't implemented support for the checkpoint models here, so we're not using it. 
# self.dim_c = 4 - # Loading the model for inference + self.load_model() + + self.n_bins = 0 + self.trim = 0 + self.chunk_size = 0 + self.gen_size = 0 + self.stft = None + + self.primary_source = None + self.secondary_source = None + self.audio_file_path = None + self.audio_file_base = None + self.secondary_source_map = None + self.primary_source_map = None + + def load_model(self): + """ + Load the model into memory from file on disk, initialize it with config from the model data, + and prepare for inferencing using hardware accelerated Torch device. + """ self.logger.debug("Loading ONNX model for inference...") + if self.segment_size == self.dim_t: ort_session_options = ort.SessionOptions() if self.log_level > 10: @@ -107,19 +127,6 @@ def __init__(self, common_config, arch_config): self.model_run.to(self.torch_device).eval() self.logger.warning("Model converted from onnx to pytorch due to segment size not matching dim_t, processing may be slower.") - self.n_bins = 0 - self.trim = 0 - self.chunk_size = 0 - self.gen_size = 0 - self.stft = None - - self.primary_source = None - self.secondary_source = None - self.audio_file_path = None - self.audio_file_base = None - self.secondary_source_map = None - self.primary_source_map = None - def separate(self, audio_file_path): """ Separates the audio file into primary and secondary sources based on the model's configuration. diff --git a/audio_separator/separator/architectures/mdxc_separator.py b/audio_separator/separator/architectures/mdxc_separator.py new file mode 100644 index 0000000..115b1fa --- /dev/null +++ b/audio_separator/separator/architectures/mdxc_separator.py @@ -0,0 +1,257 @@ +import os +import sys + +import torch +import numpy as np +from tqdm import tqdm +from ml_collections import ConfigDict + +from audio_separator.separator.common_separator import CommonSeparator +from audio_separator.separator.uvr_lib_v5.tfc_tdf_v3 import TFC_TDF_net +from audio_separator.separator.uvr_lib_v5 import spec_utils + + +class MDXCSeparator(CommonSeparator): + """ + MDXCSeparator is responsible for separating audio sources using MDXC models. + It initializes with configuration parameters and prepares the model for separation tasks. + """ + + def __init__(self, common_config, arch_config): + # Any configuration values which can be shared between architectures should be set already in CommonSeparator, + # e.g. user-specified functionality choices (self.output_single_stem) or common model parameters (self.primary_stem_name) + super().__init__(config=common_config) + + # Model data is basic overview metadata about the model, e.g. which stem is primary and whether it's a karaoke model + # It's loaded in from model_data_new.json in Separator.load_model and there are JSON examples in that method + # The instance variable self.model_data is passed through from Separator and set in CommonSeparator + self.logger.debug(f"Model data: {self.model_data}") + + # Arch Config is the MDXC architecture specific user configuration options, which should all be configurable by the user + # either by their Separator class instantiation or by passing in a CLI parameter. + # While there are similarities between architectures for some of these (e.g. batch_size), they are deliberately configured + # this way as they have architecture-specific default values. 
+ self.segment_size = arch_config.get("segment_size", 256) + self.use_model_segment_size = arch_config.get("use_model_segment_size", False) + + self.overlap = arch_config.get("overlap", 8) + self.batch_size = arch_config.get("batch_size", 1) + + # Amount of pitch shift to apply during processing (this does NOT affect the pitch of the output audio): + # • Whole numbers indicate semitones. + # • Using higher pitches may cut the upper bandwidth, even in high-quality models. + # • Upping the pitch can be better for tracks with deeper vocals. + # • Dropping the pitch may take more processing time but works well for tracks with high-pitched vocals. + self.pitch_shift = arch_config.get("pitch_shift", 0) + + self.logger.debug(f"MDXC arch params: batch_size={self.batch_size}, segment_size={self.segment_size}, overlap={self.overlap}") + self.logger.debug(f"MDXC arch params: use_model_segment_size={self.use_model_segment_size}, pitch_shift={self.pitch_shift}") + + self.load_model() + + self.primary_source = None + self.secondary_source = None + self.audio_file_path = None + self.audio_file_base = None + self.primary_source_map = None + self.secondary_source_map = None + + self.logger.info("MDXC Separator initialisation complete") + + def load_model(self): + """ + Load the model into memory from file on disk, initialize it with config from the model data, + and prepare for inferencing using hardware accelerated Torch device. + """ + self.logger.debug("Loading checkpoint model for inference...") + + self.model_data_cfgdict = ConfigDict(self.model_data) + + try: + self.model_run = TFC_TDF_net(self.model_data_cfgdict, device=self.torch_device) + self.model_run.load_state_dict(torch.load(self.model_path, map_location="cpu")) + self.model_run.to(self.torch_device).eval() + except RuntimeError as e: + self.logger.error(f"Error: {e}") + self.logger.error("An error occurred while loading the model file. This often occurs when the model file is corrupt or incomplete.") + self.logger.error(f"Please try deleting the model file from {self.model_path} and run audio-separator again to re-download it.") + sys.exit(1) + + def separate(self, audio_file_path): + """ + Separates the audio file into primary and secondary sources based on the model's configuration. + It processes the mix, demixes it into sources, normalizes the sources, and saves the output files. + + Args: + audio_file_path (str): The path to the audio file to be processed. + + Returns: + list: A list of paths to the output files generated by the separation process. 
+ """ + self.primary_source = None + self.secondary_source = None + + self.audio_file_path = audio_file_path + self.audio_file_base = os.path.splitext(os.path.basename(audio_file_path))[0] + + self.logger.debug("Preparing mix...") + mix = self.prepare_mix(self.audio_file_path) + + self.logger.debug("Normalizing mix before demixing...") + mix = spec_utils.normalize(wave=mix, max_peak=self.normalization_threshold) + + source = self.demix(mix=mix) + self.logger.debug("Demixing completed.") + + output_files = [] + self.logger.debug("Processing output files...") + + if not isinstance(self.primary_source, np.ndarray): + self.logger.debug("Normalizing primary source...") + self.primary_source = spec_utils.normalize(wave=source[self.primary_stem_name], max_peak=self.normalization_threshold).T + + if not isinstance(self.secondary_source, np.ndarray): + self.logger.debug("Normalizing secondary source...") + self.secondary_source = spec_utils.normalize(wave=source[self.secondary_stem_name], max_peak=self.normalization_threshold).T + + if not self.output_single_stem or self.output_single_stem.lower() == self.secondary_stem_name.lower(): + self.logger.info(f"Saving {self.secondary_stem_name} stem...") + if not self.secondary_stem_output_path: + self.secondary_stem_output_path = os.path.join(f"{self.audio_file_base}_({self.secondary_stem_name})_{self.model_name}.{self.output_format.lower()}") + self.secondary_source_map = self.final_process(self.secondary_stem_output_path, self.secondary_source, self.secondary_stem_name) + output_files.append(self.secondary_stem_output_path) + + if not self.output_single_stem or self.output_single_stem.lower() == self.primary_stem_name.lower(): + self.logger.info(f"Saving {self.primary_stem_name} stem...") + if not self.primary_stem_output_path: + self.primary_stem_output_path = os.path.join(f"{self.audio_file_base}_({self.primary_stem_name})_{self.model_name}.{self.output_format.lower()}") + if not isinstance(self.primary_source, np.ndarray): + self.primary_source = source.T + self.primary_source_map = self.final_process(self.primary_stem_output_path, self.primary_source, self.primary_stem_name) + output_files.append(self.primary_stem_output_path) + return output_files + + def pitch_fix(self, source, sr_pitched, org_mix): + """ + Change the pitch of the source audio by a number of semitones. + + Args: + source (np.ndarray): The source audio to be pitch-shifted. + sr_pitched (int): The sample rate of the pitch-shifted audio. + org_mix (np.ndarray): The original mix, used to match the shape of the pitch-shifted audio. + + Returns: + np.ndarray: The pitch-shifted source audio. + """ + source = spec_utils.change_pitch_semitones(source, sr_pitched, semitone_shift=self.pitch_shift)[0] + source = spec_utils.match_array_shapes(source, org_mix) + return source + + def demix(self, mix: np.ndarray) -> dict: + """ + Demixes the input mix into primary and secondary sources using the model and model data. + + Args: + mix (np.ndarray): The mix to be demixed. + Returns: + dict: A dictionary containing the demixed sources. 
+ """ + orig_mix = mix + + if self.pitch_shift != 0: + self.logger.debug(f"Shifting pitch by -{self.pitch_shift} semitones...") + mix, sample_rate = spec_utils.change_pitch_semitones(mix, self.sample_rate, semitone_shift=-self.pitch_shift) + + mix = torch.tensor(mix, dtype=torch.float32) + + try: + num_stems = self.model_run.num_target_instruments + except AttributeError: + num_stems = self.model_run.module.num_target_instruments + self.logger.debug(f"Number of stems: {num_stems}") + + if self.use_model_segment_size: + mdx_segment_size = self.model_data_cfgdict.inference.dim_t + self.logger.debug(f"Using model default segment size: {mdx_segment_size}") + else: + mdx_segment_size = self.segment_size + self.logger.debug(f"Using configured segment size: {mdx_segment_size}") + + chunk_size = self.model_data_cfgdict.audio.hop_length * (mdx_segment_size - 1) + self.logger.debug(f"Chunk size: {chunk_size}") + + hop_size = chunk_size // self.overlap + self.logger.debug(f"Hop size: {hop_size}") + + mix_shape = mix.shape[1] + pad_size = hop_size - (mix_shape - chunk_size) % hop_size + self.logger.debug(f"Pad size: {pad_size}") + + mix = torch.cat([torch.zeros(2, chunk_size - hop_size), mix, torch.zeros(2, pad_size + chunk_size - hop_size)], 1) + self.logger.debug(f"Mix shape: {mix.shape}") + + chunks = mix.unfold(1, chunk_size, hop_size).transpose(0, 1) + self.logger.debug(f"Chunks length: {len(chunks)} and shape: {chunks.shape}") + + batches = [chunks[i : i + self.batch_size] for i in range(0, len(chunks), self.batch_size)] + self.logger.debug(f"Batch size: {self.batch_size}, number of batches: {len(batches)}") + + # accumulated_outputs is used to accumulate the output from processing each batch of chunks through the model. + # It starts as a tensor of zeros and is updated in-place as the model processes each batch. + # The variable holds the combined result of all processed batches, which, after post-processing, represents the separated audio sources. + accumulated_outputs = torch.zeros(num_stems, *mix.shape) if num_stems > 1 else torch.zeros_like(mix) + accumulated_outputs = accumulated_outputs.to(self.torch_device) + + with torch.no_grad(): + count = 0 + for batch in tqdm(batches): + # Since the model processes the audio data in batches, single_batch_result temporarily holds the model's output + # for each batch before it is accumulated into accumulated_outputs. + single_batch_result = self.model_run(batch.to(self.torch_device)) + + # Each individual output tensor from the current batch's processing result. + # Since single_batch_result can contain multiple output tensors (one for each piece of audio in the batch), + # individual_output is used to iterate through these tensors and accumulate them into accumulated_outputs. + for individual_output in single_batch_result: + accumulated_outputs[..., count * hop_size : count * hop_size + chunk_size] += individual_output + count += 1 + + self.logger.debug("Calculating inferenced outputs based on accumulated outputs and overlap") + inferenced_outputs = accumulated_outputs[..., chunk_size - hop_size : -(pad_size + chunk_size - hop_size)] / self.overlap + self.logger.debug("Deleting accumulated outputs to free up memory") + del accumulated_outputs + + if num_stems > 1: + self.logger.debug("Number of stems is greater than 1, detaching individual sources and correcting pitch if necessary...") + + sources = {} + + # Iterates over each instrument specified in the model's configuration and its corresponding separated audio source. 
+            # self.model_data_cfgdict.training.instruments provides the list of stems.
+            # inferenced_outputs.cpu().detach().numpy() converts the separated sources tensor to a NumPy array for processing.
+            # Each iteration provides an instrument name ('key') and its separated audio ('value') for further processing.
+            for key, value in zip(self.model_data_cfgdict.training.instruments, inferenced_outputs.cpu().detach().numpy()):
+                self.logger.debug(f"Processing instrument: {key}")
+                if self.pitch_shift != 0:
+                    self.logger.debug(f"Applying pitch correction for {key}")
+                    sources[key] = self.pitch_fix(value, sample_rate, orig_mix)
+                else:
+                    sources[key] = value
+
+            self.logger.debug("Deleting inferenced outputs to free up memory")
+            del inferenced_outputs
+
+            self.logger.debug("Returning separated sources")
+            return sources
+
+        self.logger.debug("Detaching inferenced output for single instrument scenario")
+        inferenced_output = inferenced_outputs.cpu().detach().numpy()
+        self.logger.debug("Deleting inferenced outputs to free up memory")
+        del inferenced_outputs
+
+        if self.pitch_shift != 0:
+            self.logger.debug("Applying pitch correction for single instrument")
+            return self.pitch_fix(inferenced_output, sample_rate, orig_mix)
+
+        self.logger.debug("Returning inferenced output for single instrument")
+        return inferenced_output
diff --git a/audio_separator/separator/separator.py b/audio_separator/separator/separator.py
index 2fe9345..13e1db5 100644
--- a/audio_separator/separator/separator.py
+++ b/audio_separator/separator/separator.py
@@ -4,18 +4,17 @@
 import os
 import platform
 import subprocess
+import time
+import logging
+import warnings
 import hashlib
 import json
 import yaml
-import time
-import logging
-import warnings
 import requests
 import torch
 import onnxruntime as ort
-
-from audio_separator.separator.common_separator import CommonSeparator
+from tqdm import tqdm
 
 
 class Separator:
@@ -78,6 +77,7 @@ def __init__(
         mdx_params={"hop_length": 1024, "segment_size": 256, "overlap": 0.25, "batch_size": 1, "enable_denoise": False},
         vr_params={"batch_size": 16, "window_size": 512, "aggression": 5, "enable_tta": False, "enable_post_process": False, "post_process_threshold": 0.2, "high_end_process": False},
         demucs_params={"segment_size": "Default", "shifts": 2, "overlap": 0.25, "segments_enabled": True},
+        mdxc_params={"segment_size": 256, "batch_size": 1, "overlap": 8},
     ):
         self.logger = logging.getLogger(__name__)
         self.logger.setLevel(log_level)
@@ -133,7 +133,7 @@ def __init__(
 
         # These are parameters which users may want to configure so we expose them to the top-level Separator class,
         # even though they are specific to a single model architecture
-        self.arch_specific_params = {"MDX": mdx_params, "VR": vr_params, "Demucs": demucs_params}
+        self.arch_specific_params = {"MDX": mdx_params, "VR": vr_params, "Demucs": demucs_params, "MDXC": mdxc_params}
 
         self.torch_device = None
         self.torch_device_cpu = None
@@ -293,9 +293,14 @@
         response = requests.get(url, stream=True, timeout=300)
         if response.status_code == 200:
+            total_size_in_bytes = int(response.headers.get("content-length", 0))
+            progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
+
             with open(output_path, "wb") as f:
                 for chunk in response.iter_content(chunk_size=8192):
+                    progress_bar.update(len(chunk))
                     f.write(chunk)
+            progress_bar.close()
         else:
             raise RuntimeError(f"Failed to download file from {url}, response code: {response.status_code}")
 
@@ -358,8 +363,8 @@ def
list_supported_model_files(self): "VR": model_downloads_list["vr_download_list"], "MDX": model_downloads_list["mdx_download_list"], "Demucs": filtered_demucs_v4, + "MDXC": model_downloads_list["mdx23c_download_list"], # "MDX23": model_downloads_list["mdx23_download_list"], - # "MDX23C": model_downloads_list["mdx23c_download_list"], } return model_files_grouped_by_type @@ -372,6 +377,8 @@ def download_model_files(self, model_filename): supported_model_files_grouped = self.list_supported_model_files() model_repo_url_prefix = "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models" + yaml_config_filename = None + self.logger.debug(f"Searching for model_filename {model_filename} in supported_model_files_grouped") for model_type, model_list in supported_model_files_grouped.items(): for model_friendly_name, model_download_list in model_list.items(): @@ -382,7 +389,7 @@ def download_model_files(self, model_filename): self.download_file_if_not_exists(f"{model_repo_url_prefix}/{model_filename}", model_path) self.logger.debug(f"Returning path for single model file: {model_path}") - return model_type, model_friendly_name, model_path + return model_type, model_friendly_name, model_path, yaml_config_filename # If it's a dict, iterate through each entry check if any of them match model_filename # If the value is a full URL, download it from that URL. @@ -397,24 +404,51 @@ def download_model_files(self, model_filename): if this_model_matches_input_filename: self.logger.debug(f"Multi-file model identified: {model_friendly_name}, iterating through files to download") - for file_name, file_url in model_download_list.items(): - # Demucs models have full URLs to download from Facebook repos - if file_url.startswith("http"): - full_file_url = file_url - # Checkpoint models apparently use the key as the filename, the value is a filename of a YAML config file... - elif file_name.endswith(".ckpt"): - full_file_url = f"{model_repo_url_prefix}/{file_name}" - # MDX and VR models have the model filename as the value mapped to the model name - else: - full_file_url = f"{model_repo_url_prefix}/{file_url}" + for config_key, config_value in model_download_list.items(): + self.logger.debug(f"Attempting to identify download URL for config pair: {config_key} -> {config_value}") - self.download_file_if_not_exists(full_file_url, os.path.join(self.model_file_dir, file_name)) + # Demucs models have full URLs to download from Facebook repos, and config_key is set to the file name + if config_value.startswith("http"): + self.download_file_if_not_exists(config_value, os.path.join(self.model_file_dir, config_key)) + + # Checkpoint models apparently use config_key as the model filename, but the value is a YAML config file name... + # Both need to be downloaded, but the model data YAML file actually comes from the application data repo... 
+ elif config_key.endswith(".ckpt"): + download_url = f"{model_repo_url_prefix}/{config_key}" + self.download_file_if_not_exists(download_url, os.path.join(self.model_file_dir, config_key)) + + # For MDXC models, the config_value is the YAML file which needs to be downloaded separately from the application_data repo + yaml_config_filename = config_value + yaml_config_filepath = os.path.join(self.model_file_dir, yaml_config_filename) + + # Repo for model data and configuration sources from UVR + model_data_url_prefix = "https://raw.githubusercontent.com/TRvlvr/application_data/main" + yaml_config_url = f"{model_data_url_prefix}/mdx_model_data/mdx_c_configs/{yaml_config_filename}" + + self.download_file_if_not_exists(f"{yaml_config_url}", yaml_config_filepath) + + # MDX and VR models have config_value set to the model filename + else: + download_url = f"{model_repo_url_prefix}/{config_value}" + self.download_file_if_not_exists(download_url, os.path.join(self.model_file_dir, config_value)) self.logger.debug(f"All files downloaded for model {model_friendly_name}, returning initial path {model_path}") - return model_type, model_friendly_name, model_path + return model_type, model_friendly_name, model_path, yaml_config_filename raise ValueError(f"Model file {model_filename} not found in supported model files") + def load_model_data_from_yaml(self, yaml_config_filename): + """ + This method loads model-specific parameters from the YAML file for that model. + The parameters in the YAML are critical to inferencing, as they need to match whatever was used during training. + """ + model_data_yaml_filepath = os.path.join(self.model_file_dir, yaml_config_filename) + self.logger.debug(f"Loading model data from YAML at path {model_data_yaml_filepath}") + + model_data = yaml.load(open(model_data_yaml_filepath, encoding="utf-8"), Loader=yaml.FullLoader) + self.logger.debug(f"Model data loaded from YAML file: {model_data}") + return model_data + def load_model_data_using_hash(self, model_path): """ This method loads model-specific parameters from UVR model data files. 
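To make the new YAML path concrete: the keys that `MDXCSeparator` actually reads from the loaded config are `audio.hop_length`, `inference.dim_t` and `training.instruments` (see `demix` above). A minimal sketch of that access pattern, with illustrative values rather than any real model's config:

```python
from ml_collections import ConfigDict

# Illustrative values only -- the real ones come from the downloaded YAML config.
model_data = {
    "audio": {"hop_length": 1024},
    "inference": {"dim_t": 256},
    "training": {"instruments": ["Vocals", "Instrumental"]},
}
cfg = ConfigDict(model_data)

# Mirrors the chunking arithmetic in MDXCSeparator.demix:
segment_size = cfg.inference.dim_t                       # model default segment size
chunk_size = cfg.audio.hop_length * (segment_size - 1)   # 1024 * 255 = 261120 samples
hop_size = chunk_size // 8                               # with the default overlap of 8
```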
@@ -426,7 +460,6 @@ def load_model_data_using_hash(self, model_path): vr_model_data_url = f"{model_data_url_prefix}/vr_model_data/model_data_new.json" mdx_model_data_url = f"{model_data_url_prefix}/mdx_model_data/model_data_new.json" - mdx_c_configs_url_prefix = f"{model_data_url_prefix}/mdx_model_data/mdx_c_configs" # Calculate hash for the downloaded model self.logger.debug("Calculating MD5 hash for model file to identify model parameters from UVR data...") @@ -527,35 +560,16 @@ def load_model_data_using_hash(self, model_path): elif model_hash in vr_model_data_object: model_data = vr_model_data_object[model_hash] else: - raise ValueError(f"Unsupported Model File: parameters for MD5 hash {model_hash} could not be found in the UVR model data file.") + raise ValueError(f"Unsupported Model File: parameters for MD5 hash {model_hash} could not be found in UVR model data file for MDX or VR arch.") self.logger.debug(f"Model data loaded from UVR JSON using hash {model_hash}: {model_data}") - # MDX_C model data is actually stored in a YAML file which has to be downloaded separately - if "config_yaml" in model_data: - self.logger.debug(f"Model data had config_yaml key, fetching actual model data YAML from MDX_C config URL") - - mdxc_model_data_path = os.path.join(self.model_file_dir, model_data["config_yaml"]) - self.logger.debug(f"MDX_C model data path set to {mdxc_model_data_path}") - self.download_file_if_not_exists(f"{mdx_c_configs_url_prefix}/{model_data['config_yaml']}", mdxc_model_data_path) - - model_data = self.load_model_data_from_yaml(mdxc_model_data_path) - - return model_data - - def load_model_data_from_yaml(self, model_path): - """ - This method loads model-specific parameters from the YAML file for that model. - The parameters in the YAML are critical to inferencing, as they need to match whatever was used during training. - """ - model_data = yaml.load(open(model_path, encoding="utf-8"), Loader=yaml.FullLoader) - - self.logger.debug(f"Model data loaded from YAML file: {model_data}") return model_data def load_model(self, model_filename="UVR-MDX-NET-Inst_HQ_3.onnx"): """ - This method loads the separation model into memory, downloading it first if necessary. + This method instantiates the architecture-specific separation class, + loading the separation model into memory, downloading it first if necessary. 
""" self.logger.info(f"Loading model {model_filename}...") @@ -563,11 +577,14 @@ def load_model(self, model_filename="UVR-MDX-NET-Inst_HQ_3.onnx"): # Setting up the model path model_name = model_filename.split(".")[0] - model_type, model_friendly_name, model_path = self.download_model_files(model_filename) - self.logger.debug(f"Model downloaded, friendly name: {model_friendly_name}") + model_type, model_friendly_name, model_path, yaml_config_filename = self.download_model_files(model_filename) + self.logger.debug(f"Model downloaded, friendly name: {model_friendly_name}, model_path: {model_path}") if model_path.lower().endswith(".yaml"): - model_data = self.load_model_data_from_yaml(model_path) + yaml_config_filename = model_path + + if yaml_config_filename is not None: + model_data = self.load_model_data_from_yaml(yaml_config_filename) else: model_data = self.load_model_data_using_hash(model_path) @@ -595,12 +612,13 @@ def load_model(self, model_filename="UVR-MDX-NET-Inst_HQ_3.onnx"): raise ValueError(f"Model type not supported (yet): {model_type}") # Instantiate the appropriate separator class depending on the model type - separator_classes = {"MDX": "MDXSeparator", "VR": "VRSeparator", "Demucs": "DemucsSeparator"} + separator_classes = {"MDX": "MDXSeparator", "VR": "VRSeparator", "Demucs": "DemucsSeparator", "MDXC": "MDXCSeparator"} if model_type not in separator_classes: raise ValueError(f"Model type not supported (yet): {model_type}") module = __import__("audio_separator.separator.architectures", fromlist=[separator_classes[model_type]]) + separator_class = getattr(module, separator_classes[model_type]) self.model_instance = separator_class(common_config=common_params, arch_config=self.arch_specific_params[model_type]) diff --git a/audio_separator/separator/uvr_lib_v5/__init__.py b/audio_separator/separator/uvr_lib_v5/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/audio_separator/utils/cli.py b/audio_separator/utils/cli.py index 3600e8e..23afd17 100755 --- a/audio_separator/utils/cli.py +++ b/audio_separator/utils/cli.py @@ -2,8 +2,11 @@ import argparse import logging import json +import sys from importlib import metadata +from audio_separator.separator import Separator + def main(): """Main entry point for the CLI.""" @@ -20,69 +23,96 @@ def main(): package_version = metadata.distribution("audio-separator").version + version_help = "Show the program's version number and exit." + debug_help = "Enable debug logging, equivalent to --log_level=debug." + env_info_help = "Print environment information and exit." + list_models_help = "List all supported models and exit." + log_level_help = "Log level, e.g. info, debug, warning (default: %(default)s)." + info_params = parser.add_argument_group("Info and Debugging") - info_params.add_argument("-v", "--version", action="version", version=f"%(prog)s {package_version}") - info_params.add_argument("-d", "--debug", action="store_true", help="enable debug logging, equivalent to --log_level=debug") - info_params.add_argument("-e", "--env_info", action="store_true", help="print environment information and exit.") - info_params.add_argument("-l", "--list_models", action="store_true", help="list all supported models and exit.") - info_params.add_argument("--log_level", default="info", help="log level, e.g. 
info, debug, warning (default: %(default)s)")
+    info_params.add_argument("-v", "--version", action="version", version=f"%(prog)s {package_version}", help=version_help)
+    info_params.add_argument("-d", "--debug", action="store_true", help=debug_help)
+    info_params.add_argument("-e", "--env_info", action="store_true", help=env_info_help)
+    info_params.add_argument("-l", "--list_models", action="store_true", help=list_models_help)
+    info_params.add_argument("--log_level", default="info", help=log_level_help)
+
+    model_filename_help = "model to use for separation (default: %(default)s). Example: -m 2_HP-UVR.pth"
+    output_format_help = "output format for separated files, any common format (default: %(default)s). Example: --output_format=MP3"
+    output_dir_help = "directory to write output files (default: <current dir>). Example: --output_dir=/app/separated"
+    model_file_dir_help = "model files directory (default: %(default)s). Example: --model_file_dir=/app/models"
 
     io_params = parser.add_argument_group("Separation I/O Params")
-    io_params.add_argument("-m", "--model_filename", default="UVR-MDX-NET-Inst_HQ_3.onnx", help="model to use for separation (default: %(default)s). Example: -m 2_HP-UVR.pth")
-    io_params.add_argument("--output_format", default="FLAC", help="output format for separated files, any common format (default: %(default)s). Example: --output_format=MP3")
-    io_params.add_argument("--output_dir", default=None, help="directory to write output files (default: <current dir>). Example: --output_dir=/app/separated")
-    io_params.add_argument("--model_file_dir", default="/tmp/audio-separator-models/", help="model files directory (default: %(default)s). Example: --model_file_dir=/app/models")
+    io_params.add_argument("-m", "--model_filename", default="UVR-MDX-NET-Inst_HQ_3.onnx", help=model_filename_help)
+    io_params.add_argument("--output_format", default="FLAC", help=output_format_help)
+    io_params.add_argument("--output_dir", default=None, help=output_dir_help)
+    io_params.add_argument("--model_file_dir", default="/tmp/audio-separator-models/", help=model_file_dir_help)
+
+    invert_spect_help = "invert secondary stem using spectrogram (default: %(default)s). Example: --invert_spect"
+    normalization_help = "max peak amplitude to normalize input and output audio to (default: %(default)s). Example: --normalization=0.7"
+    single_stem_help = "output only single stem, e.g. Instrumental, Vocals, Drums, Bass, Guitar, Piano, Other. Example: --single_stem=Instrumental"
+    sample_rate_help = "modify the sample rate of the output audio (default: %(default)s). Example: --sample_rate=44100"
 
     common_params = parser.add_argument_group("Common Separation Parameters")
-    common_params.add_argument("--invert_spect", action="store_true", help="invert secondary stem using spectogram (default: %(default)s). Example: --invert_spect")
-    common_params.add_argument("--normalization", type=float, default=0.9, help="max peak amplitude to normalize input and output audio to (default: %(default)s). Example: --normalization=0.7")
-    common_params.add_argument("--single_stem", default=None, help="output only single stem, e.g. Instrumental, Vocals, Drums, Bass, Guitar, Piano, Other. Example: --single_stem=Instrumental")
-    common_params.add_argument("--sample_rate", type=int, default=44100, help="modify the sample rate of the output audio (default: %(default)s).
Example: --sample_rate=44100") + common_params.add_argument("--invert_spect", action="store_true", help=invert_spect_help) + common_params.add_argument("--normalization", type=float, default=0.9, help=normalization_help) + common_params.add_argument("--single_stem", default=None, help=single_stem_help) + common_params.add_argument("--sample_rate", type=int, default=44100, help=sample_rate_help) + + mdx_segment_size_help = "larger consumes more resources, but may give better results (default: %(default)s). Example: --mdx_segment_size=256" + mdx_overlap_help = "amount of overlap between prediction windows, 0.001-0.999. higher is better but slower (default: %(default)s). Example: --mdx_overlap=0.25" + mdx_batch_size_help = "larger consumes more RAM but may process slightly faster (default: %(default)s). Example: --mdx_batch_size=4" + mdx_hop_length_help = "usually called stride in neural networks, only change if you know what you're doing (default: %(default)s). Example: --mdx_hop_length=1024" + mdx_enable_denoise_help = "enable denoising during separation (default: %(default)s). Example: --mdx_enable_denoise" mdx_params = parser.add_argument_group("MDX Architecture Parameters") - mdx_params.add_argument("--mdx_segment_size", type=int, default=256, help="larger consumes more resources, but may give better results (default: %(default)s). Example: --mdx_segment_size=256") - mdx_params.add_argument( - "--mdx_overlap", type=float, default=0.25, help="amount of overlap between prediction windows, 0.001-0.999. higher is better but slower (default: %(default)s). Example: --mdx_overlap=0.25" - ) - mdx_params.add_argument("--mdx_batch_size", type=int, default=1, help="larger consumes more RAM but may process slightly faster (default: %(default)s). Example: --mdx_batch_size=4") - mdx_params.add_argument( - "--mdx_hop_length", type=int, default=1024, help="usually called stride in neural networks, only change if you know what you're doing (default: %(default)s). Example: --mdx_hop_length=1024" - ) - mdx_params.add_argument("--mdx_enable_denoise", action="store_true", help="enable denoising during separation (default: %(default)s). Example: --mdx_enable_denoise") + mdx_params.add_argument("--mdx_segment_size", type=int, default=256, help=mdx_segment_size_help) + mdx_params.add_argument("--mdx_overlap", type=float, default=0.25, help=mdx_overlap_help) + mdx_params.add_argument("--mdx_batch_size", type=int, default=1, help=mdx_batch_size_help) + mdx_params.add_argument("--mdx_hop_length", type=int, default=1024, help=mdx_hop_length_help) + mdx_params.add_argument("--mdx_enable_denoise", action="store_true", help=mdx_enable_denoise_help) + + vr_batch_size_help = "number of batches to process at a time. higher = more RAM, slightly faster processing (default: %(default)s). Example: --vr_batch_size=16" + vr_window_size_help = "balance quality and speed. 1024 = fast but lower, 320 = slower but better quality. (default: %(default)s). Example: --vr_window_size=320" + vr_aggression_help = "intensity of primary stem extraction, -100 - 100. typically 5 for vocals & instrumentals (default: %(default)s). Example: --vr_aggression=2" + vr_enable_tta_help = "enable Test-Time-Augmentation; slow but improves quality (default: %(default)s). Example: --vr_enable_tta" + vr_high_end_process_help = "mirror the missing frequency range of the output (default: %(default)s). 
Example: --vr_high_end_process" + vr_enable_post_process_help = "identify leftover artifacts within vocal output; may improve separation for some songs (default: %(default)s). Example: --vr_enable_post_process" + vr_post_process_threshold_help = "threshold for post_process feature: 0.1-0.3 (default: %(default)s). Example: --vr_post_process_threshold=0.1" vr_params = parser.add_argument_group("VR Architecture Parameters") - vr_params.add_argument( - "--vr_batch_size", type=int, default=4, help="number of batches to process at a time. higher = more RAM, slightly faster processing (default: %(default)s). Example: --vr_batch_size=16" - ) - vr_params.add_argument( - "--vr_window_size", type=int, default=512, help="balance quality and speed. 1024 = fast but lower, 320 = slower but better quality. (default: %(default)s). Example: --vr_window_size=320" - ) - vr_params.add_argument( - "--vr_aggression", type=int, default=5, help="intensity of primary stem extraction, -100 - 100. typically 5 for vocals & instrumentals (default: %(default)s). Example: --vr_aggression=2" - ) - vr_params.add_argument("--vr_enable_tta", action="store_true", help="enable Test-Time-Augmentation; slow but improves quality (default: %(default)s). Example: --vr_enable_tta") - vr_params.add_argument("--vr_high_end_process", action="store_true", help="mirror the missing frequency range of the output (default: %(default)s). Example: --vr_high_end_process") - vr_params.add_argument( - "--vr_enable_post_process", - action="store_true", - help="identify leftover artifacts within vocal output; may improve separation for some songs (default: %(default)s). Example: --vr_enable_post_process", - ) - vr_params.add_argument("--vr_post_process_threshold", type=float, default=0.2, help="threshold for post_process feature: 0.1-0.3 (default: %(default)s). Example: --vr_post_process_threshold=0.1") + vr_params.add_argument("--vr_batch_size", type=int, default=4, help=vr_batch_size_help) + vr_params.add_argument("--vr_window_size", type=int, default=512, help=vr_window_size_help) + vr_params.add_argument("--vr_aggression", type=int, default=5, help=vr_aggression_help) + vr_params.add_argument("--vr_enable_tta", action="store_true", help=vr_enable_tta_help) + vr_params.add_argument("--vr_high_end_process", action="store_true", help=vr_high_end_process_help) + vr_params.add_argument("--vr_enable_post_process", action="store_true", help=vr_enable_post_process_help) + vr_params.add_argument("--vr_post_process_threshold", type=float, default=0.2, help=vr_post_process_threshold_help) + + demucs_segment_size_help = "size of segments into which the audio is split, 1-100. higher = slower but better quality (default: %(default)s). Example: --demucs_segment_size=256" + demucs_shifts_help = "number of predictions with random shifts, higher = slower but better quality (default: %(default)s). Example: --demucs_shifts=4" + demucs_overlap_help = "overlap between prediction windows, 0.001-0.999. higher = slower but better quality (default: %(default)s). Example: --demucs_overlap=0.25" + demucs_segments_enabled_help = "enable segment-wise processing (default: %(default)s). Example: --demucs_segments_enabled=False" demucs_params = parser.add_argument_group("Demucs Architecture Parameters") - demucs_params.add_argument( - "--demucs_segment_size", - type=str, - default="Default", - help="size of segments into which the audio is split, 1-100. higher = slower but better quality (default: %(default)s). 
Example: --demucs_segment_size=256", - ) - demucs_params.add_argument( - "--demucs_shifts", type=int, default=2, help="number of predictions with random shifts, higher = slower but better quality (default: %(default)s). Example: --demucs_shifts=4" - ) - demucs_params.add_argument( - "--demucs_overlap", type=float, default=0.25, help="overlap between prediction windows, 0.001-0.999. higher = slower but better quality (default: %(default)s). Example: --demucs_overlap=0.25" + demucs_params.add_argument("--demucs_segment_size", type=str, default="Default", help=demucs_segment_size_help) + demucs_params.add_argument("--demucs_shifts", type=int, default=2, help=demucs_shifts_help) + demucs_params.add_argument("--demucs_overlap", type=float, default=0.25, help=demucs_overlap_help) + demucs_params.add_argument("--demucs_segments_enabled", type=bool, default=True, help=demucs_segments_enabled_help) + + mdxc_segment_size_help = "larger consumes more resources, but may give better results (default: %(default)s). Example: --mdxc_segment_size=256" + mdxc_use_model_segment_size_help = "use model default segment size instead of the value from the config file. Example: --mdxc_use_model_segment_size" + mdxc_overlap_help = "amount of overlap between prediction windows, 2-50. higher is better but slower (default: %(default)s). Example: --mdxc_overlap=8" + mdxc_batch_size_help = "larger consumes more RAM but may process slightly faster (default: %(default)s). Example: --mdxc_batch_size=4" + mdxc_pitch_shift_help = ( + "shift audio pitch by a number of semitones while processing. may improve output for deep/high vocals. (default: %(default)s). Example: --mdxc_pitch_shift=2" ) - demucs_params.add_argument("--demucs_segments_enabled", type=bool, default=True, help="enable segment-wise processing (default: %(default)s). 
Example: --demucs_segments_enabled=False") + + mdxc_params = parser.add_argument_group("MDXC Architecture Parameters") + mdxc_params.add_argument("--mdxc_segment_size", type=int, default=256, help=mdxc_segment_size_help) + mdxc_params.add_argument("--mdxc_use_model_segment_size", action="store_true", help=mdxc_use_model_segment_size_help) + mdxc_params.add_argument("--mdxc_overlap", type=int, default=8, help=mdxc_overlap_help) + mdxc_params.add_argument("--mdxc_batch_size", type=int, default=1, help=mdxc_batch_size_help) + mdxc_params.add_argument("--mdxc_pitch_shift", type=int, default=0, help=mdxc_pitch_shift_help) args = parser.parse_args() @@ -94,27 +124,20 @@ def main(): logger.setLevel(log_level) if args.env_info: - from audio_separator.separator import Separator - separator = Separator() - exit(0) + sys.exit(0) if args.list_models: - from audio_separator.separator import Separator - separator = Separator() print(json.dumps(separator.list_supported_model_files(), indent=4, sort_keys=True)) - exit(0) + sys.exit(0) if not hasattr(args, "audio_file"): parser.print_help() - exit(1) + sys.exit(1) logger.info(f"Separator version {package_version} beginning with input file: {args.audio_file}") - # Deliberately import here to avoid loading slow dependencies when just running --help - from audio_separator.separator import Separator - separator = Separator( log_formatter=log_formatter, log_level=log_level, @@ -141,11 +164,13 @@ def main(): "post_process_threshold": args.vr_post_process_threshold, "high_end_process": args.vr_high_end_process, }, - demucs_params={ - "segment_size": args.demucs_segment_size, - "shifts": args.demucs_shifts, - "overlap": args.demucs_overlap, - "segments_enabled": args.demucs_segments_enabled, + demucs_params={"segment_size": args.demucs_segment_size, "shifts": args.demucs_shifts, "overlap": args.demucs_overlap, "segments_enabled": args.demucs_segments_enabled}, + mdxc_params={ + "segment_size": args.mdxc_segment_size, + "batch_size": args.mdxc_batch_size, + "overlap": args.mdxc_overlap, + "use_model_segment_size": args.mdxc_use_model_segment_size, + "pitch_shift": args.mdxc_pitch_shift, }, ) diff --git a/poetry.lock b/poetry.lock index dd0aec5..89ae82a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,15 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. + +[[package]] +name = "absl-py" +version = "2.1.0" +description = "Abseil Python Common Libraries, see https://github.com/abseil/abseil-py." 
+optional = false +python-versions = ">=3.7" +files = [ + {file = "absl-py-2.1.0.tar.gz", hash = "sha256:7820790efbb316739cde8b4e19357243fc3608a152024288513dd968d7d959ff"}, + {file = "absl_py-2.1.0-py3-none-any.whl", hash = "sha256:526a04eadab8b4ee719ce68f204172ead1027549089702d99b9059f129ff1308"}, +] [[package]] name = "audioread" @@ -276,6 +287,17 @@ humanfriendly = ">=9.1" [package.extras] cron = ["capturer (>=2.4)"] +[[package]] +name = "contextlib2" +version = "21.6.0" +description = "Backports and enhancements for the contextlib module" +optional = false +python-versions = ">=3.6" +files = [ + {file = "contextlib2-21.6.0-py2.py3-none-any.whl", hash = "sha256:3fbdb64466afd23abaf6c977627b75b6139a5a3e8ce38405c5b413aed7a0471f"}, + {file = "contextlib2-21.6.0.tar.gz", hash = "sha256:ab1e2bfe1d01d968e1b7e8d9023bc51ef3509bba217bb730cee3827e1ee82869"}, +] + [[package]] name = "coverage" version = "7.4.3" @@ -777,6 +799,22 @@ files = [ {file = "MarkupSafe-2.1.5.tar.gz", hash = "sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b"}, ] +[[package]] +name = "ml-collections" +version = "0.1.1" +description = "ML Collections is a library of Python collections designed for ML usecases." +optional = false +python-versions = ">=2.6" +files = [ + {file = "ml_collections-0.1.1.tar.gz", hash = "sha256:3fefcc72ec433aa1e5d32307a3e474bbb67f405be814ea52a2166bfc9dbe68cc"}, +] + +[package.dependencies] +absl-py = "*" +contextlib2 = "*" +PyYAML = "*" +six = "*" + [[package]] name = "mpmath" version = "1.3.0" @@ -1241,13 +1279,13 @@ sympy = "*" [[package]] name = "packaging" -version = "23.2" +version = "24.0" description = "Core utilities for Python packages" optional = false python-versions = ">=3.7" files = [ - {file = "packaging-23.2-py3-none-any.whl", hash = "sha256:8c491190033a9af7e1d931d0b5dacc2ef47509b34dd0de67ed209b5203fc88c7"}, - {file = "packaging-23.2.tar.gz", hash = "sha256:048fb0e9405036518eaaf48a55953c750c11e1a1b68e0dd1a9d62ed0c092cfc5"}, + {file = "packaging-24.0-py3-none-any.whl", hash = "sha256:2ddfb553fdf02fb784c234c7ba6ccc288296ceabec964ad2eae3777778130bc5"}, + {file = "packaging-24.0.tar.gz", hash = "sha256:eb82c5e3e56209074766e6885bb04b8c38a0c015d0a30036ebe7ece34c9989e9"}, ] [[package]] @@ -1571,6 +1609,26 @@ urllib3 = ">=1.21.1,<3" socks = ["PySocks (>=1.5.6,!=1.5.7)"] use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] +[[package]] +name = "resampy" +version = "0.4.3" +description = "Efficient signal resampling" +optional = false +python-versions = "*" +files = [ + {file = "resampy-0.4.3-py3-none-any.whl", hash = "sha256:ad2ed64516b140a122d96704e32bc0f92b23f45419e8b8f478e5a05f83edcebd"}, + {file = "resampy-0.4.3.tar.gz", hash = "sha256:a0d1c28398f0e55994b739650afef4e3974115edbe96cd4bb81968425e916e47"}, +] + +[package.dependencies] +numba = ">=0.53" +numpy = ">=1.17" + +[package.extras] +design = ["optuna (>=2.10.0)"] +docs = ["numpydoc", "sphinx (!=1.3.1)"] +tests = ["pytest (<8)", "pytest-cov", "scipy (>=1.1)"] + [[package]] name = "samplerate" version = "0.1.0" @@ -1964,4 +2022,4 @@ gpu = ["onnxruntime-gpu"] [metadata] lock-version = "2.0" python-versions = ">=3.9" -content-hash = "a82cdc2ee334a15ec0e67d681be9dece0842917834e900252a00acca120ecbcf" +content-hash = "60846f7b9c5d3909a982b949406ea9b9953ee132e3b77c20710acac20f503ebb" diff --git a/pyproject.toml b/pyproject.toml index 9581b93..6db1478 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "audio-separator" -version = "0.15.3" +version = 
"0.16.0" description = "Easy to use audio stem separation, using various models from UVR trained primarily by @Anjok07" authors = ["Andrew Beveridge "] license = "MIT" @@ -45,6 +45,8 @@ julius = ">=0.2" diffq = ">=0.2" einops = ">=0.7" pyyaml = "*" +ml_collections = "*" +resampy = ">=0.4" [tool.poetry.extras] cpu = ["onnxruntime"] diff --git a/tests/TODO.txt b/tests/TODO.txt new file mode 100644 index 0000000..6375b48 --- /dev/null +++ b/tests/TODO.txt @@ -0,0 +1,9 @@ +- Test running CLI with minimal input file for each major supported model (e.g. at least 1 from each architecture) outputs expected files +- Test running CLI with pre-warmed cache directory containing model files does not repeat download +- Test running CLI with corrupt model files in cache directory throws expected error +- Test loading the separation class rather than using the CLI works as expected with all major supported models +- Test processing multiple files in a row outputs separate expected files +- Test processing file with multiple different models outputs separate expected files +- Test each of the architecure specific parameters works as expected in both CLI and class mode +- Generate oscillogram and spectrogram of model output for a short test file for each major supported model and compare to expected output to ensure separation is actually separating stems +- Add a few different test files with different properties, e.g. background noise, stems present, or genre of music and ensure separation works as expected for each \ No newline at end of file