diff --git a/.gitignore b/.gitignore index 10d6a9f..23d1136 100644 --- a/.gitignore +++ b/.gitignore @@ -6,7 +6,8 @@ /tracks/ /lyrics/ /.cache/ -/models/*.onnx +*.onnx +*.pth *.wav *.flac *.mp3 diff --git a/README.md b/README.md index 10759fd..92ffd88 100644 --- a/README.md +++ b/README.md @@ -168,10 +168,10 @@ separator = Separator() separator.load_model() # Perform the separation on specific audio files without reloading the model -primary_stem_path, secondary_stem_path = separator.separate('audio1.wav') +primary_stem_output_path, secondary_stem_output_path = separator.separate('audio1.wav') -print(f'Primary stem saved at {primary_stem_path}') -print(f'Secondary stem saved at {secondary_stem_path}') +print(f'Primary stem saved at {primary_stem_output_path}') +print(f'Secondary stem saved at {secondary_stem_output_path}') ``` #### Batch processing, or processing with multiple models @@ -212,7 +212,7 @@ output_file_paths_6 = separator.separate('audio3.wav') - model_file_dir: (Optional) Directory to cache model files in. Default: /tmp/audio-separator-models/ - output_dir: (Optional) Directory where the separated files will be saved. If not specified, outputs to current dir. - output_format: (Optional) Format to encode output files, any common format (WAV, MP3, FLAC, M4A, etc.). Default: WAV -- denoise_enabled: (Optional) Flag to enable or disable denoising as part of the separation process. Default: True +- enable_denoise: (Optional) Flag to enable or disable denoising as part of the separation process. Default: True - normalization_enabled: (Optional) Flag to enable or disable normalization as part of the separation process. Default: False - output_single_stem: (Optional) Output only single stem, either instrumental or vocals. - invert_secondary_stem_using_spectogram=True, diff --git a/audio_separator/separator/architectures/__init__.py b/audio_separator/separator/architectures/__init__.py new file mode 100644 index 0000000..b76cc7d --- /dev/null +++ b/audio_separator/separator/architectures/__init__.py @@ -0,0 +1,3 @@ +from .mdx_separator import MDXSeparator +from .vr_separator import VRSeparator + diff --git a/audio_separator/separator/architectures/mdx_separator.py b/audio_separator/separator/architectures/mdx_separator.py new file mode 100644 index 0000000..a8acc41 --- /dev/null +++ b/audio_separator/separator/architectures/mdx_separator.py @@ -0,0 +1,426 @@ +"""Module for separating audio sources using MDX architecture models.""" + +import os +import torch +import librosa +import onnxruntime as ort +import numpy as np +import onnx2torch +from tqdm import tqdm +from audio_separator.separator.uvr_lib_v5 import spec_utils +from audio_separator.separator.uvr_lib_v5.stft import STFT +from audio_separator.separator.common_separator import CommonSeparator + + +class MDXSeparator(CommonSeparator): + """ + MDXSeparator is responsible for separating audio sources using MDX models. + It initializes with configuration parameters and prepares the model for separation tasks. 
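+    MDX-specific options (segment_size, overlap, batch_size and hop_length) are read from arch_config,
+    while settings shared across architectures are handled by CommonSeparator.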
+ """ + + def __init__(self, common_config, arch_config): + super().__init__(config=common_config) + + self.hop_length = arch_config.get("hop_length") + self.segment_size = arch_config.get("segment_size") + self.overlap = arch_config.get("overlap") + + # Initializing model parameters + self.compensate = self.model_data["compensate"] + self.dim_f = self.model_data["mdx_dim_f_set"] + self.dim_t = 2 ** self.model_data["mdx_dim_t_set"] + self.n_fft = self.model_data["mdx_n_fft_scale_set"] + + self.config_yaml = self.model_data.get("config_yaml", None) + + # Number of batches to be processed at a time. + # - Higher values mean more RAM usage but slightly faster processing times. + # - Lower values mean less RAM usage but slightly longer processing times. + # - Batch size value has no effect on output quality. + # BATCH_SIZE = ('1', ''2', '3', '4', '5', '6', '7', '8', '9', '10') + self.batch_size = arch_config.get("batch_size", 1) + + self.logger.debug(f"Model params: primary_stem={self.primary_stem_name}, secondary_stem={self.secondary_stem_name}") + self.logger.debug(f"Model params: batch_size={self.batch_size}, compensate={self.compensate}, segment_size={self.segment_size}, dim_f={self.dim_f}, dim_t={self.dim_t}") + self.logger.debug(f"Model params: n_fft={self.n_fft}, hop={self.hop_length}") + + # Loading the model for inference + self.logger.debug("Loading ONNX model for inference...") + if self.segment_size == self.dim_t: + ort_inference_session = ort.InferenceSession(self.model_path, providers=self.onnx_execution_provider) + self.model_run = lambda spek: ort_inference_session.run(None, {"input": spek.cpu().numpy()})[0] + self.logger.debug("Model loaded successfully using ONNXruntime inferencing session.") + else: + self.model_run = onnx2torch.convert(self.model_path) + self.model_run.to(self.torch_device).eval() + self.logger.warning("Model converted from onnx to pytorch due to segment size not matching dim_t, processing may be slower.") + + self.n_bins = 0 + self.trim = 0 + self.chunk_size = 0 + self.gen_size = 0 + self.stft = None + + self.primary_source = None + self.secondary_source = None + self.audio_file_path = None + self.audio_file_base = None + self.secondary_source_map = None + self.primary_source_map = None + + def separate(self, audio_file_path): + """ + Separates the audio file into primary and secondary sources based on the model's configuration. + It processes the mix, demixes it into sources, normalizes the sources, and saves the output files. + + Args: + audio_file_path (str): The path to the audio file to be processed. + + Returns: + list: A list of paths to the output files generated by the separation process. 
+ """ + self.primary_source = None + self.secondary_source = None + + self.audio_file_path = audio_file_path + self.audio_file_base = os.path.splitext(os.path.basename(audio_file_path))[0] + + # Prepare the mix for processing + self.logger.debug("Preparing mix...") + mix = self.prepare_mix(self.audio_file_path) + + self.logger.debug("Normalizing mix before demixing...") + mix = spec_utils.normalize(wave=mix, max_peak=self.normalization_threshold) + + # Start the demixing process + source = self.demix(mix) + self.logger.debug("Demixing completed.") + + # In UVR, the source is cached here if it's a vocal split model, but we're not supporting that yet + + # Initialize the list for output files + output_files = [] + self.logger.debug("Processing output files...") + + # Normalize and transpose the primary source if it's not already an array + if not isinstance(self.primary_source, np.ndarray): + self.logger.debug("Normalizing primary source...") + self.primary_source = spec_utils.normalize(wave=source, max_peak=self.normalization_threshold).T + + # Process the secondary source if not already an array + if not isinstance(self.secondary_source, np.ndarray): + self.logger.debug("Producing secondary source: demixing in match_mix mode") + raw_mix = self.demix(mix, is_match_mix=True) + + if self.invert_using_spec: + self.logger.debug("Inverting secondary stem using spectogram as invert_using_spec is set to True") + self.secondary_source = spec_utils.invert_stem(raw_mix, source) + else: + self.logger.debug("Inverting secondary stem by subtracting of transposed demixed stem from transposed original mix") + self.secondary_source = mix.T - source.T + + # Save and process the secondary stem if needed + if not self.output_single_stem or self.output_single_stem.lower() == self.secondary_stem_name.lower(): + self.logger.info(f"Saving {self.secondary_stem_name} stem...") + if not self.secondary_stem_output_path: + self.secondary_stem_output_path = os.path.join(f"{self.audio_file_base}_({self.secondary_stem_name})_{self.model_name}.{self.output_format.lower()}") + self.secondary_source_map = self.final_process(self.secondary_stem_output_path, self.secondary_source, self.secondary_stem_name) + output_files.append(self.secondary_stem_output_path) + + # Save and process the primary stem if needed + if not self.output_single_stem or self.output_single_stem.lower() == self.primary_stem_name.lower(): + self.logger.info(f"Saving {self.primary_stem_name} stem...") + if not self.primary_stem_output_path: + self.primary_stem_output_path = os.path.join(f"{self.audio_file_base}_({self.primary_stem_name})_{self.model_name}.{self.output_format.lower()}") + if not isinstance(self.primary_source, np.ndarray): + self.primary_source = source.T + self.primary_source_map = self.final_process(self.primary_stem_output_path, self.primary_source, self.primary_stem_name) + output_files.append(self.primary_stem_output_path) + + # Not yet implemented from UVR features: + # self.process_vocal_split_chain(secondary_sources) + # self.logger.debug("Vocal split chain processed.") + + return output_files + + def initialize_model_settings(self): + """ + This function sets up the necessary parameters for the model, like the number of frequency bins (n_bins), the trimming size (trim), + the size of each audio chunk (chunk_size), and the window function for spectral transformations (window). + It ensures that the model is configured with the correct settings for processing the audio data. 
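+        All of these values are derived from n_fft, hop_length and segment_size:
+        n_bins = n_fft // 2 + 1, trim = n_fft // 2, chunk_size = hop_length * (segment_size - 1), gen_size = chunk_size - 2 * trim.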
+ """ + self.logger.debug("Initializing model settings...") + + # n_bins is half the FFT size plus one (self.n_fft // 2 + 1). + self.n_bins = self.n_fft // 2 + 1 + + # trim is half the FFT size (self.n_fft // 2). + self.trim = self.n_fft // 2 + + # chunk_size is the hop_length size times the segment size minus one + self.chunk_size = self.hop_length * (self.segment_size - 1) + + # gen_size is the chunk size minus twice the trim size + self.gen_size = self.chunk_size - 2 * self.trim + + self.stft = STFT(self.logger, self.n_fft, self.hop_length, self.dim_f, self.torch_device) + + self.logger.debug(f"Model input params: n_fft={self.n_fft} hop_length={self.hop_length} dim_f={self.dim_f}") + self.logger.debug(f"Model settings: n_bins={self.n_bins}, trim={self.trim}, chunk_size={self.chunk_size}, gen_size={self.gen_size}") + + def initialize_mix(self, mix, is_ckpt=False): + """ + After prepare_mix segments the audio, initialize_mix further processes each segment. + It ensures each audio segment is in the correct format for the model, applies necessary padding, + and converts the segments into tensors for processing with the model. + This step is essential for preparing the audio data in a format that the neural network can process. + """ + # Log the initialization of the mix and whether checkpoint mode is used + self.logger.debug(f"Initializing mix with is_ckpt={is_ckpt}. Initial mix shape: {mix.shape}") + + # Ensure the mix is a 2-channel (stereo) audio signal + if mix.shape[0] != 2: + error_message = f"Expected a 2-channel audio signal, but got {mix.shape[0]} channels" + self.logger.error(error_message) + raise ValueError(error_message) + + # If in checkpoint mode, process the mix differently + if is_ckpt: + self.logger.debug("Processing in checkpoint mode...") + # Calculate padding based on the generation size and trim + pad = self.gen_size + self.trim - (mix.shape[-1] % self.gen_size) + self.logger.debug(f"Padding calculated: {pad}") + # Add padding at the beginning and the end of the mix + mixture = np.concatenate((np.zeros((2, self.trim), dtype="float32"), mix, np.zeros((2, pad), dtype="float32")), 1) + # Determine the number of chunks based on the mixture's length + num_chunks = mixture.shape[-1] // self.gen_size + self.logger.debug(f"Mixture shape after padding: {mixture.shape}, Number of chunks: {num_chunks}") + # Split the mixture into chunks + mix_waves = [mixture[:, i * self.gen_size : i * self.gen_size + self.chunk_size] for i in range(num_chunks)] + else: + # If not in checkpoint mode, process normally + self.logger.debug("Processing in non-checkpoint mode...") + mix_waves = [] + n_sample = mix.shape[1] + # Calculate necessary padding to make the total length divisible by the generation size + pad = self.gen_size - n_sample % self.gen_size + self.logger.debug(f"Number of samples: {n_sample}, Padding calculated: {pad}") + # Apply padding to the mix + mix_p = np.concatenate((np.zeros((2, self.trim)), mix, np.zeros((2, pad)), np.zeros((2, self.trim))), 1) + self.logger.debug(f"Shape of mix after padding: {mix_p.shape}") + + # Process the mix in chunks + i = 0 + while i < n_sample + pad: + waves = np.array(mix_p[:, i : i + self.chunk_size]) + mix_waves.append(waves) + self.logger.debug(f"Processed chunk {len(mix_waves)}: Start {i}, End {i + self.chunk_size}") + i += self.gen_size + + # Convert the list of wave chunks into a tensor for processing on the specified device + mix_waves_tensor = torch.tensor(mix_waves, dtype=torch.float32).to(self.torch_device) + 
self.logger.debug(f"Converted mix_waves to tensor. Tensor shape: {mix_waves_tensor.shape}") + + return mix_waves_tensor, pad + + def demix(self, mix, is_match_mix=False): + """ + Demixes the input mix into its constituent sources. If is_match_mix is True, the function adjusts the processing + to better match the mix, affecting chunk sizes and overlaps. The demixing process involves padding the mix, + processing it in chunks, applying windowing for overlaps, and accumulating the results to separate the sources. + """ + self.logger.debug(f"Starting demixing process with is_match_mix: {is_match_mix}...") + self.initialize_model_settings() + + # Preserves the original mix for later use. + # In UVR, this is used for the pitch fix and VR denoise processes, which aren't yet implemented here. + org_mix = mix + self.logger.debug(f"Original mix stored. Shape: {org_mix.shape}") + + # Initializes a list to store the separated waveforms. + tar_waves_ = [] + + # Handling different chunk sizes and overlaps based on the matching requirement. + if is_match_mix: + # Sets a smaller chunk size specifically for matching the mix. + chunk_size = self.hop_length * (self.segment_size - 1) + # Sets a small overlap for the chunks. + overlap = 0.02 + self.logger.debug(f"Chunk size for matching mix: {chunk_size}, Overlap: {overlap}") + else: + # Uses the regular chunk size defined in model settings. + chunk_size = self.chunk_size + # Uses the overlap specified in the model settings. + overlap = self.overlap + self.logger.debug(f"Standard chunk size: {chunk_size}, Overlap: {overlap}") + + # Calculates the generated size after subtracting the trim from both ends of the chunk. + gen_size = chunk_size - 2 * self.trim + self.logger.debug(f"Generated size calculated: {gen_size}") + + # Calculates padding to make the mix length a multiple of the generated size. + pad = gen_size + self.trim - ((mix.shape[-1]) % gen_size) + # Prepares the mixture with padding at the beginning and the end. + mixture = np.concatenate((np.zeros((2, self.trim), dtype="float32"), mix, np.zeros((2, pad), dtype="float32")), 1) + self.logger.debug(f"Mixture prepared with padding. Mixture shape: {mixture.shape}") + + # Calculates the step size for processing chunks based on the overlap. + step = int((1 - overlap) * chunk_size) + self.logger.debug(f"Step size for processing chunks: {step} as overlap is set to {overlap}.") + + # Initializes arrays to store the results and to account for overlap. + result = np.zeros((1, 2, mixture.shape[-1]), dtype=np.float32) + divider = np.zeros((1, 2, mixture.shape[-1]), dtype=np.float32) + + # Initializes counters for processing chunks. + total = 0 + total_chunks = (mixture.shape[-1] + step - 1) // step + self.logger.debug(f"Total chunks to process: {total_chunks}") + + # Processes each chunk of the mixture. + for i in tqdm(range(0, mixture.shape[-1], step)): + total += 1 + start = i + end = min(i + chunk_size, mixture.shape[-1]) + self.logger.debug(f"Processing chunk {total}/{total_chunks}: Start {start}, End {end}") + + # Handles windowing for overlapping chunks. + chunk_size_actual = end - start + window = None + if overlap != 0: + window = np.hanning(chunk_size_actual) + window = np.tile(window[None, None, :], (1, 2, 1)) + self.logger.debug("Window applied to the chunk.") + + # Zero-pad the chunk to prepare it for processing. 
+ mix_part_ = mixture[:, start:end] + if end != i + chunk_size: + pad_size = (i + chunk_size) - end + mix_part_ = np.concatenate((mix_part_, np.zeros((2, pad_size), dtype="float32")), axis=-1) + + # Converts the chunk to a tensor for processing. + mix_part = torch.tensor([mix_part_], dtype=torch.float32).to(self.torch_device) + # Splits the chunk into smaller batches if necessary. + mix_waves = mix_part.split(self.batch_size) + total_batches = len(mix_waves) + self.logger.debug(f"Mix part split into batches. Number of batches: {total_batches}") + + with torch.no_grad(): + # Processes each batch in the chunk. + batches_processed = 0 + for mix_wave in mix_waves: + batches_processed += 1 + self.logger.debug(f"Processing mix_wave batch {batches_processed}/{total_batches}") + + # Runs the model to separate the sources. + tar_waves = self.run_model(mix_wave, is_match_mix=is_match_mix) + + # Applies windowing if needed and accumulates the results. + if window is not None: + tar_waves[..., :chunk_size_actual] *= window + divider[..., start:end] += window + else: + divider[..., start:end] += 1 + + result[..., start:end] += tar_waves[..., : end - start] + + # Normalizes the results by the divider to account for overlap. + self.logger.debug("Normalizing result by dividing result by divider.") + tar_waves = result / divider + tar_waves_.append(tar_waves) + + # Reshapes the results to match the original dimensions. + tar_waves_ = np.vstack(tar_waves_)[:, :, self.trim : -self.trim] + tar_waves = np.concatenate(tar_waves_, axis=-1)[:, : mix.shape[-1]] + + # Extracts the source from the results. + source = tar_waves[:, 0:None] + self.logger.debug(f"Concatenated tar_waves. Shape: {tar_waves.shape}") + + # TODO: In UVR, pitch changing happens here. Consider implementing this as a feature. + + # Compensates the source if not matching the mix. + if not is_match_mix: + source *= self.compensate + self.logger.debug("Match mix mode; compensate multiplier applied.") + + # TODO: In UVR, VR denoise model gets applied here. Consider implementing this as a feature. + + self.logger.debug("Demixing process completed.") + return source + + def run_model(self, mix, is_match_mix=False): + """ + Processes the input mix through the model to separate the sources. + Applies STFT, handles spectrum modifications, and runs the model for source separation. + """ + # Applying the STFT to the mix. The mix is moved to the specified device (e.g., GPU) before processing. + # self.logger.debug(f"Running STFT on the mix. Mix shape before STFT: {mix.shape}") + spek = self.stft(mix.to(self.torch_device)) + self.logger.debug(f"STFT applied on mix. Spectrum shape: {spek.shape}") + + # Zeroing out the first 3 bins of the spectrum. This is often done to reduce low-frequency noise. + spek[:, :, :3, :] *= 0 + # self.logger.debug("First 3 bins of the spectrum zeroed out.") + + # Handling the case where the mix needs to be matched (is_match_mix = True) + if is_match_mix: + # self.logger.debug("Match mix mode is enabled. Converting spectrum to NumPy array.") + spec_pred = spek.cpu().numpy() + self.logger.debug("is_match_mix: spectrum prediction obtained directly from STFT output.") + else: + # If denoising is enabled, the model is run on both the negative and positive spectrums. 
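+            # Averaging the negated prediction for the inverted spectrum with the prediction for the original spectrum
+            # is intended to reduce noise in the output, at the cost of running the model twice.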
+ if self.enable_denoise: + # Assuming spek is a tensor and self.model_run can process it directly + spec_pred_neg = self.model_run(-spek) # Ensure this line correctly negates spek and runs the model + spec_pred_pos = self.model_run(spek) + # Ensure both spec_pred_neg and spec_pred_pos are tensors before applying operations + spec_pred = (spec_pred_neg * -0.5) + (spec_pred_pos * 0.5) # [invalid-unary-operand-type] + self.logger.debug("Model run on both negative and positive spectrums for denoising.") + else: + spec_pred = self.model_run(spek) + self.logger.debug("Model run on the spectrum without denoising.") + + # Applying the inverse STFT to convert the spectrum back to the time domain. + result = self.stft.inverse(torch.tensor(spec_pred).to(self.torch_device)).cpu().detach().numpy() + self.logger.debug(f"Inverse STFT applied. Returning result with shape: {result.shape}") + + return result + + def prepare_mix(self, mix): + """ + Prepares the mix for processing. This includes loading the audio from a file if necessary, + ensuring the mix is in the correct format, and converting mono to stereo if needed. + """ + # Store the original path or the mix itself for later checks + audio_path = mix + + # Check if the input is a file path (string) and needs to be loaded + if not isinstance(mix, np.ndarray): + self.logger.debug(f"Loading audio from file: {mix}") + mix, sr = librosa.load(mix, mono=False, sr=self.sample_rate) + self.logger.debug(f"Audio loaded. Sample rate: {sr}, Audio shape: {mix.shape}") + else: + # Transpose the mix if it's already an ndarray (expected shape: [channels, samples]) + self.logger.debug("Transposing the provided mix array.") + mix = mix.T + self.logger.debug(f"Transposed mix shape: {mix.shape}") + + # If the original input was a filepath, check if the loaded mix is empty + if isinstance(audio_path, str): + if not np.any(mix): + error_msg = f"Audio file {audio_path} is empty or not valid" + self.logger.error(error_msg) + raise ValueError(error_msg) + else: + self.logger.debug("Audio file is valid and contains data.") + + # Ensure the mix is in stereo format + if mix.ndim == 1: + self.logger.debug("Mix is mono. Converting to stereo.") + mix = np.asfortranarray([mix, mix]) + self.logger.debug("Converted to stereo mix.") + + # Final log indicating successful preparation of the mix + self.logger.debug("Mix preparation completed.") + return mix diff --git a/audio_separator/separator/architectures/vr_separator.py b/audio_separator/separator/architectures/vr_separator.py new file mode 100644 index 0000000..2bb0cf3 --- /dev/null +++ b/audio_separator/separator/architectures/vr_separator.py @@ -0,0 +1,337 @@ +"""Module for separating audio sources using VR architecture models.""" + +import os +import math + +import torch +import librosa +import numpy as np +from tqdm import tqdm + +# Check if we really need the rerun_mp3 function, remove if not +import audioread + +from audio_separator.separator.common_separator import CommonSeparator +from audio_separator.separator.uvr_lib_v5 import spec_utils +from audio_separator.separator.uvr_lib_v5.vr_network import nets +from audio_separator.separator.uvr_lib_v5.vr_network import nets_new +from audio_separator.separator.uvr_lib_v5.vr_network.model_param_init import ModelParameters + + +class VRSeparator(CommonSeparator): + """ + VRSeparator is responsible for separating audio sources using VR models. + It initializes with configuration parameters and prepares the model for separation tasks. 
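+    VR-specific options (batch_size, window_size, aggression, enable_tta, enable_post_process,
+    post_process_threshold and high_end_process) are read from arch_config, while settings shared
+    across architectures are handled by CommonSeparator.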
+ """ + + def __init__(self, common_config, arch_config: dict): + # Any configuration values which can be shared between architectures should be set already in CommonSeparator, + # e.g. user-specified functionality choices (self.output_single_stem) or common model parameters (self.primary_stem_name) + super().__init__(config=common_config) + + # Model data is basic overview metadata about the model, e.g. which stem is primary and whether it's a karaoke model + # It's loaded in from model_data_new.json in Separator.load_model and there are JSON examples in that method + # The instance variable self.model_data is passed through from Separator and set in CommonSeparator + self.logger.debug(f"Model data: {self.model_data}") + + # Most of the VR models use the same number of output channels, but the VR 51 models have specific values set in model_data JSON + self.model_capacity = 32, 128 + self.is_vr_51_model = False + + if "nout" in self.model_data.keys() and "nout_lstm" in self.model_data.keys(): + self.model_capacity = self.model_data["nout"], self.model_data["nout_lstm"] + self.is_vr_51_model = True + + # Model params are additional technical parameter values from JSON files in separator/uvr_lib_v5/vr_network/modelparams/*.json, + # with filenames referenced by the model_data["vr_model_param"] value + package_root_filepath = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + vr_params_json_dir = os.path.join(package_root_filepath, "uvr_lib_v5", "vr_network", "modelparams") + vr_params_json_filename = f"{self.model_data['vr_model_param']}.json" + vr_params_json_filepath = os.path.join(vr_params_json_dir, vr_params_json_filename) + self.model_params = ModelParameters(vr_params_json_filepath) + + self.logger.debug(f"Model params: {self.model_params.param}") + + # Arch Config is the VR architecture specific user configuration options, which should all be configurable by the user + # either by their Separator class instantiation or by passing in a CLI parameter. + # While there are similarities between architectures for some of these (e.g. batch_size), they are deliberately configured + # this way as they have architecture-specific default values. + + # This option performs Test-Time-Augmentation to improve the separation quality. + # Note: Having this selected will increase the time it takes to complete a conversion + self.enable_tta = arch_config.get("enable_tta", False) + + # This option can potentially identify leftover instrumental artifacts within the vocal outputs. \nThis option may improve the separation of some songs. + # Note: Selecting this option can adversely affect the conversion process, depending on the track. Because of this, it is only recommended as a last resort. + self.enable_post_process = arch_config.get("enable_post_process", False) + + # post_process_threshold values = ('0.1', '0.2', '0.3') + self.post_process_threshold = arch_config.get("post_process_threshold", 0.2) + + # Number of batches to be processed at a time. + # - Higher values mean more RAM usage but slightly faster processing times. + # - Lower values mean less RAM usage but slightly longer processing times. + # - Batch size value has no effect on output quality. 
+ + # Andrew note: for some reason, lower batch sizes seem to cause broken output for VR arch; need to investigate why + self.batch_size = arch_config.get("batch_size", 16) + + # 'Select window size to balance quality and speed:\n\n' + # '• 1024 - Quick but lesser quality.\n' + # '• 512 - Medium speed and quality.\n' + # '• 320 - Takes longer but may offer better quality.' + self.window_size = arch_config.get("window_size", 512) + + # The application will mirror the missing frequency range of the output. + self.high_end_process = arch_config.get("high_end_process", False) + self.input_high_end_h = None + self.input_high_end = None + + # Adjust the intensity of primary stem extraction: + # - Ranges from -100 - 100. + # - Bigger values mean deeper extractions. + # - Typically, it's set to 5 for vocals & instrumentals. + # - Values beyond 5 might muddy the sound for non-vocal models. + self.aggression = arch_config.get("aggression", 5) + + self.aggressiveness = {"value": self.aggression, "split_bin": self.model_params.param["band"][1]["crop_stop"], "aggr_correction": self.model_params.param.get("aggr_correction")} + + self.model_samplerate = self.model_params.param["sr"] + + self.logger.debug(f"VR arch params: enable_tta={self.enable_tta}, enable_post_process={self.enable_post_process}, post_process_threshold={self.post_process_threshold}") + self.logger.debug(f"VR arch params: batch_size={self.batch_size}, window_size={self.window_size}") + self.logger.debug(f"VR arch params: high_end_process={self.high_end_process}, aggression={self.aggression}") + self.logger.debug(f"VR arch params: is_vr_51_model={self.is_vr_51_model}, model_samplerate={self.model_samplerate}, model_capacity={self.model_capacity}") + + self.model_run = lambda *args, **kwargs: self.logger.error("Model run method is not initialised yet.") + + # This should go away once we refactor to remove soundfile.write and replace with pydub like we did for the MDX rewrite + self.wav_subtype = "PCM_16" + + self.logger.info("VR Separator initialisation complete") + + def separate(self, audio_file_path): + """ + Separates the audio file into primary and secondary sources based on the model's configuration. + It processes the mix, demixes it into sources, normalizes the sources, and saves the output files. + + Args: + audio_file_path (str): The path to the audio file to be processed. + + Returns: + list: A list of paths to the output files generated by the separation process. 
+ """ + self.primary_source = None + self.secondary_source = None + + self.audio_file_path = audio_file_path + self.audio_file_base = os.path.splitext(os.path.basename(audio_file_path))[0] + + self.logger.debug("Starting inference...") + + nn_arch_sizes = [31191, 33966, 56817, 123821, 123812, 129605, 218409, 537238, 537227] # default + vr_5_1_models = [56817, 218409] + model_size = math.ceil(os.stat(self.model_path).st_size / 1024) + nn_arch_size = min(nn_arch_sizes, key=lambda x: abs(x - model_size)) + self.logger.debug(f"Model size determined: {model_size}, NN architecture size: {nn_arch_size}") + + if nn_arch_size in vr_5_1_models or self.is_vr_51_model: + self.logger.debug("Using CascadedNet for VR 5.1 model...") + self.model_run = nets_new.CascadedNet(self.model_params.param["bins"] * 2, nn_arch_size, nout=self.model_capacity[0], nout_lstm=self.model_capacity[1]) + self.is_vr_51_model = True + else: + self.logger.debug("Determining model capacity...") + self.model_run = nets.determine_model_capacity(self.model_params.param["bins"] * 2, nn_arch_size) + + self.model_run.load_state_dict(torch.load(self.model_path, map_location=self.torch_device_cpu)) + self.model_run.to(self.torch_device) + self.logger.debug("Model loaded and moved to device.") + + y_spec, v_spec = self.inference_vr(self.loading_mix(), self.torch_device, self.aggressiveness) + self.logger.debug("Inference completed.") + + # Not yet implemented from UVR features: + # + # if not self.is_vocal_split_model: + # self.cache_source((y_spec, v_spec)) + + # if self.is_secondary_model_activated and self.secondary_model: + # self.logger.debug("Processing secondary model...") + # self.secondary_source_primary, self.secondary_source_secondary = process_secondary_model( + # self.secondary_model, self.process_data, main_process_method=self.process_method, main_model_primary=self.primary_stem + # ) + + # Initialize the list for output files + output_files = [] + self.logger.debug("Processing output files...") + + # Save and process the primary stem if needed + if not self.output_single_stem or self.output_single_stem.lower() == self.primary_stem_name.lower(): + self.logger.info(f"Saving {self.primary_stem_name} stem...") + if not self.primary_stem_output_path: + self.primary_stem_output_path = os.path.join(f"{self.audio_file_base}_({self.primary_stem_name})_{self.model_name}.{self.output_format.lower()}") + + if not isinstance(self.primary_source, np.ndarray): + self.primary_source = self.spec_to_wav(y_spec).T + self.logger.debug("Converting primary source spectrogram to waveform.") + if not self.model_samplerate == 44100: + self.primary_source = librosa.resample(self.primary_source.T, orig_sr=self.model_samplerate, target_sr=44100).T + self.logger.debug("Resampling primary source to 44100Hz.") + + self.primary_source_map = self.final_process(self.primary_stem_output_path, self.primary_source, self.primary_stem_name) + self.logger.debug("Primary stem processed.") + output_files.append(self.primary_stem_output_path) + + # Save and process the secondary stem if needed + if not self.output_single_stem or self.output_single_stem.lower() == self.secondary_stem_name.lower(): + self.logger.info(f"Saving {self.secondary_stem_name} stem...") + if not self.secondary_stem_output_path: + self.secondary_stem_output_path = os.path.join(f"{self.audio_file_base}_({self.secondary_stem_name})_{self.model_name}.{self.output_format.lower()}") + + self.logger.debug(f"Processing secondary stem: {self.secondary_stem_name}") + if not 
isinstance(self.secondary_source, np.ndarray): + self.secondary_source = self.spec_to_wav(v_spec).T + self.logger.debug("Converting secondary source spectrogram to waveform.") + if not self.model_samplerate == 44100: + self.secondary_source = librosa.resample(self.secondary_source.T, orig_sr=self.model_samplerate, target_sr=44100).T + self.logger.debug("Resampling secondary source to 44100Hz.") + + self.secondary_source_map = self.final_process(self.secondary_stem_output_path, self.secondary_source, self.secondary_stem_name) + self.logger.debug("Secondary stem processed.") + output_files.append(self.secondary_stem_output_path) + + # Not yet implemented from UVR features: + # self.process_vocal_split_chain(secondary_sources) + # self.logger.debug("Vocal split chain processed.") + + return output_files + + def loading_mix(self): + X_wave, X_spec_s = {}, {} + + bands_n = len(self.model_params.param["band"]) + + audio_file = spec_utils.write_array_to_mem(self.audio_file_path, subtype=self.wav_subtype) + is_mp3 = audio_file.endswith(".mp3") if isinstance(audio_file, str) else False + + self.logger.debug(f"loading_mix iteraring through {bands_n} bands") + for d in tqdm(range(bands_n, 0, -1)): + bp = self.model_params.param["band"][d] + + wav_resolution = bp["res_type"] + + if self.torch_device_mps is not None: + wav_resolution = "polyphase" + + if d == bands_n: # high-end band + X_wave[d], _ = librosa.load(audio_file, sr=bp["sr"], mono=False, dtype=np.float32, res_type=wav_resolution) + X_spec_s[d] = spec_utils.wave_to_spectrogram(X_wave[d], bp["hl"], bp["n_fft"], self.model_params, band=d, is_v51_model=self.is_vr_51_model) + + if not np.any(X_wave[d]) and is_mp3: + X_wave[d] = rerun_mp3(audio_file, bp["sr"]) + + if X_wave[d].ndim == 1: + X_wave[d] = np.asarray([X_wave[d], X_wave[d]]) + else: # lower bands + X_wave[d] = librosa.resample(X_wave[d + 1], orig_sr=self.model_params.param["band"][d + 1]["sr"], target_sr=bp["sr"], res_type=wav_resolution) + X_spec_s[d] = spec_utils.wave_to_spectrogram(X_wave[d], bp["hl"], bp["n_fft"], self.model_params, band=d, is_v51_model=self.is_vr_51_model) + + if d == bands_n and self.high_end_process: + self.input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (self.model_params.param["pre_filter_stop"] - self.model_params.param["pre_filter_start"]) + self.input_high_end = X_spec_s[d][:, bp["n_fft"] // 2 - self.input_high_end_h : bp["n_fft"] // 2, :] + + X_spec = spec_utils.combine_spectrograms(X_spec_s, self.model_params, is_v51_model=self.is_vr_51_model) + + del X_wave, X_spec_s, audio_file + + return X_spec + + def inference_vr(self, X_spec, device, aggressiveness): + def _execute(X_mag_pad, roi_size): + X_dataset = [] + patches = (X_mag_pad.shape[2] - 2 * self.model_run.offset) // roi_size + + self.logger.debug(f"inference_vr appending to X_dataset for each of {patches} patches") + for i in tqdm(range(patches)): + start = i * roi_size + X_mag_window = X_mag_pad[:, :, start : start + self.window_size] + X_dataset.append(X_mag_window) + + total_iterations = patches // self.batch_size if not self.enable_tta else (patches // self.batch_size) * 2 + self.logger.debug(f"inference_vr iterating through {total_iterations} batches, batch_size = {self.batch_size}") + + X_dataset = np.asarray(X_dataset) + self.model_run.eval() + with torch.no_grad(): + mask = [] + + for i in tqdm(range(0, patches, self.batch_size)): + + X_batch = X_dataset[i : i + self.batch_size] + X_batch = torch.from_numpy(X_batch).to(device) + pred = self.model_run.predict_mask(X_batch) + if not 
pred.size()[3] > 0: + raise ValueError(f"Window size error: h1_shape[3] must be greater than h2_shape[3]") + pred = pred.detach().cpu().numpy() + pred = np.concatenate(pred, axis=2) + mask.append(pred) + if len(mask) == 0: + raise ValueError(f"Window size error: h1_shape[3] must be greater than h2_shape[3]") + + mask = np.concatenate(mask, axis=2) + return mask + + def postprocess(mask, X_mag, X_phase): + is_non_accom_stem = False + for stem in CommonSeparator.NON_ACCOM_STEMS: + if stem == self.primary_stem_name: + is_non_accom_stem = True + + mask = spec_utils.adjust_aggr(mask, is_non_accom_stem, aggressiveness) + + if self.enable_post_process: + mask = spec_utils.merge_artifacts(mask, thres=self.post_process_threshold) + + y_spec = mask * X_mag * np.exp(1.0j * X_phase) + v_spec = (1 - mask) * X_mag * np.exp(1.0j * X_phase) + + return y_spec, v_spec + + X_mag, X_phase = spec_utils.preprocess(X_spec) + n_frame = X_mag.shape[2] + pad_l, pad_r, roi_size = spec_utils.make_padding(n_frame, self.window_size, self.model_run.offset) + X_mag_pad = np.pad(X_mag, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") + X_mag_pad /= X_mag_pad.max() + mask = _execute(X_mag_pad, roi_size) + + if self.enable_tta: + pad_l += roi_size // 2 + pad_r += roi_size // 2 + X_mag_pad = np.pad(X_mag, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") + X_mag_pad /= X_mag_pad.max() + mask_tta = _execute(X_mag_pad, roi_size) + mask_tta = mask_tta[:, :, roi_size // 2 :] + mask = (mask[:, :, :n_frame] + mask_tta[:, :, :n_frame]) * 0.5 + else: + mask = mask[:, :, :n_frame] + + y_spec, v_spec = postprocess(mask, X_mag, X_phase) + + return y_spec, v_spec + + def spec_to_wav(self, spec): + if self.high_end_process and isinstance(self.input_high_end, np.ndarray) and self.input_high_end_h: + input_high_end_ = spec_utils.mirroring("mirroring", spec, self.input_high_end, self.model_params) + wav = spec_utils.cmb_spectrogram_to_wave(spec, self.model_params, self.input_high_end_h, input_high_end_, is_v51_model=self.is_vr_51_model) + else: + wav = spec_utils.cmb_spectrogram_to_wave(spec, self.model_params, is_v51_model=self.is_vr_51_model) + + return wav + + +# Check if we really need the rerun_mp3 function, refactor or remove if not +def rerun_mp3(audio_file, sample_rate=44100): + with audioread.audio_open(audio_file) as f: + track_length = int(f.duration) + + return librosa.load(audio_file, duration=track_length, mono=False, sr=sample_rate)[0] diff --git a/audio_separator/separator/common_separator.py b/audio_separator/separator/common_separator.py new file mode 100644 index 0000000..297e6fa --- /dev/null +++ b/audio_separator/separator/common_separator.py @@ -0,0 +1,232 @@ +""" This file contains the CommonSeparator class, common to all architecture-specific Separator classes. """ + +from logging import Logger +import os +import numpy as np +from pydub import AudioSegment +from audio_separator.separator.uvr_lib_v5 import spec_utils + + +class CommonSeparator: + """ + This class contains the common methods and attributes common to all architecture-specific Separator classes. 
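+    It holds the configuration shared by all architectures (devices, model metadata, output paths and format,
+    normalization and stem options) and provides the final_process, write_audio and source caching helpers
+    used by the architecture-specific separators.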
+ """ + + ALL_STEMS = "All Stems" + VOCAL_STEM = "Vocals" + INST_STEM = "Instrumental" + OTHER_STEM = "Other" + BASS_STEM = "Bass" + DRUM_STEM = "Drums" + GUITAR_STEM = "Guitar" + PIANO_STEM = "Piano" + SYNTH_STEM = "Synthesizer" + STRINGS_STEM = "Strings" + WOODWINDS_STEM = "Woodwinds" + BRASS_STEM = "Brass" + WIND_INST_STEM = "Wind Inst" + NO_OTHER_STEM = "No Other" + NO_BASS_STEM = "No Bass" + NO_DRUM_STEM = "No Drums" + NO_GUITAR_STEM = "No Guitar" + NO_PIANO_STEM = "No Piano" + NO_SYNTH_STEM = "No Synthesizer" + NO_STRINGS_STEM = "No Strings" + NO_WOODWINDS_STEM = "No Woodwinds" + NO_WIND_INST_STEM = "No Wind Inst" + NO_BRASS_STEM = "No Brass" + PRIMARY_STEM = "Primary Stem" + SECONDARY_STEM = "Secondary Stem" + LEAD_VOCAL_STEM = "lead_only" + BV_VOCAL_STEM = "backing_only" + LEAD_VOCAL_STEM_I = "with_lead_vocals" + BV_VOCAL_STEM_I = "with_backing_vocals" + LEAD_VOCAL_STEM_LABEL = "Lead Vocals" + BV_VOCAL_STEM_LABEL = "Backing Vocals" + + NON_ACCOM_STEMS = (VOCAL_STEM, OTHER_STEM, BASS_STEM, DRUM_STEM, GUITAR_STEM, PIANO_STEM, SYNTH_STEM, STRINGS_STEM, WOODWINDS_STEM, BRASS_STEM, WIND_INST_STEM) + + def __init__(self, config): + + self.logger: Logger = config.get("logger") + + # Inferencing device / acceleration config + self.torch_device = config.get("torch_device") + self.torch_device_cpu = config.get("torch_device_cpu") + self.torch_device_mps = config.get("torch_device_mps") + self.onnx_execution_provider = config.get("onnx_execution_provider") + + # Model data + self.model_name = config.get("model_name") + self.model_path = config.get("model_path") + self.model_data = config.get("model_data") + + # Optional custom output paths for the primary and secondary stems + # If left as None, the arch-specific class decides the output filename, e.g. something like: + # f"{self.audio_file_base}_({self.primary_stem_name})_{self.model_name}.{self.output_format.lower()}" + self.primary_stem_output_path = config.get("primary_stem_output_path") + self.secondary_stem_output_path = config.get("secondary_stem_output_path") + + # Output directory and format + self.output_dir = config.get("output_dir") + self.output_format = config.get("output_format") + + # Functional options which are applicable to all architectures and the user may tweak to affect the output + self.normalization_threshold = config.get("normalization_threshold") + self.enable_denoise = config.get("enable_denoise") + self.output_single_stem = config.get("output_single_stem") + self.invert_using_spec = config.get("invert_using_spec") + self.sample_rate = config.get("sample_rate") + + # Model specific properties + self.primary_stem_name = self.model_data["primary_stem"] + self.secondary_stem_name = "Vocals" if self.primary_stem_name == "Instrumental" else "Instrumental" + self.is_karaoke = self.model_data.get("is_karaoke", False) + self.is_bv_model = self.model_data.get("is_bv_model", False) + self.bv_model_rebalance = self.model_data.get("is_bv_model_rebalanced", 0) + + # In UVR, these variables are set but either aren't useful or are better handled in audio-separator. + # Leaving these comments explaining to help myself or future developers understand why these aren't in audio-separator. + + # "chunks" is not actually used for anything in UVR... + # self.chunks = 0 + + # "adjust" is hard-coded to 1 in UVR, and only used as a multiplier in run_model, so it does nothing. + # self.adjust = 1 + + # "hop" is hard-coded to 1024 in UVR. 
We have a "hop_length" parameter instead + # self.hop = 1024 + + # "margin" maps to sample rate and is set from the GUI in UVR (default: 44100). We have a "sample_rate" parameter instead. + # self.margin = 44100 + + # "dim_c" is hard-coded to 4 in UVR, seems to be a parameter for the number of channels, and is only used for checkpoint models. + # We haven't implemented support for the checkpoint models here, so we're not using it. + # self.dim_c = 4 + + self.logger.debug(f"Common params: model_name={self.model_name}, model_path={self.model_path}") + self.logger.debug(f"Common params: primary_stem_output_path={self.primary_stem_output_path}, secondary_stem_output_path={self.secondary_stem_output_path}") + self.logger.debug(f"Common params: output_dir={self.output_dir}, output_format={self.output_format}") + self.logger.debug(f"Common params: normalization_threshold={self.normalization_threshold}") + self.logger.debug(f"Common params: enable_denoise={self.enable_denoise}, output_single_stem={self.output_single_stem}") + self.logger.debug(f"Common params: invert_using_spec={self.invert_using_spec}, sample_rate={self.sample_rate}") + + self.logger.debug(f"Common params: primary_stem_name={self.primary_stem_name}, secondary_stem_name={self.secondary_stem_name}") + self.logger.debug(f"Common params: is_karaoke={self.is_karaoke}, is_bv_model={self.is_bv_model}, bv_model_rebalance={self.bv_model_rebalance}") + + self.cached_sources_map = {} + + def separate(self, audio_file_path): + """ + Placeholder method for separating audio sources. Should be overridden by subclasses. + """ + raise NotImplementedError("This method should be overridden by subclasses.") + + def final_process(self, stem_path, source, stem_name): + """ + Finalizes the processing of a stem by writing the audio to a file and returning the processed source. + """ + self.logger.debug(f"Finalizing {stem_name} stem processing and writing audio...") + self.write_audio(stem_path, source) + + return {stem_name: source} + + def cached_sources_clear(self): + """ + Clears the cache dictionaries for VR, MDX, and Demucs models. + + This function is essential for ensuring that the cache does not hold outdated or irrelevant data + between different processing sessions or when a new batch of audio files is processed. + It helps in managing memory efficiently and prevents potential errors due to stale data. + """ + self.cached_sources_map = {} + + def cached_source_callback(self, model_architecture, model_name=None): + """ + Retrieves the model and sources from the cache based on the processing method and model name. + + Args: + model_architecture: The architecture type (VR, MDX, or Demucs) being used for processing. + model_name: The specific model name within the architecture type, if applicable. + + Returns: + A tuple containing the model and its sources if found in the cache; otherwise, None. + + This function is crucial for optimizing performance by avoiding redundant processing. + If the requested model and its sources are already in the cache, they can be reused directly, + saving time and computational resources. + """ + model, sources = None, None + + mapper = self.cached_sources_map[model_architecture] + + for key, value in mapper.items(): + if model_name in key: + model = key + sources = value + + return model, sources + + def cached_model_source_holder(self, model_architecture, sources, model_name=None): + """ + Update the dictionary for the given model_architecture with the new model name and its sources. 
+ Use the model_architecture as a key to access the corresponding cache source mapper dictionary. + """ + self.cached_sources_map[model_architecture] = {**self.cached_sources_map.get(model_architecture, {}), **{model_name: sources}} + + def write_audio(self, stem_path: str, stem_source): + """ + Writes the separated audio source to a file. + """ + self.logger.debug(f"Entering write_audio with stem_path: {stem_path}") + + stem_source = spec_utils.normalize(wave=stem_source, max_peak=self.normalization_threshold) + + # Check if the numpy array is empty or contains very low values + if np.max(np.abs(stem_source)) < 1e-6: + self.logger.warning("Warning: stem_source array is near-silent or empty.") + return + + # If output_dir is specified, create it and join it with stem_path + if self.output_dir: + os.makedirs(self.output_dir, exist_ok=True) + stem_path = os.path.join(self.output_dir, stem_path) + + self.logger.debug(f"Audio data shape before processing: {stem_source.shape}") + self.logger.debug(f"Data type before conversion: {stem_source.dtype}") + + # Ensure the audio data is in the correct format (e.g., int16) + if stem_source.dtype != np.int16: + stem_source = (stem_source * 32767).astype(np.int16) + self.logger.debug("Converted stem_source to int16.") + + # Correctly interleave stereo channels + stem_source_interleaved = np.empty((2 * stem_source.shape[0],), dtype=np.int16) + stem_source_interleaved[0::2] = stem_source[:, 0] # Left channel + stem_source_interleaved[1::2] = stem_source[:, 1] # Right channel + + self.logger.debug(f"Interleaved audio data shape: {stem_source_interleaved.shape}") + + # Create a pydub AudioSegment + try: + audio_segment = AudioSegment(stem_source_interleaved.tobytes(), frame_rate=self.sample_rate, sample_width=stem_source.dtype.itemsize, channels=2) + self.logger.debug("Created AudioSegment successfully.") + except (IOError, ValueError) as e: + self.logger.error(f"Specific error creating AudioSegment: {e}") + return + + # Determine file format based on the file extension + file_format = stem_path.lower().split(".")[-1] + + # For m4a files, specify mp4 as the container format as the extension doesn't match the format name + if file_format == "m4a": + file_format = "mp4" + elif file_format == "mka": + file_format = "matroska" + + # Export using the determined format + try: + audio_segment.export(stem_path, format=file_format) + self.logger.debug(f"Exported audio file successfully to {stem_path}") + except (IOError, ValueError) as e: + self.logger.error(f"Error exporting audio file: {e}") diff --git a/audio_separator/separator/separator.py b/audio_separator/separator/separator.py index 2b4263c..e8c009f 100644 --- a/audio_separator/separator/separator.py +++ b/audio_separator/separator/separator.py @@ -1,3 +1,6 @@ +""" This file contains the Separator class, to facilitate the separation of stems from audio. """ + +from importlib import metadata import os import gc import platform @@ -8,36 +11,67 @@ import warnings import requests import torch -import librosa -import numpy as np import onnxruntime as ort -from importlib import metadata -from onnx2torch import convert -from pydub import AudioSegment -from audio_separator.separator import spec_utils -from audio_separator.separator.stft import STFT -from tqdm import tqdm +from audio_separator.separator.architectures import MDXSeparator, VRSeparator + class Separator: + """ + The Separator class is designed to facilitate the separation of audio sources from a given audio file. 
+ It supports various separation architectures and models, including MDX and VR. The class provides + functionalities to configure separation parameters, load models, and perform audio source separation. + It also handles logging, normalization, and output formatting of the separated audio stems. + + The actual separation task is handled by one of the architecture-specific classes in the `architectures` module; + this class is responsible for initialising logging, configuring hardware acceleration, loading the model, + initiating the separation process and passing outputs back to the caller. + + Common Attributes: + log_level (int): The logging level. + log_formatter (logging.Formatter): The logging formatter. + model_file_dir (str): The directory where model files are stored. + output_dir (str): The directory where output files will be saved. + primary_stem_output_path (str): The path for saving the primary stem. + secondary_stem_output_path (str): The path for saving the secondary stem. + output_format (str): The format of the output audio file. + normalization_threshold (float): The threshold for audio normalization. + enable_denoise (bool): Flag to enable or disable denoising. + output_single_stem (str): Option to output a single stem. + invert_using_spec (bool): Flag to invert using spectrogram. + sample_rate (int): The sample rate of the audio. + + MDX Architecture Specific Attributes: + hop_length (int): The hop length for STFT. + segment_size (int): The segment size for processing. + overlap (float): The overlap between segments. + batch_size (int): The batch size for processing. + + VR Architecture Specific Attributes & Defaults: + batch_size: 16 + window_size: 512 + aggression: 5 + enable_tta: False + enable_post_process: False + post_process_threshold: 0.2 + high_end_process: False + """ + def __init__( self, log_level=logging.DEBUG, log_formatter=None, model_file_dir="/tmp/audio-separator-models/", output_dir=None, - primary_stem_path=None, - secondary_stem_path=None, + primary_stem_output_path=None, + secondary_stem_output_path=None, output_format="WAV", - output_subtype=None, normalization_threshold=0.9, - denoise_enabled=False, + enable_denoise=False, output_single_stem=None, invert_using_spec=False, sample_rate=44100, - hop_length=1024, - segment_size=256, - overlap=0.25, - batch_size=1, + mdx_params={"hop_length": 1024, "segment_size": 256, "overlap": 0.25, "batch_size": 1}, + vr_params={"batch_size": 16, "window_size": 512, "aggression": 5, "enable_tta": False, "enable_post_process": False, "post_process_threshold": 0.2, "high_end_process": False}, ): self.logger = logging.getLogger(__name__) self.logger.setLevel(log_level) @@ -64,143 +98,148 @@ def __init__( self.model_file_dir = model_file_dir self.output_dir = output_dir - self.primary_stem_path = primary_stem_path - self.secondary_stem_path = secondary_stem_path + + # Allow the user to specify the output paths for the primary and secondary stems + # If left as None, the arch-specific class decides the output filename, typically e.g. 
something like: + # f"{self.audio_file_base}_({self.primary_stem_name})_{self.model_name}.{self.output_format.lower()}" + self.primary_stem_output_path = primary_stem_output_path + self.secondary_stem_output_path = secondary_stem_output_path # Create the model directory if it does not exist os.makedirs(self.model_file_dir, exist_ok=True) - self.output_subtype = output_subtype self.output_format = output_format if self.output_format is None: self.output_format = "WAV" - if self.output_subtype is None and output_format == "WAV": - self.output_subtype = "PCM_16" - self.normalization_threshold = normalization_threshold - self.logger.debug( - f"Normalization threshold set to {normalization_threshold}, waveform will lowered to this max amplitude to avoid clipping." - ) + self.logger.debug(f"Normalization threshold set to {normalization_threshold}, waveform will lowered to this max amplitude to avoid clipping.") - self.denoise_enabled = denoise_enabled - if self.denoise_enabled: + self.enable_denoise = enable_denoise + if self.enable_denoise: self.logger.debug(f"Denoising enabled, model will be run twice to reduce noise in output audio.") else: - self.logger.debug( - f"Denoising disabled, model will only be run once. This is twice as fast, but may result in noisier output audio." - ) + self.logger.debug(f"Denoising disabled, model will only be run once. This is twice as fast, but may result in noisier output audio.") self.output_single_stem = output_single_stem if output_single_stem is not None: if output_single_stem.lower() not in {"instrumental", "vocals"}: - raise Exception("output_single_stem must be either 'instrumental' or 'vocals'") + raise ValueError("output_single_stem must be either 'instrumental' or 'vocals'") self.logger.debug(f"Single stem output requested, only one output file ({output_single_stem}) will be written") self.invert_using_spec = invert_using_spec if self.invert_using_spec: - self.logger.debug( - f"Secondary step will be inverted using spectogram rather than waveform. This may improve quality, but is slightly slower." - ) + self.logger.debug(f"Secondary step will be inverted using spectogram rather than waveform. This may improve quality, but is slightly slower.") self.sample_rate = sample_rate - self.hop_length = hop_length - self.segment_size = segment_size - self.overlap = overlap - self.batch_size = batch_size - self.logger.debug( - f"Separation settings set: sample_rate={self.sample_rate}, hop_length={self.hop_length}, segment_size={self.segment_size}, overlap={self.overlap}, batch_size={self.batch_size}" - ) - self.setup_inferencing_device() + # These are parameters which users may want to configure so we expose them to the top-level Separator class, + # even though they are specific to a single model architecture + self.arch_specific_params = {"MDX": mdx_params, "VR": vr_params} + + self.torch_device = None + self.torch_device_cpu = None + self.torch_device_mps = None + + self.onnx_execution_provider = None + self.model_instance = None + self.audio_file_path = None + self.audio_file_base = None + self.primary_source = None + self.secondary_source = None + + self.setup_accelerated_inferencing_device() - def setup_inferencing_device(self): - self.logger.info(f"Checking hardware specifics to configure acceleration") + def setup_accelerated_inferencing_device(self): + """ + This method sets up the PyTorch and/or ONNX Runtime inferencing device, using GPU hardware acceleration if available. 
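+        It logs system and ONNX Runtime package information before selecting the Torch device and ONNX execution provider.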
+ """ + self.log_system_info() + self.log_onnxruntime_packages() + self.setup_torch_device() + def log_system_info(self): + """ + This method logs the system information, including the operating system, CPU archutecture and Python version + """ os_name = platform.system() os_version = platform.version() self.logger.info(f"Operating System: {os_name} {os_version}") system_info = platform.uname() - self.logger.info( - f"System: {system_info.system} Node: {system_info.node} Release: {system_info.release} Machine: {system_info.machine} Proc: {system_info.processor}" - ) + self.logger.info(f"System: {system_info.system} Node: {system_info.node} Release: {system_info.release} Machine: {system_info.machine} Proc: {system_info.processor}") python_version = platform.python_version() self.logger.info(f"Python Version: {python_version}") + def log_onnxruntime_packages(self): + """ + This method logs the ONNX Runtime package versions, including the GPU and Silicon packages if available. + """ onnxruntime_gpu_package = self.get_package_distribution("onnxruntime-gpu") + onnxruntime_silicon_package = self.get_package_distribution("onnxruntime-silicon") + onnxruntime_cpu_package = self.get_package_distribution("onnxruntime") + if onnxruntime_gpu_package is not None: self.logger.info(f"ONNX Runtime GPU package installed with version: {onnxruntime_gpu_package.version}") - - onnxruntime_silicon_package = self.get_package_distribution("onnxruntime-silicon") if onnxruntime_silicon_package is not None: self.logger.info(f"ONNX Runtime Silicon package installed with version: {onnxruntime_silicon_package.version}") - - onnxruntime_cpu_package = self.get_package_distribution("onnxruntime") if onnxruntime_cpu_package is not None: self.logger.info(f"ONNX Runtime CPU package installed with version: {onnxruntime_cpu_package.version}") - torch_package = self.get_package_distribution("torch") - if torch_package is not None: - self.logger.info(f"Torch package installed with version: {torch_package.version}") - - torchvision_package = self.get_package_distribution("torchvision") - if torchvision_package is not None: - self.logger.info(f"Torchvision package installed with version: {torchvision_package.version}") - - torchaudio_package = self.get_package_distribution("torchaudio") - if torchaudio_package is not None: - self.logger.info(f"Torchaudio package installed with version: {torchaudio_package.version}") - - ort_device = ort.get_device() + def setup_torch_device(self): + """ + This method sets up the PyTorch and/or ONNX Runtime inferencing device, using GPU hardware acceleration if available. 
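+        CUDA is preferred when available, then Apple Silicon MPS; otherwise both Torch and ONNX Runtime fall back to CPU.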
+ """ + hardware_acceleration_enabled = False ort_providers = ort.get_available_providers() - self.cpu = torch.device("cpu") - hardware_acceleration_enabled = False + self.torch_device_cpu = torch.device("cpu") - # Prepare for hardware-accelerated inference by validating both Torch and ONNX Runtime support either CUDA or CoreML if torch.cuda.is_available(): - self.logger.info("CUDA is available in Torch, setting Torch device to CUDA") - self.device = torch.device("cuda") - - if onnxruntime_gpu_package is not None and ort_device == "GPU" and "CUDAExecutionProvider" in ort_providers: - self.logger.info("ONNXruntime has CUDAExecutionProvider available, enabling acceleration") - self.onnx_execution_provider = ["CUDAExecutionProvider"] - hardware_acceleration_enabled = True - else: - self.logger.warning("CUDAExecutionProvider not available in ONNXruntime, so acceleration will NOT be enabled") - self.logger.warning("If you expect CUDA to work with your GPU, try pip install --force-reinstall onnxruntime-gpu") - else: - self.logger.debug("CUDA not available in Torch installation. If you expect GPU/CUDA support to work, please see README") + self.configure_cuda(ort_providers) + hardware_acceleration_enabled = True + elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + self.configure_mps(ort_providers) + hardware_acceleration_enabled = True - if onnxruntime_silicon_package is not None and hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): - self.logger.info("Apple Silicon MPS/CoreML is available in Torch, setting Torch device to MPS") + if not hardware_acceleration_enabled: + self.logger.info("No hardware acceleration could be configured, running in CPU mode") + self.torch_device = self.torch_device_cpu + self.onnx_execution_provider = ["CPUExecutionProvider"] - # TODO: Change this to use MPS once FFTs are supported, see https://github.com/pytorch/pytorch/issues/78044 - # self.device = torch.device("mps") + def configure_cuda(self, ort_providers): + """ + This method configures the CUDA device for PyTorch and ONNX Runtime, if available. + """ + self.logger.info("CUDA is available in Torch, setting Torch device to CUDA") + self.torch_device = torch.device("cuda") + if "CUDAExecutionProvider" in ort_providers: + self.logger.info("ONNXruntime has CUDAExecutionProvider available, enabling acceleration") + self.onnx_execution_provider = ["CUDAExecutionProvider"] + else: + self.logger.warning("CUDAExecutionProvider not available in ONNXruntime, so acceleration will NOT be enabled") - self.logger.warning("Torch MPS backend does not yet support FFT operations, Torch will still use CPU!") - self.logger.warning("To track progress towards Apple Silicon acceleration, see https://github.com/pytorch/pytorch/issues/78044") - self.device = torch.device("cpu") + def configure_mps(self, ort_providers): + """ + This method configures the Apple Silicon MPS/CoreML device for PyTorch and ONNX Runtime, if available. 
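setup_torch_device, configure_cuda above and configure_mps just below pair a torch.device with a matching ONNX Runtime execution provider, falling back to CPU when neither CUDA nor MPS is usable. A condensed sketch of that decision, assuming torch and onnxruntime are importable; unlike the diff's version, this sketch simply falls back to the CPU provider where the real code leaves the provider unset and logs a warning:

```python
import torch
import onnxruntime as ort

def pick_devices():
    """Return (torch_device, onnx_providers), preferring CUDA, then Apple MPS, then CPU."""
    providers = ort.get_available_providers()
    if torch.cuda.is_available():
        onnx = ["CUDAExecutionProvider"] if "CUDAExecutionProvider" in providers else ["CPUExecutionProvider"]
        return torch.device("cuda"), onnx
    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        onnx = ["CoreMLExecutionProvider"] if "CoreMLExecutionProvider" in providers else ["CPUExecutionProvider"]
        return torch.device("mps"), onnx
    return torch.device("cpu"), ["CPUExecutionProvider"]

device, providers = pick_devices()
print(device, providers)
```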
+ """ + self.logger.info("Apple Silicon MPS/CoreML is available in Torch, setting Torch device to MPS") + self.torch_device_mps = torch.device("mps") - if "CoreMLExecutionProvider" in ort_providers: - self.logger.info("ONNXruntime has CoreMLExecutionProvider available, enabling acceleration") - self.onnx_execution_provider = ["CoreMLExecutionProvider"] - hardware_acceleration_enabled = True - else: - self.logger.warning("CoreMLExecutionProvider not available in ONNXruntime, so acceleration will NOT be enabled") - self.logger.warning("If you expect MPS/CoreML to work with your Mac, try pip install --force-reinstall onnxruntime-silicon") - else: - self.logger.debug("Apple Silicon MPS/CoreML not available in Torch installation. If you expect this to work, please see README") + self.torch_device = self.torch_device_mps - if not hardware_acceleration_enabled: - self.logger.info("No hardware acceleration could be configured, running in CPU mode") - self.device = torch.device("cpu") - self.onnx_execution_provider = ["CPUExecutionProvider"] + if "CoreMLExecutionProvider" in ort_providers: + self.logger.info("ONNXruntime has CoreMLExecutionProvider available, enabling acceleration") + self.onnx_execution_provider = ["CoreMLExecutionProvider"] + else: + self.logger.warning("CoreMLExecutionProvider not available in ONNXruntime, so acceleration will NOT be enabled") def get_package_distribution(self, package_name): + """ + This method returns the package distribution for a given package name if installed, or None otherwise. + """ try: return metadata.distribution(package_name) except metadata.PackageNotFoundError: @@ -208,15 +247,30 @@ def get_package_distribution(self, package_name): return None def get_model_hash(self, model_path): + """ + This method returns the MD5 hash of a given model file. + """ + + self.logger.debug(f"Attempting to calculate hash of model file {model_path}") try: + # Open the model file in binary read mode with open(model_path, "rb") as f: + # Move the file pointer 10MB before the end of the file f.seek(-10000 * 1024, 2) + # Read the file from the current pointer to the end and calculate its MD5 hash return hashlib.md5(f.read()).hexdigest() - except: + except IOError as e: + # If an IOError occurs (e.g., if the file is smaller than 10MB), log the error + self.logger.error(f"IOError seeking -10MB or reading model file for hash calculation: {e}") + # Attempt to open the file again, read its entire content, and calculate the MD5 hash return hashlib.md5(open(model_path, "rb").read()).hexdigest() def download_file(self, url, output_path): - response = requests.get(url, stream=True) + """ + This method downloads a file from a given URL to a given output path. + """ + self.logger.debug(f"Downloading file from {url} to {output_path} with timeout 300s") + response = requests.get(url, stream=True, timeout=300) if response.status_code == 200: with open(output_path, "wb") as f: @@ -225,191 +279,269 @@ def download_file(self, url, output_path): else: self.logger.error(f"Failed to download file from {url}") - def final_process(self, stem_path, source, stem_name, sample_rate): - self.logger.debug(f"Finalizing {stem_name} stem processing and writing audio...") - self.write_audio(stem_path, source, sample_rate, stem_name=stem_name) - - return {stem_name: source} - def clear_gpu_cache(self): + """ + This method clears the GPU cache to free up memory.
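get_model_hash above identifies a model by MD5-hashing only its last 10000 * 1024 bytes (roughly 10 MB), falling back to hashing the whole file when it is smaller than that; the resulting hash is the key used in the UVR model data files. An equivalent standalone sketch (the example path is hypothetical):

```python
import hashlib
import os

def model_file_hash(path: str, tail_bytes: int = 10000 * 1024) -> str:
    """MD5 of the last `tail_bytes` of a file; hash the whole file if it is smaller."""
    with open(path, "rb") as f:
        try:
            f.seek(-tail_bytes, os.SEEK_END)  # jump ~10MB back from the end of the file
        except OSError:                       # file is shorter than tail_bytes
            f.seek(0)
        return hashlib.md5(f.read()).hexdigest()

# Usage (hypothetical filename): model_file_hash("UVR-MDX-NET-Inst_HQ_3.onnx")
```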
+ """ self.logger.debug("Running garbage collection...") gc.collect() - if self.device == torch.device("mps"): + if self.torch_device == torch.device("mps"): self.logger.debug("Clearing MPS cache...") torch.mps.empty_cache() - if self.device == torch.device("cuda"): + if self.torch_device == torch.device("cuda"): self.logger.debug("Clearing CUDA cache...") torch.cuda.empty_cache() - def load_model(self, model_name="UVR-MDX-NET-Inst_HQ_3"): - self.logger.info(f"Loading model {model_name}...") - - self.load_model_start_time = time.perf_counter() - - self.model_name = model_name - self.model_url = f"https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/{self.model_name}.onnx" - self.model_data_url = "https://raw.githubusercontent.com/TRvlvr/application_data/main/mdx_model_data/model_data.json" + def list_supported_model_files(self): + """ + This method lists the supported model files for audio-separator, by fetching the same file UVR uses to list these. + """ + download_checks_path = os.path.join(self.model_file_dir, "download_checks.json") + + if not os.path.isfile(download_checks_path): + self.download_file("https://raw.githubusercontent.com/TRvlvr/application_data/main/filelists/download_checks.json", download_checks_path) + + model_downloads_list = json.load(open(download_checks_path, encoding="utf-8")) + self.logger.debug(f"Model download list loaded: {model_downloads_list}") + + # model_downloads_list JSON structure / example snippet: + # { + # "vr_download_list": { + # "VR Arch Single Model v5: 1_HP-UVR": "1_HP-UVR.pth", + # "VR Arch Single Model v5: UVR-DeNoise by FoxJoy": "UVR-DeNoise.pth", + # }, + # "mdx_download_list": { + # "MDX-Net Model: UVR-MDX-NET Inst HQ 3": "UVR-MDX-NET-Inst_HQ_3.onnx", + # "MDX-Net Model: UVR-MDX-NET Karaoke 2": "UVR_MDXNET_KARA_2.onnx", + # "MDX-Net Model: Kim Vocal 2": "Kim_Vocal_2.onnx", + # "MDX-Net Model: kuielab_b_drums": "kuielab_b_drums.onnx" + # }, + # "demucs_download_list": { + # "Demucs v4: htdemucs_ft": { + # "f7e0c4bc-ba3fe64a.th": "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/f7e0c4bc-ba3fe64a.th", + # "d12395a8-e57c48e6.th": "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/d12395a8-e57c48e6.th", + # "92cfc3b6-ef3bcb9c.th": "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/92cfc3b6-ef3bcb9c.th", + # "04573f0d-f3cf25b2.th": "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/04573f0d-f3cf25b2.th", + # "htdemucs_ft.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/htdemucs_ft.yaml" + # }, + # "Demucs v4: htdemucs": { + # "955717e8-8726e21a.th": "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/955717e8-8726e21a.th", + # "htdemucs.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/htdemucs.yaml" + # }, + # "Demucs v1: tasnet": { + # "tasnet.th": "https://dl.fbaipublicfiles.com/demucs/v2.0/tasnet.th" + # }, + # }, + # "mdx23_download_list": { + # "MDX23C Model: MDX23C_D1581": { + # "MDX23C_D1581.ckpt": "model_2_stem_061321.yaml" + # } + # }, + # "mdx23c_download_list": { + # "MDX23C Model: MDX23C-InstVoc HQ": { + # "MDX23C-8KFFT-InstVoc_HQ.ckpt": "model_2_stem_full_band_8k.yaml" + # } + # } + # } + + # Return object with list of model names, which are the keys in vr_download_list, mdx_download_list, demucs_download_list, mdx23_download_list, mdx23c_download_list, grouped by type: VR, MDX, Demucs, MDX23, MDX23C + model_files_grouped = { + "VR": model_downloads_list["vr_download_list"], + "MDX": 
model_downloads_list["mdx_download_list"], + # "Demucs": list(model_downloads_list["demucs_download_list"].keys()), + # "MDX23": list(model_downloads_list["mdx23_download_list"].keys()), + # "MDX23C": list(model_downloads_list["mdx23c_download_list"].keys()) + } + return model_files_grouped + + def load_model(self, model_filename="2_HP-UVR.pth"): + """ + This method loads the separation model into memory, downloading it first if necessary. + """ + self.logger.info(f"Loading model {model_filename}...") + + load_model_start_time = time.perf_counter() + + # Model data and configuration sources from UVR + model_repo_url_prefix = "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models" + model_data_url_prefix = "https://raw.githubusercontent.com/TRvlvr/application_data/main" + vr_model_data_url = f"{model_data_url_prefix}/vr_model_data/model_data_new.json" + mdx_model_data_url = f"{model_data_url_prefix}/mdx_model_data/model_data_new.json" # Setting up the model path - model_path = os.path.join(self.model_file_dir, f"{self.model_name}.onnx") + model_name = model_filename.split(".")[0] + model_path = os.path.join(self.model_file_dir, f"{model_filename}") self.logger.debug(f"Model path set to {model_path}") # Check if model file exists, if not, download it if not os.path.isfile(model_path): self.logger.debug(f"Model not found at path {model_path}, downloading...") - self.download_file(self.model_url, model_path) + self.download_file(f"{model_repo_url_prefix}/{model_filename}", model_path) - # Reading model settings from the downloaded model - self.logger.debug("Reading model settings...") + # Calculating hash for the downloaded model + self.logger.debug("Calculating MD5 hash for model file to identify model parameters from UVR data...") model_hash = self.get_model_hash(model_path) self.logger.debug(f"Model {model_path} has hash {model_hash}") # Setting up the path for model data and checking its existence - model_data_path = os.path.join(self.model_file_dir, "model_data.json") - self.logger.debug(f"Model data path set to {model_data_path}") - if not os.path.isfile(model_data_path): - self.logger.debug(f"Model data not found at path {model_data_path}, downloading...") - self.download_file(self.model_data_url, model_data_path) + vr_model_data_path = os.path.join(self.model_file_dir, "vr_model_data.json") + self.logger.debug(f"VR model data path set to {vr_model_data_path}") + if not os.path.isfile(vr_model_data_path): + self.logger.debug(f"VR model data not found at path {vr_model_data_path}, downloading...") + self.download_file(vr_model_data_url, vr_model_data_path) + + mdx_model_data_path = os.path.join(self.model_file_dir, "mdx_model_data.json") + self.logger.debug(f"MDX model data path set to {mdx_model_data_path}") + if not os.path.isfile(mdx_model_data_path): + self.logger.debug(f"MDX model data not found at path {mdx_model_data_path}, downloading...") + self.download_file(mdx_model_data_url, mdx_model_data_path) # Loading model data - self.logger.debug("Loading model data...") - model_data_object = json.load(open(model_data_path)) - model_data = model_data_object[model_hash] + self.logger.debug("Loading MDX and VR model parameters from UVR model data files...") + vr_model_data_object = json.load(open(vr_model_data_path, encoding="utf-8")) + mdx_model_data_object = json.load(open(mdx_model_data_path, encoding="utf-8")) + + # vr_model_data_object JSON structure / example snippet: + # { + # "0d0e6d143046b0eecc41a22e60224582": { + # "vr_model_param": "3band_44100_mid", + # 
"primary_stem": "Instrumental" + # }, + # "6b5916069a49be3fe29d4397ecfd73fa": { + # "vr_model_param": "3band_44100_msb2", + # "primary_stem": "Instrumental", + # "is_karaoke": true + # }, + # "0ec76fd9e65f81d8b4fbd13af4826ed8": { + # "vr_model_param": "4band_v3", + # "primary_stem": "No Woodwinds" + # }, + # "0fb9249ffe4ffc38d7b16243f394c0ff": { + # "vr_model_param": "4band_v3", + # "primary_stem": "No Reverb" + # }, + # "6857b2972e1754913aad0c9a1678c753": { + # "vr_model_param": "4band_v3", + # "primary_stem": "No Echo", + # "nout": 48, + # "nout_lstm": 128 + # }, + # "944950a9c5963a5eb70b445d67b7068a": { + # "vr_model_param": "4band_v3_sn", + # "primary_stem": "Vocals", + # "nout": 64, + # "nout_lstm": 128, + # "is_karaoke": false, + # "is_bv_model": true, + # "is_bv_model_rebalanced": 0.9 + # } + # } + + # mdx_model_data_object JSON structure / example snippet: + # { + # "0ddfc0eb5792638ad5dc27850236c246": { + # "compensate": 1.035, + # "mdx_dim_f_set": 2048, + # "mdx_dim_t_set": 8, + # "mdx_n_fft_scale_set": 6144, + # "primary_stem": "Vocals" + # }, + # "26d308f91f3423a67dc69a6d12a8793d": { + # "compensate": 1.035, + # "mdx_dim_f_set": 2048, + # "mdx_dim_t_set": 9, + # "mdx_n_fft_scale_set": 8192, + # "primary_stem": "Other" + # }, + # "2cdd429caac38f0194b133884160f2c6": { + # "compensate": 1.045, + # "mdx_dim_f_set": 3072, + # "mdx_dim_t_set": 8, + # "mdx_n_fft_scale_set": 7680, + # "primary_stem": "Instrumental" + # }, + # "2f5501189a2f6db6349916fabe8c90de": { + # "compensate": 1.035, + # "mdx_dim_f_set": 2048, + # "mdx_dim_t_set": 8, + # "mdx_n_fft_scale_set": 6144, + # "primary_stem": "Vocals", + # "is_karaoke": true + # }, + # "2154254ee89b2945b97a7efed6e88820": { + # "config_yaml": "model_2_stem_061321.yaml" + # }, + # "116f6f9dabb907b53d847ed9f7a9475f": { + # "config_yaml": "model_2_stem_full_band_8k.yaml" + # } + # } + + if model_hash in mdx_model_data_object: + model_data = mdx_model_data_object[model_hash] + model_type = "MDX" + elif model_hash in vr_model_data_object: + model_data = vr_model_data_object[model_hash] + model_type = "VR" + else: + raise ValueError(f"Unsupported Model File: parameters for MD5 hash {model_hash} could not be found in the UVR model data file.") + self.logger.debug(f"Model data loaded: {model_data}") - # Initializing model parameters - self.compensate, self.dim_f, self.dim_t, self.n_fft, self.model_primary_stem = ( - model_data["compensate"], - model_data["mdx_dim_f_set"], - 2 ** model_data["mdx_dim_t_set"], - model_data["mdx_n_fft_scale_set"], - model_data["primary_stem"], - ) - self.model_secondary_stem = "Vocals" if self.model_primary_stem == "Instrumental" else "Instrumental" - - # In UVR, these variables are set but either aren't useful or are better handled in audio-separator. - # Leaving these comments explaining to help myself or future developers understand why these aren't in audio-separator. - - # "chunks" is not actually used for anything in UVR... - # self.chunks = 0 - - # "adjust" is hard-coded to 1 in UVR, and only used as a multiplier in run_model, so it does nothing. - # self.adjust = 1 - - # "hop" is hard-coded to 1024 in UVR. We have a "hop_length" parameter instead - # self.hop = 1024 - - # "margin" maps to sample rate and is set from the GUI in UVR (default: 44100). We have a "sample_rate" parameter instead. - # self.margin = 44100 - - # "dim_c" is hard-coded to 4 in UVR, seems to be a parameter for the number of channels, and is only used for checkpoint models. 
- # We haven't implemented support for the checkpoint models here, so we're not using it. - # self.dim_c = 4 - - self.logger.debug(f"Model params: primary_stem={self.model_primary_stem}, secondary_stem={self.model_secondary_stem}") - self.logger.debug( - f"Model params: batch_size={self.batch_size}, compensate={self.compensate}, segment_size={self.segment_size}, dim_f={self.dim_f}, dim_t={self.dim_t}" - ) - self.logger.debug(f"Model params: n_fft={self.n_fft}, hop={self.hop_length}") - - # Loading the model for inference - self.logger.debug("Loading ONNX model for inference...") - if self.segment_size == self.dim_t: - ort_ = ort.InferenceSession(model_path, providers=self.onnx_execution_provider) - self.model_run = lambda spek: ort_.run(None, {"input": spek.cpu().numpy()})[0] - self.logger.debug("Model loaded successfully using ONNXruntime inferencing session.") + common_params = { + "logger": self.logger, + "torch_device": self.torch_device, + "torch_device_cpu": self.torch_device_cpu, + "torch_device_mps": self.torch_device_mps, + "onnx_execution_provider": self.onnx_execution_provider, + "model_name": model_name, + "model_path": model_path, + "model_data": model_data, + "primary_stem_output_path": self.primary_stem_output_path, + "secondary_stem_output_path": self.secondary_stem_output_path, + "output_format": self.output_format, + "output_dir": self.output_dir, + "normalization_threshold": self.normalization_threshold, + "enable_denoise": self.enable_denoise, + "output_single_stem": self.output_single_stem, + "invert_using_spec": self.invert_using_spec, + "sample_rate": self.sample_rate, + } + + if model_type == "MDX": + self.model_instance = MDXSeparator(common_config=common_params, arch_config=self.arch_specific_params["MDX"]) + elif model_type == "VR": + self.model_instance = VRSeparator(common_config=common_params, arch_config=self.arch_specific_params["VR"]) else: - self.model_run = convert(model_path) - self.model_run.to(self.device).eval() - self.logger.warning("Model converted from onnx to pytorch due to segment size not matching dim_t, processing may be slower.") + raise ValueError(f"Unsupported model type: {model_type}") - # Log the completion of the separation process + # Log the completion of the model load process self.logger.debug("Loading model completed.") - self.logger.info( - f'Load model duration: {time.strftime("%H:%M:%S", time.gmtime(int(time.perf_counter() - self.load_model_start_time)))}' - ) + self.logger.info(f'Load model duration: {time.strftime("%H:%M:%S", time.gmtime(int(time.perf_counter() - load_model_start_time)))}') def separate(self, audio_file_path): + """ + Separates the audio file into different stems (e.g., vocals, instruments) using the loaded model. + + This method takes the path to an audio file, processes it through the loaded separation model, and returns + the paths to the output files containing the separated audio stems. It handles the entire flow from loading + the audio, running the separation, clearing up resources, and logging the process. + + Parameters: + - audio_file_path (str): The path to the audio file to be separated. + + Returns: + - output_files (list of str): A list containing the paths to the separated audio stem files. 
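load_model above resolves the architecture by looking the file hash up first in the MDX model data and then in the VR model data, then dispatches to MDXSeparator or VRSeparator. A standalone sketch of that lookup, assuming the two JSON files have already been downloaded under the names used in the diff; the example hash and values are taken from the snippets shown above:

```python
import json

def resolve_model_params(model_hash: str, mdx_json_path: str, vr_json_path: str):
    """Return (model_type, params) for a model hash, mirroring the MDX-then-VR lookup."""
    with open(mdx_json_path, encoding="utf-8") as f:
        mdx_data = json.load(f)
    with open(vr_json_path, encoding="utf-8") as f:
        vr_data = json.load(f)

    if model_hash in mdx_data:
        return "MDX", mdx_data[model_hash]
    if model_hash in vr_data:
        return "VR", vr_data[model_hash]
    raise ValueError(f"Unsupported model: no parameters found for hash {model_hash}")

# e.g. resolve_model_params("2cdd429caac38f0194b133884160f2c6", "mdx_model_data.json", "vr_model_data.json")
# -> ("MDX", {"compensate": 1.045, "mdx_dim_f_set": 3072, ...})
```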
+ """ # Starting the separation process self.logger.info(f"Starting separation process for audio_file_path: {audio_file_path}") - self.separate_start_time = time.perf_counter() - - self.primary_source = None - self.secondary_source = None + separate_start_time = time.perf_counter() - self.audio_file_path = audio_file_path - self.audio_file_base = os.path.splitext(os.path.basename(audio_file_path))[0] - - # Prepare the mix for processing - self.logger.debug("Preparing mix...") - mix = self.prepare_mix(self.audio_file_path) - - self.logger.debug("Normalizing mix before demixing...") - mix = spec_utils.normalize(self.logger, wave=mix, max_peak=self.normalization_threshold) - - # Start the demixing process - source = self.demix(mix) - - # In UVR, the source is cached here if it's a vocal split model, but we're not supporting that yet - - # Initialize the list for output files - output_files = [] - self.logger.debug("Processing output files...") - - # Normalize and transpose the primary source if it's not already an array - if not isinstance(self.primary_source, np.ndarray): - self.logger.debug("Normalizing primary source...") - self.primary_source = spec_utils.normalize(self.logger, wave=source, max_peak=self.normalization_threshold).T - - # Process the secondary source if not already an array - if not isinstance(self.secondary_source, np.ndarray): - self.logger.debug("Producing secondary source: demixing in match_mix mode") - raw_mix = self.demix(mix, is_match_mix=True) - - if self.invert_using_spec: - self.logger.debug("Inverting secondary stem using spectogram as invert_using_spec is set to True") - self.secondary_source = spec_utils.invert_stem(raw_mix, source) - else: - self.logger.debug("Inverting secondary stem by subtracting of transposed demixed stem from transposed original mix") - self.secondary_source = mix.T - source.T - - # Save and process the secondary stem if needed - if not self.output_single_stem or self.output_single_stem.lower() == self.model_secondary_stem.lower(): - self.logger.info(f"Saving {self.model_secondary_stem} stem...") - if not self.secondary_stem_path: - self.secondary_stem_path = os.path.join( - f"{self.audio_file_base}_({self.model_secondary_stem})_{self.model_name}.{self.output_format.lower()}" - ) - self.secondary_source_map = self.final_process( - self.secondary_stem_path, self.secondary_source, self.model_secondary_stem, self.sample_rate - ) - output_files.append(self.secondary_stem_path) - - # Save and process the primary stem if needed - if not self.output_single_stem or self.output_single_stem.lower() == self.model_primary_stem.lower(): - self.logger.info(f"Saving {self.model_primary_stem} stem...") - if not self.primary_stem_path: - self.primary_stem_path = os.path.join( - f"{self.audio_file_base}_({self.model_primary_stem})_{self.model_name}.{self.output_format.lower()}" - ) - if not isinstance(self.primary_source, np.ndarray): - self.primary_source = source.T - self.primary_source_map = self.final_process( - self.primary_stem_path, self.primary_source, self.model_primary_stem, self.sample_rate - ) - output_files.append(self.primary_stem_path) + # Run separation method for the loaded model + output_files = self.model_instance.separate(audio_file_path) # Clear GPU cache to free up memory self.clear_gpu_cache() - # TODO: In UVR, this is where the vocal split chain gets processed - see process_vocal_split_chain() - - # Log the completion of the separation process - self.logger.debug("Separation process completed.") - self.logger.info( - f'Separation 
duration: {time.strftime("%H:%M:%S", time.gmtime(int(time.perf_counter() - self.separate_start_time)))}' - ) - # Unset the audio file to prevent accidental re-separation of the same file self.logger.debug("Clearing audio file...") self.audio_file_path = None @@ -419,324 +551,11 @@ def separate(self, audio_file_path): self.logger.debug("Clearing sources and stems...") self.primary_source = None self.secondary_source = None - self.primary_stem_path = None - self.secondary_stem_path = None + self.primary_stem_output_path = None + self.secondary_stem_output_path = None - return output_files - - def write_audio(self, stem_path: str, stem_source, sample_rate, stem_name=None): - self.logger.debug(f"Entering write_audio with stem_name: {stem_name} and stem_path: {stem_path}") - - stem_source = spec_utils.normalize(self.logger, wave=stem_source, max_peak=self.normalization_threshold) - - # Check if the numpy array is empty or contains very low values - if np.max(np.abs(stem_source)) < 1e-6: - self.logger.warning("Warning: stem_source array is near-silent or empty.") - return - - # If output_dir is specified, create it and join it with stem_path - if self.output_dir: - os.makedirs(self.output_dir, exist_ok=True) - stem_path = os.path.join(self.output_dir, stem_path) - - self.logger.debug(f"Audio data shape before processing: {stem_source.shape}") - self.logger.debug(f"Data type before conversion: {stem_source.dtype}") - - # Ensure the audio data is in the correct format (e.g., int16) - if stem_source.dtype != np.int16: - stem_source = (stem_source * 32767).astype(np.int16) - self.logger.debug("Converted stem_source to int16.") - - # Correctly interleave stereo channels - stem_source_interleaved = np.empty((2 * stem_source.shape[0],), dtype=np.int16) - stem_source_interleaved[0::2] = stem_source[:, 0] # Left channel - stem_source_interleaved[1::2] = stem_source[:, 1] # Right channel - - self.logger.debug(f"Interleaved audio data shape: {stem_source_interleaved.shape}") + # Log the completion of the separation process + self.logger.debug("Separation process completed.") + self.logger.info(f'Separation duration: {time.strftime("%H:%M:%S", time.gmtime(int(time.perf_counter() - separate_start_time)))}') - # Create a pydub AudioSegment - try: - audio_segment = AudioSegment( - stem_source_interleaved.tobytes(), frame_rate=self.sample_rate, sample_width=stem_source.dtype.itemsize, channels=2 - ) - self.logger.debug("Created AudioSegment successfully.") - except Exception as e: - self.logger.error(f"Error creating AudioSegment: {e}") - return - - # Determine file format based on the file extension - file_format = stem_path.lower().split(".")[-1] - - # For m4a files, specify mp4 as the container format as the extension doesn't match the format name - if file_format == "m4a": - file_format = "mp4" - elif file_format == "mka": - file_format = "matroska" - - # Export using the determined format - try: - audio_segment.export(stem_path, format=file_format) - self.logger.debug(f"Exported audio file successfully to {stem_path}") - except Exception as e: - self.logger.error(f"Error exporting audio file: {e}") - - # This function sets up the necessary parameters for the model, like the number of frequency bins (n_bins), the trimming size (trim), - # the size of each audio chunk (chunk_size), and the window function for spectral transformations (window). - # It ensures that the model is configured with the correct settings for processing the audio data. 
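The comment above spells out how the initialize_model_settings logic (shown next) derives its chunking geometry from n_fft, hop_length and segment_size. Plugging in a typical MDX parameter set from the model data earlier in this diff (n_fft=6144, hop_length=1024, and segment_size equal to dim_t = 2**8 = 256, the ONNX fast path), the values work out as below; a quick check using the same formulas:

```python
n_fft, hop_length, segment_size = 6144, 1024, 256  # values taken from the MDX examples above

n_bins = n_fft // 2 + 1                       # 3073 frequency bins
trim = n_fft // 2                             # 3072 samples trimmed from each end
chunk_size = hop_length * (segment_size - 1)  # 261120 samples per chunk
gen_size = chunk_size - 2 * trim              # 254976 usable samples per chunk

print(n_bins, trim, chunk_size, gen_size)
```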
- def initialize_model_settings(self): - self.logger.debug("Initializing model settings...") - - # n_bins is half the FFT size plus one (self.n_fft // 2 + 1). - self.n_bins = self.n_fft // 2 + 1 - - # trim is half the FFT size (self.n_fft // 2). - self.trim = self.n_fft // 2 - - # chunk_size is the hop_length size times the segment size minus one - self.chunk_size = self.hop_length * (self.segment_size - 1) - - # gen_size is the chunk size minus twice the trim size - self.gen_size = self.chunk_size - 2 * self.trim - - self.stft = STFT(self.logger, self.n_fft, self.hop_length, self.dim_f, self.device) - - self.logger.debug(f"Model input params: n_fft={self.n_fft} hop_length={self.hop_length} dim_f={self.dim_f}") - self.logger.debug(f"Model settings: n_bins={self.n_bins}, trim={self.trim}, chunk_size={self.chunk_size}, gen_size={self.gen_size}") - - # After prepare_mix segments the audio, initialize_mix further processes each segment. - # It ensures each audio segment is in the correct format for the model, applies necessary padding, - # and converts the segments into tensors for processing with the model. - # This step is essential for preparing the audio data in a format that the neural network can process. - def initialize_mix(self, mix, is_ckpt=False): - # Log the initialization of the mix and whether checkpoint mode is used - self.logger.debug(f"Initializing mix with is_ckpt={is_ckpt}. Initial mix shape: {mix.shape}") - - # Ensure the mix is a 2-channel (stereo) audio signal - if mix.shape[0] != 2: - error_message = f"Expected a 2-channel audio signal, but got {mix.shape[0]} channels" - self.logger.error(error_message) - raise ValueError(error_message) - - # If in checkpoint mode, process the mix differently - if is_ckpt: - self.logger.debug("Processing in checkpoint mode...") - # Calculate padding based on the generation size and trim - pad = self.gen_size + self.trim - (mix.shape[-1] % self.gen_size) - self.logger.debug(f"Padding calculated: {pad}") - # Add padding at the beginning and the end of the mix - mixture = np.concatenate((np.zeros((2, self.trim), dtype="float32"), mix, np.zeros((2, pad), dtype="float32")), 1) - # Determine the number of chunks based on the mixture's length - num_chunks = mixture.shape[-1] // self.gen_size - self.logger.debug(f"Mixture shape after padding: {mixture.shape}, Number of chunks: {num_chunks}") - # Split the mixture into chunks - mix_waves = [mixture[:, i * self.gen_size : i * self.gen_size + self.chunk_size] for i in range(num_chunks)] - else: - # If not in checkpoint mode, process normally - self.logger.debug("Processing in non-checkpoint mode...") - mix_waves = [] - n_sample = mix.shape[1] - # Calculate necessary padding to make the total length divisible by the generation size - pad = self.gen_size - n_sample % self.gen_size - self.logger.debug(f"Number of samples: {n_sample}, Padding calculated: {pad}") - # Apply padding to the mix - mix_p = np.concatenate((np.zeros((2, self.trim)), mix, np.zeros((2, pad)), np.zeros((2, self.trim))), 1) - self.logger.debug(f"Shape of mix after padding: {mix_p.shape}") - - # Process the mix in chunks - i = 0 - while i < n_sample + pad: - waves = np.array(mix_p[:, i : i + self.chunk_size]) - mix_waves.append(waves) - self.logger.debug(f"Processed chunk {len(mix_waves)}: Start {i}, End {i + self.chunk_size}") - i += self.gen_size - - # Convert the list of wave chunks into a tensor for processing on the specified device - mix_waves_tensor = torch.tensor(mix_waves, dtype=torch.float32).to(self.device) - 
self.logger.debug(f"Converted mix_waves to tensor. Tensor shape: {mix_waves_tensor.shape}") - - return mix_waves_tensor, pad - - def demix(self, mix, is_match_mix=False): - self.logger.debug(f"Starting demixing process with is_match_mix: {is_match_mix}...") - self.initialize_model_settings() - - # Preserves the original mix for later use. - # In UVR, this is used for the pitch fix and VR denoise processes, which aren't yet implemented here. - org_mix = mix - self.logger.debug(f"Original mix stored. Shape: {org_mix.shape}") - - # Initializes a list to store the separated waveforms. - tar_waves_ = [] - - # Handling different chunk sizes and overlaps based on the matching requirement. - if is_match_mix: - # Sets a smaller chunk size specifically for matching the mix. - chunk_size = self.hop_length * (self.segment_size - 1) - # Sets a small overlap for the chunks. - overlap = 0.02 - self.logger.debug(f"Chunk size for matching mix: {chunk_size}, Overlap: {overlap}") - else: - # Uses the regular chunk size defined in model settings. - chunk_size = self.chunk_size - # Uses the overlap specified in the model settings. - overlap = self.overlap - self.logger.debug(f"Standard chunk size: {chunk_size}, Overlap: {overlap}") - - # Calculates the generated size after subtracting the trim from both ends of the chunk. - gen_size = chunk_size - 2 * self.trim - self.logger.debug(f"Generated size calculated: {gen_size}") - - # Calculates padding to make the mix length a multiple of the generated size. - pad = gen_size + self.trim - ((mix.shape[-1]) % gen_size) - # Prepares the mixture with padding at the beginning and the end. - mixture = np.concatenate((np.zeros((2, self.trim), dtype="float32"), mix, np.zeros((2, pad), dtype="float32")), 1) - self.logger.debug(f"Mixture prepared with padding. Mixture shape: {mixture.shape}") - - # Calculates the step size for processing chunks based on the overlap. - step = int((1 - overlap) * chunk_size) - self.logger.debug(f"Step size for processing chunks: {step} as overlap is set to {overlap}.") - - # Initializes arrays to store the results and to account for overlap. - result = np.zeros((1, 2, mixture.shape[-1]), dtype=np.float32) - divider = np.zeros((1, 2, mixture.shape[-1]), dtype=np.float32) - - # Initializes counters for processing chunks. - total = 0 - total_chunks = (mixture.shape[-1] + step - 1) // step - self.logger.debug(f"Total chunks to process: {total_chunks}") - - # Processes each chunk of the mixture. - for i in tqdm(range(0, mixture.shape[-1], step),desc="Processing chunk"): - total += 1 - start = i - end = min(i + chunk_size, mixture.shape[-1]) - self.logger.debug(f"Processing chunk {total}/{total_chunks}: Start {start}, End {end}") - - # Handles windowing for overlapping chunks. - chunk_size_actual = end - start - window = None - if overlap != 0: - window = np.hanning(chunk_size_actual) - window = np.tile(window[None, None, :], (1, 2, 1)) - self.logger.debug("Window applied to the chunk.") - - # Zero-pad the chunk to prepare it for processing. - mix_part_ = mixture[:, start:end] - if end != i + chunk_size: - pad_size = (i + chunk_size) - end - mix_part_ = np.concatenate((mix_part_, np.zeros((2, pad_size), dtype="float32")), axis=-1) - - # Converts the chunk to a tensor for processing. - mix_part = torch.tensor([mix_part_], dtype=torch.float32).to(self.device) - # Splits the chunk into smaller batches if necessary. - mix_waves = mix_part.split(self.batch_size) - total_batches = len(mix_waves) - self.logger.debug(f"Mix part split into batches. 
Number of batches: {total_batches}") - - with torch.no_grad(): - # Processes each batch in the chunk. - batches_processed = 0 - for mix_wave in mix_waves: - batches_processed += 1 - self.logger.debug(f"Processing mix_wave batch {batches_processed}/{total_batches}") - - # Runs the model to separate the sources. - tar_waves = self.run_model(mix_wave, is_match_mix=is_match_mix) - - # Applies windowing if needed and accumulates the results. - if window is not None: - tar_waves[..., :chunk_size_actual] *= window - divider[..., start:end] += window - else: - divider[..., start:end] += 1 - - result[..., start:end] += tar_waves[..., : end - start] - - # Normalizes the results by the divider to account for overlap. - self.logger.debug("Normalizing result by dividing result by divider.") - tar_waves = result / divider - tar_waves_.append(tar_waves) - - # Reshapes the results to match the original dimensions. - tar_waves_ = np.vstack(tar_waves_)[:, :, self.trim : -self.trim] - tar_waves = np.concatenate(tar_waves_, axis=-1)[:, : mix.shape[-1]] - - # Extracts the source from the results. - source = tar_waves[:, 0:None] - self.logger.debug(f"Concatenated tar_waves. Shape: {tar_waves.shape}") - - # TODO: In UVR, pitch changing happens here. Consider implementing this as a feature. - - # Compensates the source if not matching the mix. - if not is_match_mix: - source * self.compensate - self.logger.debug("Match mix mode; compensate multiplier applied.") - - # TODO: In UVR, VR denoise model gets applied here. Consider implementing this as a feature. - - self.logger.debug("Demixing process completed.") - return source - - def run_model(self, mix, is_match_mix=False): - # Applying the STFT to the mix. The mix is moved to the specified device (e.g., GPU) before processing. - # self.logger.debug(f"Running STFT on the mix. Mix shape before STFT: {mix.shape}") - spek = self.stft(mix.to(self.device)) - self.logger.debug(f"STFT applied on mix. Spectrum shape: {spek.shape}") - - # Zeroing out the first 3 bins of the spectrum. This is often done to reduce low-frequency noise. - spek[:, :, :3, :] *= 0 - # self.logger.debug("First 3 bins of the spectrum zeroed out.") - - # Handling the case where the mix needs to be matched (is_match_mix = True) - if is_match_mix: - # self.logger.debug("Match mix mode is enabled. Converting spectrum to NumPy array.") - spec_pred = spek.cpu().numpy() - self.logger.debug("is_match_mix: spectrum prediction obtained directly from STFT output.") - else: - # If denoising is enabled, the model is run on both the negative and positive spectrums. - if self.denoise_enabled: - spec_pred = -self.model_run(-spek) * 0.5 + self.model_run(spek) * 0.5 - self.logger.debug("Model run on both negative and positive spectrums for denoising.") - else: - spec_pred = self.model_run(spek) - self.logger.debug("Model run on the spectrum without denoising.") - - # Applying the inverse STFT to convert the spectrum back to the time domain. - result = self.stft.inverse(torch.tensor(spec_pred).to(self.device)).cpu().detach().numpy() - self.logger.debug(f"Inverse STFT applied. Returning result with shape: {result.shape}") - - return result - - def prepare_mix(self, mix): - # Store the original path or the mix itself for later checks - audio_path = mix - - # Check if the input is a file path (string) and needs to be loaded - if not isinstance(mix, np.ndarray): - self.logger.debug(f"Loading audio from file: {mix}") - mix, sr = librosa.load(mix, mono=False, sr=self.sample_rate) - self.logger.debug(f"Audio loaded. 
Sample rate: {sr}, Audio shape: {mix.shape}") - else: - # Transpose the mix if it's already an ndarray (expected shape: [channels, samples]) - self.logger.debug("Transposing the provided mix array.") - mix = mix.T - self.logger.debug(f"Transposed mix shape: {mix.shape}") - - # If the original input was a filepath, check if the loaded mix is empty - if isinstance(audio_path, str): - if not np.any(mix): - error_msg = f"Audio file {audio_path} is empty or not valid" - self.logger.error(error_msg) - raise ValueError(error_msg) - else: - self.logger.debug("Audio file is valid and contains data.") - - # Ensure the mix is in stereo format - if mix.ndim == 1: - self.logger.debug("Mix is mono. Converting to stereo.") - mix = np.asfortranarray([mix, mix]) - self.logger.debug("Converted to stereo mix.") - - # Final log indicating successful preparation of the mix - self.logger.debug("Mix preparation completed.") - return mix + return output_files diff --git a/audio_separator/separator/spec_utils.py b/audio_separator/separator/spec_utils.py deleted file mode 100644 index bf1c954..0000000 --- a/audio_separator/separator/spec_utils.py +++ /dev/null @@ -1,687 +0,0 @@ -import librosa -import numpy as np -import soundfile as sf -import math -import random -import math -import platform -import logging - -OPERATING_SYSTEM = platform.system() -SYSTEM_ARCH = platform.platform() -SYSTEM_PROC = platform.processor() -ARM = "arm" - -if OPERATING_SYSTEM == "Darwin": - wav_resolution = "polyphase" if SYSTEM_PROC == ARM or ARM in SYSTEM_ARCH else "sinc_fastest" -else: - wav_resolution = "sinc_fastest" - -MAX_SPEC = "Max Spec" -MIN_SPEC = "Min Spec" -AVERAGE = "Average" - - -def crop_center(h1, h2): - """ - This function crops the center of the first input tensor to match the size of the second input tensor. - It is used to ensure that the two tensors have the same size in the time dimension. - """ - h1_shape = h1.size() - h2_shape = h2.size() - - # If the time dimensions are already equal, return the first tensor as is - if h1_shape[3] == h2_shape[3]: - return h1 - # If the time dimension of the first tensor is smaller, raise an error - elif h1_shape[3] < h2_shape[3]: - raise ValueError("h1_shape[3] must be greater than h2_shape[3]") - - # Calculate the start and end indices for cropping - s_time = (h1_shape[3] - h2_shape[3]) // 2 - e_time = s_time + h2_shape[3] - # Crop the first tensor - h1 = h1[:, :, :, s_time:e_time] - - return h1 - - -def preprocess(X_spec): - """ - This function preprocesses a spectrogram by separating it into magnitude and phase components. - This is a common preprocessing step in audio processing tasks. - """ - X_mag = np.abs(X_spec) - X_phase = np.angle(X_spec) - - return X_mag, X_phase - - -def make_padding(width, cropsize, offset): - """ - This function calculates the padding needed to make the width of an image divisible by the crop size. - It is used in the process of splitting an image into smaller patches. - """ - left = offset - roi_size = cropsize - offset * 2 - if roi_size == 0: - roi_size = cropsize - right = roi_size - (width % roi_size) + left - - return left, right, roi_size - - -def wave_to_spectrogram(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False): - """ - This function converts a stereo audio waveform into a spectrogram. - It supports several options for processing the stereo channels, such as mid-side processing and reversing. 
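In the deleted spec_utils, the mid_side option of wave_to_spectrogram (implemented just below) averages the two channels into a mid signal and takes their difference as the side signal before the STFT; spectrogram_to_wave later undoes it. A minimal NumPy sketch of that round trip:

```python
import numpy as np

def to_mid_side(wave):
    """wave: (2, n_samples) stereo array -> (mid, side), as in the mid_side branch below."""
    mid = (wave[0] + wave[1]) / 2
    side = wave[0] - wave[1]
    return np.asfortranarray(mid), np.asfortranarray(side)

def from_mid_side(mid, side):
    """Inverse transform: recover the left/right pair from mid/side."""
    left = mid + side / 2
    right = mid - side / 2
    return np.asfortranarray([left, right])

stereo = np.random.default_rng(1).standard_normal((2, 1000))
m, s = to_mid_side(stereo)
assert np.allclose(from_mid_side(m, s), stereo)  # lossless round trip
```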
- """ - # Process the stereo channels based on the provided options - if reverse: - wave_left = np.flip(np.asfortranarray(wave[0])) - wave_right = np.flip(np.asfortranarray(wave[1])) - elif mid_side: - wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2) - wave_right = np.asfortranarray(np.subtract(wave[0], wave[1])) - elif mid_side_b2: - wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5)) - wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5)) - else: - wave_left = np.asfortranarray(wave[0]) - wave_right = np.asfortranarray(wave[1]) - - # Compute the spectrogram for each channel - spec_left = librosa.stft(wave_left, n_fft, hop_length=hop_length) - spec_right = librosa.stft(wave_right, n_fft, hop_length=hop_length) - - # Combine the spectrograms into a single array - spec = np.asfortranarray([spec_left, spec_right]) - - return spec - - -def wave_to_spectrogram_mt(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False): - """ - This function is similar to wave_to_spectrogram, but it uses multithreading to compute the spectrograms for the two channels in parallel. - This can provide a speedup on systems with multiple cores. - """ - import threading - - if reverse: - wave_left = np.flip(np.asfortranarray(wave[0])) - wave_right = np.flip(np.asfortranarray(wave[1])) - elif mid_side: - wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2) - wave_right = np.asfortranarray(np.subtract(wave[0], wave[1])) - elif mid_side_b2: - wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5)) - wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5)) - else: - wave_left = np.asfortranarray(wave[0]) - wave_right = np.asfortranarray(wave[1]) - - def run_thread(**kwargs): - global spec_left - spec_left = librosa.stft(**kwargs) - - # Start two threads to compute the spectrograms in parallel - thread = threading.Thread(target=run_thread, kwargs={"y": wave_left, "n_fft": n_fft, "hop_length": hop_length}) - thread.start() - spec_right = librosa.stft(wave_right, n_fft, hop_length=hop_length) - thread.join() - - # Combine the spectrograms into a single array - spec = np.asfortranarray([spec_left, spec_right]) - - return spec - - -def normalize(logger: logging.Logger, wave, max_peak=1.0): - """Normalize audio waveform to a specified peak value. - - Args: - logger (logging.Logger): Logger for debugging information. - wave (array-like): Audio waveform. - max_peak (float): Maximum peak value for normalization. - - Returns: - array-like: Normalized or original waveform. - """ - maxv = np.abs(wave).max() - if maxv > max_peak: - logger.debug(f"Maximum peak amplitude above clipping threshold, normalizing from {maxv} to max peak {max_peak}.") - wave *= max_peak / maxv - else: - logger.debug(f"Maximum peak amplitude not above clipping threshold, no need to normalize: {maxv}") - - return wave - - -def normalize_two_stem(logger: logging.Logger, wave, mix, is_normalize=False): - """Save output music files""" - - maxv = np.abs(wave).max() - max_mix = np.abs(mix).max() - - if maxv > 1.0: - logger.debug(f"Normalization Set {is_normalize}: Primary source above threshold for clipping. Max:{maxv}") - logger.debug(f"Normalization Set {is_normalize}: Mixture above threshold for clipping. Max:{max_mix}") - if is_normalize: - logger.debug(f"The result was normalized.") - wave /= maxv - mix /= maxv - else: - logger.debug(f"The result was not normalized.") - else: - logger.debug(f"Normalization Set {is_normalize}: Input not above threshold for clipping. 
Max:{maxv}") - - logger.debug(f"Normalization Set {is_normalize}: Primary source - Max:{np.abs(wave).max()}") - logger.debug(f"Normalization Set {is_normalize}: Mixture - Max:{np.abs(mix).max()}") - - return wave, mix - - -def combine_spectrograms(specs, mp): - l = min([specs[i].shape[2] for i in specs]) - spec_c = np.zeros(shape=(2, mp.param["bins"] + 1, l), dtype=np.complex64) - offset = 0 - bands_n = len(mp.param["band"]) - - for d in range(1, bands_n + 1): - h = mp.param["band"][d]["crop_stop"] - mp.param["band"][d]["crop_start"] - spec_c[:, offset : offset + h, :l] = specs[d][:, mp.param["band"][d]["crop_start"] : mp.param["band"][d]["crop_stop"], :l] - offset += h - - if offset > mp.param["bins"]: - raise ValueError("Too much bins") - - # lowpass fiter - if mp.param["pre_filter_start"] > 0: # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']: - if bands_n == 1: - spec_c = fft_lp_filter(spec_c, mp.param["pre_filter_start"], mp.param["pre_filter_stop"]) - else: - gp = 1 - for b in range(mp.param["pre_filter_start"] + 1, mp.param["pre_filter_stop"]): - g = math.pow(10, -(b - mp.param["pre_filter_start"]) * (3.5 - gp) / 20.0) - gp = g - spec_c[:, b, :] *= g - - return np.asfortranarray(spec_c) - - -def spectrogram_to_image(spec, mode="magnitude"): - if mode == "magnitude": - if np.iscomplexobj(spec): - y = np.abs(spec) - else: - y = spec - y = np.log10(y**2 + 1e-8) - elif mode == "phase": - if np.iscomplexobj(spec): - y = np.angle(spec) - else: - y = spec - - y -= y.min() - y *= 255 / y.max() - img = np.uint8(y) - - if y.ndim == 3: - img = img.transpose(1, 2, 0) - img = np.concatenate([np.max(img, axis=2, keepdims=True), img], axis=2) - - return img - - -def reduce_vocal_aggressively(X, y, softmask): - v = X - y - y_mag_tmp = np.abs(y) - v_mag_tmp = np.abs(v) - - v_mask = v_mag_tmp > y_mag_tmp - y_mag = np.clip(y_mag_tmp - v_mag_tmp * v_mask * softmask, 0, np.inf) - - return y_mag * np.exp(1.0j * np.angle(y)) - - -def align_wave_head_and_tail(a, b): - l = min([a[0].size, b[0].size]) - - return a[:l, :l], b[:l, :l] - - -def spectrogram_to_wave(spec, hop_length, mid_side, mid_side_b2, reverse, clamp=False): - spec_left = np.asfortranarray(spec[0]) - spec_right = np.asfortranarray(spec[1]) - - wave_left = librosa.istft(spec_left, hop_length=hop_length) - wave_right = librosa.istft(spec_right, hop_length=hop_length) - - if reverse: - return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)]) - elif mid_side: - return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]) - elif mid_side_b2: - return np.asfortranarray([np.add(wave_right / 1.25, 0.4 * wave_left), np.subtract(wave_left / 1.25, 0.4 * wave_right)]) - else: - return np.asfortranarray([wave_left, wave_right]) - - -def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2): - import threading - - spec_left = np.asfortranarray(spec[0]) - spec_right = np.asfortranarray(spec[1]) - - def run_thread(**kwargs): - global wave_left - wave_left = librosa.istft(**kwargs) - - thread = threading.Thread(target=run_thread, kwargs={"stft_matrix": spec_left, "hop_length": hop_length}) - thread.start() - wave_right = librosa.istft(spec_right, hop_length=hop_length) - thread.join() - - if reverse: - return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)]) - elif mid_side: - return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]) - elif mid_side_b2: - return np.asfortranarray([np.add(wave_right / 1.25, 0.4 * 
wave_left), np.subtract(wave_left / 1.25, 0.4 * wave_right)]) - else: - return np.asfortranarray([wave_left, wave_right]) - - -def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None): - bands_n = len(mp.param["band"]) - offset = 0 - - for d in range(1, bands_n + 1): - bp = mp.param["band"][d] - spec_s = np.ndarray(shape=(2, bp["n_fft"] // 2 + 1, spec_m.shape[2]), dtype=complex) - h = bp["crop_stop"] - bp["crop_start"] - spec_s[:, bp["crop_start"] : bp["crop_stop"], :] = spec_m[:, offset : offset + h, :] - - offset += h - if d == bands_n: # higher - if extra_bins_h: # if --high_end_process bypass - max_bin = bp["n_fft"] // 2 - spec_s[:, max_bin - extra_bins_h : max_bin, :] = extra_bins[:, :extra_bins_h, :] - if bp["hpf_start"] > 0: - spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1) - if bands_n == 1: - wave = spectrogram_to_wave(spec_s, bp["hl"], mp.param["mid_side"], mp.param["mid_side_b2"], mp.param["reverse"]) - else: - wave = np.add( - wave, spectrogram_to_wave(spec_s, bp["hl"], mp.param["mid_side"], mp.param["mid_side_b2"], mp.param["reverse"]) - ) - else: - sr = mp.param["band"][d + 1]["sr"] - if d == 1: # lower - spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"]) - wave = librosa.resample( - spectrogram_to_wave(spec_s, bp["hl"], mp.param["mid_side"], mp.param["mid_side_b2"], mp.param["reverse"]), - bp["sr"], - sr, - res_type=wav_resolution, - ) - else: # mid - spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1) - spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"]) - wave2 = np.add( - wave, spectrogram_to_wave(spec_s, bp["hl"], mp.param["mid_side"], mp.param["mid_side_b2"], mp.param["reverse"]) - ) - wave = librosa.resample(wave2, bp["sr"], sr, res_type=wav_resolution) - - return wave - - -def fft_lp_filter(spec, bin_start, bin_stop): - g = 1.0 - for b in range(bin_start, bin_stop): - g -= 1 / (bin_stop - bin_start) - spec[:, b, :] = g * spec[:, b, :] - - spec[:, bin_stop:, :] *= 0 - - return spec - - -def fft_hp_filter(spec, bin_start, bin_stop): - g = 1.0 - for b in range(bin_start, bin_stop, -1): - g -= 1 / (bin_start - bin_stop) - spec[:, b, :] = g * spec[:, b, :] - - spec[:, 0 : bin_stop + 1, :] *= 0 - - return spec - - -def mirroring(a, spec_m, input_high_end, mp): - if "mirroring" == a: - mirror = np.flip( - np.abs(spec_m[:, mp.param["pre_filter_start"] - 10 - input_high_end.shape[1] : mp.param["pre_filter_start"] - 10, :]), 1 - ) - mirror = mirror * np.exp(1.0j * np.angle(input_high_end)) - - return np.where(np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror) - - if "mirroring2" == a: - mirror = np.flip( - np.abs(spec_m[:, mp.param["pre_filter_start"] - 10 - input_high_end.shape[1] : mp.param["pre_filter_start"] - 10, :]), 1 - ) - mi = np.multiply(mirror, input_high_end * 1.7) - - return np.where(np.abs(input_high_end) <= np.abs(mi), input_high_end, mi) - - -def adjust_aggr(mask, is_non_accom_stem, aggressiveness): - aggr = aggressiveness["value"] - - if aggr != 0: - if is_non_accom_stem: - aggr = 1 - aggr - - aggr = [aggr, aggr] - - if aggressiveness["aggr_correction"] is not None: - aggr[0] += aggressiveness["aggr_correction"]["left"] - aggr[1] += aggressiveness["aggr_correction"]["right"] - - for ch in range(2): - mask[ch, : aggressiveness["split_bin"]] = np.power(mask[ch, : aggressiveness["split_bin"]], 1 + aggr[ch] / 3) - mask[ch, aggressiveness["split_bin"] :] = np.power(mask[ch, aggressiveness["split_bin"] :], 1 + aggr[ch]) - - # if is_non_accom_stem: - # mask = (1.0 - mask) - 
- return mask - - -def stft(wave, nfft, hl): - wave_left = np.asfortranarray(wave[0]) - wave_right = np.asfortranarray(wave[1]) - spec_left = librosa.stft(wave_left, nfft, hop_length=hl) - spec_right = librosa.stft(wave_right, nfft, hop_length=hl) - spec = np.asfortranarray([spec_left, spec_right]) - - return spec - - -def istft(spec, hl): - spec_left = np.asfortranarray(spec[0]) - spec_right = np.asfortranarray(spec[1]) - wave_left = librosa.istft(spec_left, hop_length=hl) - wave_right = librosa.istft(spec_right, hop_length=hl) - wave = np.asfortranarray([wave_left, wave_right]) - - return wave - - -def spec_effects(wave, algorithm="Default", value=None): - spec = [stft(wave[0], 2048, 1024), stft(wave[1], 2048, 1024)] - if algorithm == "Min_Mag": - v_spec_m = np.where(np.abs(spec[1]) <= np.abs(spec[0]), spec[1], spec[0]) - wave = istft(v_spec_m, 1024) - elif algorithm == "Max_Mag": - v_spec_m = np.where(np.abs(spec[1]) >= np.abs(spec[0]), spec[1], spec[0]) - wave = istft(v_spec_m, 1024) - elif algorithm == "Default": - wave = (wave[1] * value) + (wave[0] * (1 - value)) - elif algorithm == "Invert_p": - X_mag = np.abs(spec[0]) - y_mag = np.abs(spec[1]) - max_mag = np.where(X_mag >= y_mag, X_mag, y_mag) - v_spec = spec[1] - max_mag * np.exp(1.0j * np.angle(spec[0])) - wave = istft(v_spec, 1024) - - return wave - - -def spectrogram_to_wave_no_mp(spec, n_fft=2048, hop_length=1024): - wave = librosa.istft(spec, n_fft=n_fft, hop_length=hop_length) - - if wave.ndim == 1: - wave = np.asfortranarray([wave, wave]) - - return wave - - -def wave_to_spectrogram_no_mp(wave): - spec = librosa.stft(wave, n_fft=2048, hop_length=1024) - - if spec.ndim == 1: - spec = np.asfortranarray([spec, spec]) - - return spec - - -def invert_audio(specs, invert_p=True): - ln = min([specs[0].shape[2], specs[1].shape[2]]) - specs[0] = specs[0][:, :, :ln] - specs[1] = specs[1][:, :, :ln] - - if invert_p: - X_mag = np.abs(specs[0]) - y_mag = np.abs(specs[1]) - max_mag = np.where(X_mag >= y_mag, X_mag, y_mag) - v_spec = specs[1] - max_mag * np.exp(1.0j * np.angle(specs[0])) - else: - specs[1] = reduce_vocal_aggressively(specs[0], specs[1], 0.2) - v_spec = specs[0] - specs[1] - - return v_spec - - -def invert_stem(mixture, stem): - mixture = wave_to_spectrogram_no_mp(mixture) - stem = wave_to_spectrogram_no_mp(stem) - output = spectrogram_to_wave_no_mp(invert_audio([mixture, stem])) - - return -output.T - - -def ensembling(a, specs): - for i in range(1, len(specs)): - if i == 1: - spec = specs[0] - - ln = min([spec.shape[2], specs[i].shape[2]]) - spec = spec[:, :, :ln] - specs[i] = specs[i][:, :, :ln] - - if MIN_SPEC == a: - spec = np.where(np.abs(specs[i]) <= np.abs(spec), specs[i], spec) - if MAX_SPEC == a: - spec = np.where(np.abs(specs[i]) >= np.abs(spec), specs[i], spec) - if AVERAGE == a: - spec = np.where(np.abs(specs[i]) == np.abs(spec), specs[i], spec) - - return spec - - -def ensemble_inputs(audio_input, algorithm, is_normalization, wav_type_set, save_path): - wavs_ = [] - - if algorithm == AVERAGE: - output = average_audio(audio_input) - samplerate = 44100 - else: - specs = [] - - for i in range(len(audio_input)): - wave, samplerate = librosa.load(audio_input[i], mono=False, sr=44100) - wavs_.append(wave) - spec = wave_to_spectrogram_no_mp(wave) - specs.append(spec) - - wave_shapes = [w.shape[1] for w in wavs_] - target_shape = wavs_[wave_shapes.index(max(wave_shapes))] - - output = spectrogram_to_wave_no_mp(ensembling(algorithm, specs)) - output = to_shape(output, target_shape.shape) - - sf.write(save_path, 
normalize(output.T, is_normalization), samplerate, subtype=wav_type_set) - - -def to_shape(x, target_shape): - padding_list = [] - for x_dim, target_dim in zip(x.shape, target_shape): - pad_value = target_dim - x_dim - pad_tuple = (0, pad_value) - padding_list.append(pad_tuple) - - return np.pad(x, tuple(padding_list), mode="constant") - - -def to_shape_minimize(x: np.ndarray, target_shape): - padding_list = [] - for x_dim, target_dim in zip(x.shape, target_shape): - pad_value = target_dim - x_dim - pad_tuple = (0, pad_value) - padding_list.append(pad_tuple) - - return np.pad(x, tuple(padding_list), mode="constant") - - -def average_audio(audio): - waves = [] - wave_shapes = [] - final_waves = [] - - for i in range(len(audio)): - wave = librosa.load(audio[i], sr=44100, mono=False) - waves.append(wave[0]) - wave_shapes.append(wave[0].shape[1]) - - wave_shapes_index = wave_shapes.index(max(wave_shapes)) - target_shape = waves[wave_shapes_index] - waves.pop(wave_shapes_index) - final_waves.append(target_shape) - - for n_array in waves: - wav_target = to_shape(n_array, target_shape.shape) - final_waves.append(wav_target) - - waves = sum(final_waves) - waves = waves / len(audio) - - return waves - - -def average_dual_sources(wav_1, wav_2, value): - if wav_1.shape > wav_2.shape: - wav_2 = to_shape(wav_2, wav_1.shape) - if wav_1.shape < wav_2.shape: - wav_1 = to_shape(wav_1, wav_2.shape) - - wave = (wav_1 * value) + (wav_2 * (1 - value)) - - return wave - - -def reshape_sources(wav_1: np.ndarray, wav_2: np.ndarray): - if wav_1.shape > wav_2.shape: - wav_2 = to_shape(wav_2, wav_1.shape) - if wav_1.shape < wav_2.shape: - ln = min([wav_1.shape[1], wav_2.shape[1]]) - wav_2 = wav_2[:, :ln] - - ln = min([wav_1.shape[1], wav_2.shape[1]]) - wav_1 = wav_1[:, :ln] - wav_2 = wav_2[:, :ln] - - return wav_2 - - -def align_audio( - file1, file2, file2_aligned, file_subtracted, wav_type_set, is_normalization, command_Text, progress_bar_main_var, save_format -): - def get_diff(a, b): - corr = np.correlate(a, b, "full") - diff = corr.argmax() - (b.shape[0] - 1) - return diff - - progress_bar_main_var.set(10) - - # read tracks - wav1, sr1 = librosa.load(file1, sr=44100, mono=False) - wav2, sr2 = librosa.load(file2, sr=44100, mono=False) - wav1 = wav1.transpose() - wav2 = wav2.transpose() - - command_Text(f"Audio file shapes: {wav1.shape} / {wav2.shape}\n") - - wav2_org = wav2.copy() - progress_bar_main_var.set(20) - - command_Text("Processing files... \n") - - # pick random position and get diff - - counts = {} # counting up for each diff value - progress = 20 - - check_range = 64 - - base = 64 / check_range - - for i in range(check_range): - index = int(random.uniform(44100 * 2, min(wav1.shape[0], wav2.shape[0]) - 44100 * 2)) - shift = int(random.uniform(-22050, +22050)) - samp1 = wav1[index : index + 44100, 0] # currently use left channel - samp2 = wav2[index + shift : index + shift + 44100, 0] - progress += 1 * base - progress_bar_main_var.set(progress) - diff = get_diff(samp1, samp2) - diff -= shift - - if abs(diff) < 22050: - if not diff in counts: - counts[diff] = 0 - counts[diff] += 1 - - # use max counted diff value - max_count = 0 - est_diff = 0 - for diff in counts.keys(): - if counts[diff] > max_count: - max_count = counts[diff] - est_diff = diff - - command_Text(f"Estimated difference is {est_diff} (count: {max_count})\n") - - progress_bar_main_var.set(90) - - audio_files = [] - - def save_aligned_audio(wav2_aligned): - command_Text(f"Aligned File 2 with File 1.\n") - command_Text(f"Saving files... 
") - sf.write(file2_aligned, normalize(wav2_aligned, is_normalization), sr2, subtype=wav_type_set) - save_format(file2_aligned) - min_len = min(wav1.shape[0], wav2_aligned.shape[0]) - wav_sub = wav1[:min_len] - wav2_aligned[:min_len] - audio_files.append(file2_aligned) - return min_len, wav_sub - - # make aligned track 2 - if est_diff > 0: - wav2_aligned = np.append(np.zeros((est_diff, 2)), wav2_org, axis=0) - min_len, wav_sub = save_aligned_audio(wav2_aligned) - elif est_diff < 0: - wav2_aligned = wav2_org[-est_diff:] - min_len, wav_sub = save_aligned_audio(wav2_aligned) - else: - command_Text(f"Audio files already aligned.\n") - command_Text(f"Saving inverted track... ") - min_len = min(wav1.shape[0], wav2.shape[0]) - wav_sub = wav1[:min_len] - wav2[:min_len] - - wav_sub = np.clip(wav_sub, -1, +1) - - sf.write(file_subtracted, normalize(wav_sub, is_normalization), sr1, subtype=wav_type_set) - save_format(file_subtracted) - - progress_bar_main_var.set(95) diff --git a/audio_separator/separator/uvr_lib_v5/mdxnet.py b/audio_separator/separator/uvr_lib_v5/mdxnet.py new file mode 100644 index 0000000..3293c89 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/mdxnet.py @@ -0,0 +1,136 @@ +import torch +import torch.nn as nn +from .modules import TFC_TDF +from pytorch_lightning import LightningModule + +dim_s = 4 + +class AbstractMDXNet(LightningModule): + def __init__(self, target_name, lr, optimizer, dim_c, dim_f, dim_t, n_fft, hop_length, overlap): + super().__init__() + self.target_name = target_name + self.lr = lr + self.optimizer = optimizer + self.dim_c = dim_c + self.dim_f = dim_f + self.dim_t = dim_t + self.n_fft = n_fft + self.n_bins = n_fft // 2 + 1 + self.hop_length = hop_length + self.window = nn.Parameter(torch.hann_window(window_length=self.n_fft, periodic=True), requires_grad=False) + self.freq_pad = nn.Parameter(torch.zeros([1, dim_c, self.n_bins - self.dim_f, self.dim_t]), requires_grad=False) + + def get_optimizer(self): + if self.optimizer == 'rmsprop': + return torch.optim.RMSprop(self.parameters(), self.lr) + + if self.optimizer == 'adamw': + return torch.optim.AdamW(self.parameters(), self.lr) + +class ConvTDFNet(AbstractMDXNet): + def __init__(self, target_name, lr, optimizer, dim_c, dim_f, dim_t, n_fft, hop_length, + num_blocks, l, g, k, bn, bias, overlap): + + super(ConvTDFNet, self).__init__( + target_name, lr, optimizer, dim_c, dim_f, dim_t, n_fft, hop_length, overlap) + #self.save_hyperparameters() + + self.num_blocks = num_blocks + self.l = l + self.g = g + self.k = k + self.bn = bn + self.bias = bias + + if optimizer == 'rmsprop': + norm = nn.BatchNorm2d + + if optimizer == 'adamw': + norm = lambda input:nn.GroupNorm(2, input) + + self.n = num_blocks // 2 + scale = (2, 2) + + self.first_conv = nn.Sequential( + nn.Conv2d(in_channels=self.dim_c, out_channels=g, kernel_size=(1, 1)), + norm(g), + nn.ReLU(), + ) + + f = self.dim_f + c = g + self.encoding_blocks = nn.ModuleList() + self.ds = nn.ModuleList() + for i in range(self.n): + self.encoding_blocks.append(TFC_TDF(c, l, f, k, bn, bias=bias, norm=norm)) + self.ds.append( + nn.Sequential( + nn.Conv2d(in_channels=c, out_channels=c + g, kernel_size=scale, stride=scale), + norm(c + g), + nn.ReLU() + ) + ) + f = f // 2 + c += g + + self.bottleneck_block = TFC_TDF(c, l, f, k, bn, bias=bias, norm=norm) + + self.decoding_blocks = nn.ModuleList() + self.us = nn.ModuleList() + for i in range(self.n): + self.us.append( + nn.Sequential( + nn.ConvTranspose2d(in_channels=c, out_channels=c - g, kernel_size=scale, 
stride=scale), + norm(c - g), + nn.ReLU() + ) + ) + f = f * 2 + c -= g + + self.decoding_blocks.append(TFC_TDF(c, l, f, k, bn, bias=bias, norm=norm)) + + self.final_conv = nn.Sequential( + nn.Conv2d(in_channels=c, out_channels=self.dim_c, kernel_size=(1, 1)), + ) + + def forward(self, x): + + x = self.first_conv(x) + + x = x.transpose(-1, -2) + + ds_outputs = [] + for i in range(self.n): + x = self.encoding_blocks[i](x) + ds_outputs.append(x) + x = self.ds[i](x) + + x = self.bottleneck_block(x) + + for i in range(self.n): + x = self.us[i](x) + x *= ds_outputs[-i - 1] + x = self.decoding_blocks[i](x) + + x = x.transpose(-1, -2) + + x = self.final_conv(x) + + return x + +class Mixer(nn.Module): + def __init__(self, device, mixer_path): + + super(Mixer, self).__init__() + + self.linear = nn.Linear((dim_s+1)*2, dim_s*2, bias=False) + + self.load_state_dict( + torch.load(mixer_path, map_location=device) + ) + + def forward(self, x): + x = x.reshape(1,(dim_s+1)*2,-1).transpose(-1,-2) + x = self.linear(x) + return x.transpose(-1,-2).reshape(dim_s,2,-1) \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/mixer.ckpt b/audio_separator/separator/uvr_lib_v5/mixer.ckpt new file mode 100644 index 0000000..986cc4d Binary files /dev/null and b/audio_separator/separator/uvr_lib_v5/mixer.ckpt differ diff --git a/audio_separator/separator/uvr_lib_v5/modules.py b/audio_separator/separator/uvr_lib_v5/modules.py new file mode 100644 index 0000000..4e77d2f --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/modules.py @@ -0,0 +1,74 @@ +import torch +import torch.nn as nn + + +class TFC(nn.Module): + def __init__(self, c, l, k, norm): + super(TFC, self).__init__() + + self.H = nn.ModuleList() + for i in range(l): + self.H.append( + nn.Sequential( + nn.Conv2d(in_channels=c, out_channels=c, kernel_size=k, stride=1, padding=k // 2), + norm(c), + nn.ReLU(), + ) + ) + + def forward(self, x): + for h in self.H: + x = h(x) + return x + + +class DenseTFC(nn.Module): + def __init__(self, c, l, k, norm): + super(DenseTFC, self).__init__() + + self.conv = nn.ModuleList() + for i in range(l): + self.conv.append( + nn.Sequential( + nn.Conv2d(in_channels=c, out_channels=c, kernel_size=k, stride=1, padding=k // 2), + norm(c), + nn.ReLU(), + ) + ) + + def forward(self, x): + for layer in self.conv[:-1]: + x = torch.cat([layer(x), x], 1) + return self.conv[-1](x) + + +class TFC_TDF(nn.Module): + def __init__(self, c, l, f, k, bn, dense=False, bias=True, norm=nn.BatchNorm2d): + + super(TFC_TDF, self).__init__() + + self.use_tdf = bn is not None + + self.tfc = DenseTFC(c, l, k, norm) if dense else TFC(c, l, k, norm) + + if self.use_tdf: + if bn == 0: + self.tdf = nn.Sequential( + nn.Linear(f, f, bias=bias), + norm(c), + nn.ReLU() + ) + else: + self.tdf = nn.Sequential( + nn.Linear(f, f // bn, bias=bias), + norm(c), + nn.ReLU(), + nn.Linear(f // bn, f, bias=bias), + norm(c), + nn.ReLU() + ) + + def forward(self, x): + x = self.tfc(x) + return x + self.tdf(x) if self.use_tdf else x + diff --git a/audio_separator/separator/uvr_lib_v5/playsound.py b/audio_separator/separator/uvr_lib_v5/playsound.py new file mode 100644 index 0000000..abd708e --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/playsound.py @@ -0,0 +1,242 @@ +import logging +logger = logging.getLogger(__name__) + +class PlaysoundException(Exception): + pass + +def _canonicalizePath(path): + """ + Support passing in a pathlib.Path-like object by converting to str. 
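# Minimal usage sketch for the TFC_TDF residual block defined in
# audio_separator/separator/uvr_lib_v5/modules.py above; the channel/frequency sizes
# below are illustrative assumptions, not values taken from any shipped model.
import torch
import torch.nn as nn
from audio_separator.separator.uvr_lib_v5.modules import TFC_TDF

block = TFC_TDF(c=32, l=3, f=64, k=3, bn=8, norm=nn.BatchNorm2d)
x = torch.randn(4, 32, 16, 64)   # (batch, channels, time, freq) - the Linear layers act on the freq axis
y = block(x)                     # time-frequency convolutions plus a bottlenecked dense branch, added residually
print(y.shape)                   # torch.Size([4, 32, 16, 64]) - shape is preserved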
+ """ + import sys + if sys.version_info[0] >= 3: + return str(path) + else: + # On earlier Python versions, str is a byte string, so attempting to + # convert a unicode string to str will fail. Leave it alone in this case. + return path + +def _playsoundWin(sound, block = True): + ''' + Utilizes windll.winmm. Tested and known to work with MP3 and WAVE on + Windows 7 with Python 2.7. Probably works with more file formats. + Probably works on Windows XP thru Windows 10. Probably works with all + versions of Python. + + Inspired by (but not copied from) Michael Gundlach 's mp3play: + https://github.com/michaelgundlach/mp3play + + I never would have tried using windll.winmm without seeing his code. + ''' + sound = '"' + _canonicalizePath(sound) + '"' + + from ctypes import create_unicode_buffer, windll, wintypes + from time import sleep + windll.winmm.mciSendStringW.argtypes = [wintypes.LPCWSTR, wintypes.LPWSTR, wintypes.UINT, wintypes.HANDLE] + windll.winmm.mciGetErrorStringW.argtypes = [wintypes.DWORD, wintypes.LPWSTR, wintypes.UINT] + + def winCommand(*command): + bufLen = 600 + buf = create_unicode_buffer(bufLen) + command = ' '.join(command) + errorCode = int(windll.winmm.mciSendStringW(command, buf, bufLen - 1, 0)) # use widestring version of the function + if errorCode: + errorBuffer = create_unicode_buffer(bufLen) + windll.winmm.mciGetErrorStringW(errorCode, errorBuffer, bufLen - 1) # use widestring version of the function + exceptionMessage = ('\n Error ' + str(errorCode) + ' for command:' + '\n ' + command + + '\n ' + errorBuffer.value) + logger.error(exceptionMessage) + raise PlaysoundException(exceptionMessage) + return buf.value + + try: + logger.debug('Starting') + winCommand(u'open {}'.format(sound)) + winCommand(u'play {}{}'.format(sound, ' wait' if block else '')) + logger.debug('Returning') + finally: + try: + winCommand(u'close {}'.format(sound)) + except PlaysoundException: + logger.warning(u'Failed to close the file: {}'.format(sound)) + # If it fails, there's nothing more that can be done... + pass + +def _handlePathOSX(sound): + sound = _canonicalizePath(sound) + + if '://' not in sound: + if not sound.startswith('/'): + from os import getcwd + sound = getcwd() + '/' + sound + sound = 'file://' + sound + + try: + # Don't double-encode it. + sound.encode('ascii') + return sound.replace(' ', '%20') + except UnicodeEncodeError: + try: + from urllib.parse import quote # Try the Python 3 import first... + except ImportError: + from urllib import quote # Try using the Python 2 import before giving up entirely... + + parts = sound.split('://', 1) + return parts[0] + '://' + quote(parts[1].encode('utf-8')).replace(' ', '%20') + + +def _playsoundOSX(sound, block = True): + ''' + Utilizes AppKit.NSSound. Tested and known to work with MP3 and WAVE on + OS X 10.11 with Python 2.7. Probably works with anything QuickTime supports. + Probably works on OS X 10.5 and newer. Probably works with all versions of + Python. + + Inspired by (but not copied from) Aaron's Stack Overflow answer here: + http://stackoverflow.com/a/34568298/901641 + + I never would have tried using AppKit.NSSound without seeing his code. 
+ ''' + try: + from AppKit import NSSound + except ImportError: + logger.warning("playsound could not find a copy of AppKit - falling back to using macOS's system copy.") + sys.path.append('/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/PyObjC') + from AppKit import NSSound + + from Foundation import NSURL + from time import sleep + + sound = _handlePathOSX(sound) + url = NSURL.URLWithString_(sound) + if not url: + raise PlaysoundException('Cannot find a sound with filename: ' + sound) + + for i in range(5): + nssound = NSSound.alloc().initWithContentsOfURL_byReference_(url, True) + if nssound: + break + else: + logger.debug('Failed to load sound, although url was good... ' + sound) + else: + raise PlaysoundException('Could not load sound with filename, although URL was good... ' + sound) + nssound.play() + + if block: + sleep(nssound.duration()) + +def _playsoundNix(sound, block = True): + """Play a sound using GStreamer. + + Inspired by this: + https://gstreamer.freedesktop.org/documentation/tutorials/playback/playbin-usage.html + """ + sound = _canonicalizePath(sound) + + # pathname2url escapes non-URL-safe characters + from os.path import abspath, exists + try: + from urllib.request import pathname2url + except ImportError: + # python 2 + from urllib import pathname2url + + import gi + gi.require_version('Gst', '1.0') + from gi.repository import Gst + + Gst.init(None) + + playbin = Gst.ElementFactory.make('playbin', 'playbin') + if sound.startswith(('http://', 'https://')): + playbin.props.uri = sound + else: + path = abspath(sound) + if not exists(path): + raise PlaysoundException(u'File not found: {}'.format(path)) + playbin.props.uri = 'file://' + pathname2url(path) + + + set_result = playbin.set_state(Gst.State.PLAYING) + if set_result != Gst.StateChangeReturn.ASYNC: + raise PlaysoundException( + "playbin.set_state returned " + repr(set_result)) + + # FIXME: use some other bus method than poll() with block=False + # https://lazka.github.io/pgi-docs/#Gst-1.0/classes/Bus.html + logger.debug('Starting play') + if block: + bus = playbin.get_bus() + try: + bus.poll(Gst.MessageType.EOS, Gst.CLOCK_TIME_NONE) + finally: + playbin.set_state(Gst.State.NULL) + + logger.debug('Finishing play') + +def _playsoundAnotherPython(otherPython, sound, block = True, macOS = False): + ''' + Mostly written so that when this is run on python3 on macOS, it can invoke + python2 on macOS... but maybe this idea could be useful on linux, too. + ''' + from inspect import getsourcefile + from os.path import abspath, exists + from subprocess import check_call + from threading import Thread + + sound = _canonicalizePath(sound) + + class PropogatingThread(Thread): + def run(self): + self.exc = None + try: + self.ret = self._target(*self._args, **self._kwargs) + except BaseException as e: + self.exc = e + + def join(self, timeout = None): + super().join(timeout) + if self.exc: + raise self.exc + return self.ret + + # Check if the file exists... 
+ if not exists(abspath(sound)): + raise PlaysoundException('Cannot find a sound with filename: ' + sound) + + playsoundPath = abspath(getsourcefile(lambda: 0)) + t = PropogatingThread(target = lambda: check_call([otherPython, playsoundPath, _handlePathOSX(sound) if macOS else sound])) + t.start() + if block: + t.join() + +from platform import system +system = system() + +if system == 'Windows': + playsound_func = _playsoundWin +elif system == 'Darwin': + playsound_func = _playsoundOSX + import sys + if sys.version_info[0] > 2: + try: + from AppKit import NSSound + except ImportError: + logger.warning("playsound is relying on a python 2 subprocess. Please use `pip3 install PyObjC` if you want playsound to run more efficiently.") + playsound_func = lambda sound, block = True: _playsoundAnotherPython('/System/Library/Frameworks/Python.framework/Versions/2.7/bin/python', sound, block, macOS = True) +else: + playsound_func = _playsoundNix + if __name__ != '__main__': # Ensure we don't infinitely recurse trying to get another python instance. + try: + import gi + gi.require_version('Gst', '1.0') + from gi.repository import Gst + except: + logger.warning("playsound is relying on another python subprocess. Please use `pip install pygobject` if you want playsound to run more efficiently.") + playsound_func = lambda sound, block = True: _playsoundAnotherPython('/usr/bin/python3', sound, block, macOS = False) + +del system + +def play(audio_filepath): + playsound_func(audio_filepath) diff --git a/audio_separator/separator/uvr_lib_v5/pyrb.py b/audio_separator/separator/uvr_lib_v5/pyrb.py new file mode 100644 index 0000000..883a525 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/pyrb.py @@ -0,0 +1,92 @@ +import os +import subprocess +import tempfile +import six +import numpy as np +import soundfile as sf +import sys + +if getattr(sys, 'frozen', False): + BASE_PATH_RUB = sys._MEIPASS +else: + BASE_PATH_RUB = os.path.dirname(os.path.abspath(__file__)) + +__all__ = ['time_stretch', 'pitch_shift'] + +__RUBBERBAND_UTIL = os.path.join(BASE_PATH_RUB, 'rubberband') + +if six.PY2: + DEVNULL = open(os.devnull, 'w') +else: + DEVNULL = subprocess.DEVNULL + +def __rubberband(y, sr, **kwargs): + + assert sr > 0 + + # Get the input and output tempfile + fd, infile = tempfile.mkstemp(suffix='.wav') + os.close(fd) + fd, outfile = tempfile.mkstemp(suffix='.wav') + os.close(fd) + + # dump the audio + sf.write(infile, y, sr) + + try: + # Execute rubberband + arguments = [__RUBBERBAND_UTIL, '-q'] + + for key, value in six.iteritems(kwargs): + arguments.append(str(key)) + arguments.append(str(value)) + + arguments.extend([infile, outfile]) + + subprocess.check_call(arguments, stdout=DEVNULL, stderr=DEVNULL) + + # Load the processed audio. + y_out, _ = sf.read(outfile, always_2d=True) + + # make sure that output dimensions matches input + if y.ndim == 1: + y_out = np.squeeze(y_out) + + except OSError as exc: + six.raise_from(RuntimeError('Failed to execute rubberband. 
' + 'Please verify that rubberband-cli ' + 'is installed.'), + exc) + + finally: + # Remove temp files + os.unlink(infile) + os.unlink(outfile) + + return y_out + +def time_stretch(y, sr, rate, rbargs=None): + if rate <= 0: + raise ValueError('rate must be strictly positive') + + if rate == 1.0: + return y + + if rbargs is None: + rbargs = dict() + + rbargs.setdefault('--tempo', rate) + + return __rubberband(y, sr, **rbargs) + +def pitch_shift(y, sr, n_steps, rbargs=None): + + if n_steps == 0: + return y + + if rbargs is None: + rbargs = dict() + + rbargs.setdefault('--pitch', n_steps) + + return __rubberband(y, sr, **rbargs) diff --git a/audio_separator/separator/uvr_lib_v5/results.py b/audio_separator/separator/uvr_lib_v5/results.py new file mode 100644 index 0000000..476f2d1 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/results.py @@ -0,0 +1,48 @@ +# -*- coding: utf-8 -*- + +""" +Matchering - Audio Matching and Mastering Python Library +Copyright (C) 2016-2022 Sergree + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . +""" + +import os +import soundfile as sf + + +class Result: + def __init__( + self, file: str, subtype: str, use_limiter: bool = True, normalize: bool = True + ): + _, file_ext = os.path.splitext(file) + file_ext = file_ext[1:].upper() + if not sf.check_format(file_ext): + raise TypeError(f"{file_ext} format is not supported") + if not sf.check_format(file_ext, subtype): + raise TypeError(f"{file_ext} format does not have {subtype} subtype") + self.file = file + self.subtype = subtype + self.use_limiter = use_limiter + self.normalize = normalize + + +def pcm16(file: str) -> Result: + return Result(file, "PCM_16") + +def pcm24(file: str) -> Result: + return Result(file, "FLOAT") + +def save_audiofile(file: str, wav_set="PCM_16") -> Result: + return Result(file, wav_set) diff --git a/audio_separator/separator/uvr_lib_v5/spec_utils.py b/audio_separator/separator/uvr_lib_v5/spec_utils.py new file mode 100644 index 0000000..03f099d --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/spec_utils.py @@ -0,0 +1,1309 @@ +import audioread +import librosa +import numpy as np +import soundfile as sf +import math +import platform +import traceback +from scipy.signal import correlate, hilbert +import io + +OPERATING_SYSTEM = platform.system() +SYSTEM_ARCH = platform.platform() +SYSTEM_PROC = platform.processor() +ARM = "arm" + +AUTO_PHASE = "Automatic" +POSITIVE_PHASE = "Positive Phase" +NEGATIVE_PHASE = "Negative Phase" +NONE_P = ("None",) +LOW_P = ("Shifts: Low",) +MED_P = ("Shifts: Medium",) +HIGH_P = ("Shifts: High",) +VHIGH_P = "Shifts: Very High" +MAXIMUM_P = "Shifts: Maximum" + +progress_value = 0 +last_update_time = 0 +is_macos = False + +if OPERATING_SYSTEM == "Windows": + from pyrubberband import pyrb +else: + from audio_separator.separator.uvr_lib_v5 import pyrb + +if OPERATING_SYSTEM == "Darwin": + wav_resolution = "polyphase" if SYSTEM_PROC == ARM or ARM in SYSTEM_ARCH else "sinc_fastest" + wav_resolution_float_resampling = 
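# Minimal sketch of the pyrb wrapper defined in audio_separator/separator/uvr_lib_v5/pyrb.py
# above. It shells out to a rubberband executable expected alongside pyrb.py (or bundled by
# PyInstaller), so this only runs where that binary is present; "input.wav" is a hypothetical file.
import soundfile as sf
from audio_separator.separator.uvr_lib_v5 import pyrb

y, sr = sf.read("input.wav")                 # (frames, channels) array, as the wrapper expects
stretched = pyrb.time_stretch(y, sr, 1.25)   # ~25% faster (shorter), pitch preserved
shifted = pyrb.pitch_shift(y, sr, 2)         # up two semitones, duration preserved
sf.write("stretched.wav", stretched, sr)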
"kaiser_best" if SYSTEM_PROC == ARM or ARM in SYSTEM_ARCH else wav_resolution + is_macos = True +else: + wav_resolution = "sinc_fastest" + wav_resolution_float_resampling = wav_resolution + +MAX_SPEC = "Max Spec" +MIN_SPEC = "Min Spec" +LIN_ENSE = "Linear Ensemble" + +MAX_WAV = MAX_SPEC +MIN_WAV = MIN_SPEC + +AVERAGE = "Average" + + +def crop_center(h1, h2): + """ + This function crops the center of the first input tensor to match the size of the second input tensor. + It is used to ensure that the two tensors have the same size in the time dimension. + """ + h1_shape = h1.size() + h2_shape = h2.size() + + # If the time dimensions are already equal, return the first tensor as is + if h1_shape[3] == h2_shape[3]: + return h1 + # If the time dimension of the first tensor is smaller, raise an error + elif h1_shape[3] < h2_shape[3]: + raise ValueError("h1_shape[3] must be greater than h2_shape[3]") + + # Calculate the start and end indices for cropping + s_time = (h1_shape[3] - h2_shape[3]) // 2 + e_time = s_time + h2_shape[3] + # Crop the first tensor + h1 = h1[:, :, :, s_time:e_time] + + return h1 + + +def preprocess(X_spec): + """ + This function preprocesses a spectrogram by separating it into magnitude and phase components. + This is a common preprocessing step in audio processing tasks. + """ + X_mag = np.abs(X_spec) + X_phase = np.angle(X_spec) + + return X_mag, X_phase + + +def make_padding(width, cropsize, offset): + """ + This function calculates the padding needed to make the width of an image divisible by the crop size. + It is used in the process of splitting an image into smaller patches. + """ + left = offset + roi_size = cropsize - offset * 2 + if roi_size == 0: + roi_size = cropsize + right = roi_size - (width % roi_size) + left + + return left, right, roi_size + + +def normalize(wave, max_peak=1.0): + """Normalize audio waveform to a specified peak value. + + Args: + wave (array-like): Audio waveform. + max_peak (float): Maximum peak value for normalization. + + Returns: + array-like: Normalized or original waveform. + """ + maxv = np.abs(wave).max() + if maxv > max_peak: + wave *= max_peak / maxv + + return wave + + +def auto_transpose(audio_array: np.ndarray): + """ + Ensure that the audio array is in the (channels, samples) format. + + Parameters: + audio_array (ndarray): Input audio array. + + Returns: + ndarray: Transposed audio array if necessary. 
+ """ + + # If the second dimension is 2 (indicating stereo channels), transpose the array + if audio_array.shape[1] == 2: + return audio_array.T + return audio_array + + +def write_array_to_mem(audio_data, subtype): + if isinstance(audio_data, np.ndarray): + audio_buffer = io.BytesIO() + sf.write(audio_buffer, audio_data, 44100, subtype=subtype, format="WAV") + audio_buffer.seek(0) + return audio_buffer + else: + return audio_data + + +def spectrogram_to_image(spec, mode="magnitude"): + if mode == "magnitude": + if np.iscomplexobj(spec): + y = np.abs(spec) + else: + y = spec + y = np.log10(y**2 + 1e-8) + elif mode == "phase": + if np.iscomplexobj(spec): + y = np.angle(spec) + else: + y = spec + + y -= y.min() + y *= 255 / y.max() + img = np.uint8(y) + + if y.ndim == 3: + img = img.transpose(1, 2, 0) + img = np.concatenate([np.max(img, axis=2, keepdims=True), img], axis=2) + + return img + + +def reduce_vocal_aggressively(X, y, softmask): + v = X - y + y_mag_tmp = np.abs(y) + v_mag_tmp = np.abs(v) + + v_mask = v_mag_tmp > y_mag_tmp + y_mag = np.clip(y_mag_tmp - v_mag_tmp * v_mask * softmask, 0, np.inf) + + return y_mag * np.exp(1.0j * np.angle(y)) + + +def merge_artifacts(y_mask, thres=0.01, min_range=64, fade_size=32): + mask = y_mask + + try: + if min_range < fade_size * 2: + raise ValueError("min_range must be >= fade_size * 2") + + idx = np.where(y_mask.min(axis=(0, 1)) > thres)[0] + start_idx = np.insert(idx[np.where(np.diff(idx) != 1)[0] + 1], 0, idx[0]) + end_idx = np.append(idx[np.where(np.diff(idx) != 1)[0]], idx[-1]) + artifact_idx = np.where(end_idx - start_idx > min_range)[0] + weight = np.zeros_like(y_mask) + if len(artifact_idx) > 0: + start_idx = start_idx[artifact_idx] + end_idx = end_idx[artifact_idx] + old_e = None + for s, e in zip(start_idx, end_idx): + if old_e is not None and s - old_e < fade_size: + s = old_e - fade_size * 2 + + if s != 0: + weight[:, :, s : s + fade_size] = np.linspace(0, 1, fade_size) + else: + s -= fade_size + + if e != y_mask.shape[2]: + weight[:, :, e - fade_size : e] = np.linspace(1, 0, fade_size) + else: + e += fade_size + + weight[:, :, s + fade_size : e - fade_size] = 1 + old_e = e + + v_mask = 1 - y_mask + y_mask += weight * v_mask + + mask = y_mask + except Exception as e: + error_name = f"{type(e).__name__}" + traceback_text = "".join(traceback.format_tb(e.__traceback__)) + message = f'{error_name}: "{e}"\n{traceback_text}"' + print("Post Process Failed: ", message) + + return mask + + +def align_wave_head_and_tail(a, b): + l = min([a[0].size, b[0].size]) + + return a[:l, :l], b[:l, :l] + + +def convert_channels(spec, mp, band): + cc = mp.param["band"][band].get("convert_channels") + + if "mid_side_c" == cc: + spec_left = np.add(spec[0], spec[1] * 0.25) + spec_right = np.subtract(spec[1], spec[0] * 0.25) + elif "mid_side" == cc: + spec_left = np.add(spec[0], spec[1]) / 2 + spec_right = np.subtract(spec[0], spec[1]) + elif "stereo_n" == cc: + spec_left = np.add(spec[0], spec[1] * 0.25) / 0.9375 + spec_right = np.add(spec[1], spec[0] * 0.25) / 0.9375 + else: + return spec + + return np.asfortranarray([spec_left, spec_right]) + + +def combine_spectrograms(specs, mp, is_v51_model=False): + l = min([specs[i].shape[2] for i in specs]) + spec_c = np.zeros(shape=(2, mp.param["bins"] + 1, l), dtype=np.complex64) + offset = 0 + bands_n = len(mp.param["band"]) + + for d in range(1, bands_n + 1): + h = mp.param["band"][d]["crop_stop"] - mp.param["band"][d]["crop_start"] + spec_c[:, offset : offset + h, :l] = specs[d][:, 
mp.param["band"][d]["crop_start"] : mp.param["band"][d]["crop_stop"], :l] + offset += h + + if offset > mp.param["bins"]: + raise ValueError("Too much bins") + + # lowpass fiter + + if mp.param["pre_filter_start"] > 0: + if is_v51_model: + spec_c *= get_lp_filter_mask(spec_c.shape[1], mp.param["pre_filter_start"], mp.param["pre_filter_stop"]) + else: + if bands_n == 1: + spec_c = fft_lp_filter(spec_c, mp.param["pre_filter_start"], mp.param["pre_filter_stop"]) + else: + gp = 1 + for b in range(mp.param["pre_filter_start"] + 1, mp.param["pre_filter_stop"]): + g = math.pow(10, -(b - mp.param["pre_filter_start"]) * (3.5 - gp) / 20.0) + gp = g + spec_c[:, b, :] *= g + + return np.asfortranarray(spec_c) + + +def wave_to_spectrogram(wave, hop_length, n_fft, mp, band, is_v51_model=False): + + if wave.ndim == 1: + wave = np.asfortranarray([wave, wave]) + + if not is_v51_model: + if mp.param["reverse"]: + wave_left = np.flip(np.asfortranarray(wave[0])) + wave_right = np.flip(np.asfortranarray(wave[1])) + elif mp.param["mid_side"]: + wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2) + wave_right = np.asfortranarray(np.subtract(wave[0], wave[1])) + elif mp.param["mid_side_b2"]: + wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5)) + wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5)) + else: + wave_left = np.asfortranarray(wave[0]) + wave_right = np.asfortranarray(wave[1]) + else: + wave_left = np.asfortranarray(wave[0]) + wave_right = np.asfortranarray(wave[1]) + + spec_left = librosa.stft(wave_left, n_fft=n_fft, hop_length=hop_length) + spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length) + + spec = np.asfortranarray([spec_left, spec_right]) + + if is_v51_model: + spec = convert_channels(spec, mp, band) + + return spec + + +def spectrogram_to_wave(spec, hop_length=1024, mp={}, band=0, is_v51_model=True): + spec_left = np.asfortranarray(spec[0]) + spec_right = np.asfortranarray(spec[1]) + + wave_left = librosa.istft(spec_left, hop_length=hop_length) + wave_right = librosa.istft(spec_right, hop_length=hop_length) + + if is_v51_model: + cc = mp.param["band"][band].get("convert_channels") + if "mid_side_c" == cc: + return np.asfortranarray([np.subtract(wave_left / 1.0625, wave_right / 4.25), np.add(wave_right / 1.0625, wave_left / 4.25)]) + elif "mid_side" == cc: + return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]) + elif "stereo_n" == cc: + return np.asfortranarray([np.subtract(wave_left, wave_right * 0.25), np.subtract(wave_right, wave_left * 0.25)]) + else: + if mp.param["reverse"]: + return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)]) + elif mp.param["mid_side"]: + return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]) + elif mp.param["mid_side_b2"]: + return np.asfortranarray([np.add(wave_right / 1.25, 0.4 * wave_left), np.subtract(wave_left / 1.25, 0.4 * wave_right)]) + + return np.asfortranarray([wave_left, wave_right]) + + +def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None, is_v51_model=False): + bands_n = len(mp.param["band"]) + offset = 0 + + for d in range(1, bands_n + 1): + bp = mp.param["band"][d] + spec_s = np.ndarray(shape=(2, bp["n_fft"] // 2 + 1, spec_m.shape[2]), dtype=complex) + h = bp["crop_stop"] - bp["crop_start"] + spec_s[:, bp["crop_start"] : bp["crop_stop"], :] = spec_m[:, offset : offset + h, :] + + offset += h + if d == bands_n: # higher + if extra_bins_h: # if --high_end_process 
bypass + max_bin = bp["n_fft"] // 2 + spec_s[:, max_bin - extra_bins_h : max_bin, :] = extra_bins[:, :extra_bins_h, :] + if bp["hpf_start"] > 0: + if is_v51_model: + spec_s *= get_hp_filter_mask(spec_s.shape[1], bp["hpf_start"], bp["hpf_stop"] - 1) + else: + spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1) + if bands_n == 1: + wave = spectrogram_to_wave(spec_s, bp["hl"], mp, d, is_v51_model) + else: + wave = np.add(wave, spectrogram_to_wave(spec_s, bp["hl"], mp, d, is_v51_model)) + else: + sr = mp.param["band"][d + 1]["sr"] + if d == 1: # lower + if is_v51_model: + spec_s *= get_lp_filter_mask(spec_s.shape[1], bp["lpf_start"], bp["lpf_stop"]) + else: + spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"]) + wave = librosa.resample(spectrogram_to_wave(spec_s, bp["hl"], mp, d, is_v51_model), orig_sr=bp["sr"], target_sr=sr, res_type=wav_resolution) + else: # mid + if is_v51_model: + spec_s *= get_hp_filter_mask(spec_s.shape[1], bp["hpf_start"], bp["hpf_stop"] - 1) + spec_s *= get_lp_filter_mask(spec_s.shape[1], bp["lpf_start"], bp["lpf_stop"]) + else: + spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1) + spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"]) + + wave2 = np.add(wave, spectrogram_to_wave(spec_s, bp["hl"], mp, d, is_v51_model)) + wave = librosa.resample(wave2, orig_sr=bp["sr"], target_sr=sr, res_type=wav_resolution) + + return wave + + +def get_lp_filter_mask(n_bins, bin_start, bin_stop): + mask = np.concatenate([np.ones((bin_start - 1, 1)), np.linspace(1, 0, bin_stop - bin_start + 1)[:, None], np.zeros((n_bins - bin_stop, 1))], axis=0) + + return mask + + +def get_hp_filter_mask(n_bins, bin_start, bin_stop): + mask = np.concatenate([np.zeros((bin_stop + 1, 1)), np.linspace(0, 1, 1 + bin_start - bin_stop)[:, None], np.ones((n_bins - bin_start - 2, 1))], axis=0) + + return mask + + +def fft_lp_filter(spec, bin_start, bin_stop): + g = 1.0 + for b in range(bin_start, bin_stop): + g -= 1 / (bin_stop - bin_start) + spec[:, b, :] = g * spec[:, b, :] + + spec[:, bin_stop:, :] *= 0 + + return spec + + +def fft_hp_filter(spec, bin_start, bin_stop): + g = 1.0 + for b in range(bin_start, bin_stop, -1): + g -= 1 / (bin_start - bin_stop) + spec[:, b, :] = g * spec[:, b, :] + + spec[:, 0 : bin_stop + 1, :] *= 0 + + return spec + + +def spectrogram_to_wave_old(spec, hop_length=1024): + if spec.ndim == 2: + wave = librosa.istft(spec, hop_length=hop_length) + elif spec.ndim == 3: + spec_left = np.asfortranarray(spec[0]) + spec_right = np.asfortranarray(spec[1]) + + wave_left = librosa.istft(spec_left, hop_length=hop_length) + wave_right = librosa.istft(spec_right, hop_length=hop_length) + wave = np.asfortranarray([wave_left, wave_right]) + + return wave + + +def wave_to_spectrogram_old(wave, hop_length, n_fft): + wave_left = np.asfortranarray(wave[0]) + wave_right = np.asfortranarray(wave[1]) + + spec_left = librosa.stft(wave_left, n_fft=n_fft, hop_length=hop_length) + spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length) + + spec = np.asfortranarray([spec_left, spec_right]) + + return spec + + +def mirroring(a, spec_m, input_high_end, mp): + if "mirroring" == a: + mirror = np.flip(np.abs(spec_m[:, mp.param["pre_filter_start"] - 10 - input_high_end.shape[1] : mp.param["pre_filter_start"] - 10, :]), 1) + mirror = mirror * np.exp(1.0j * np.angle(input_high_end)) + + return np.where(np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror) + + if "mirroring2" == a: + mirror = np.flip(np.abs(spec_m[:, 
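# Minimal sketch of spec_utils.fft_lp_filter() defined above: gain ramps down linearly between
# bin_start and bin_stop, and everything from bin_stop upward is zeroed. Note that it modifies
# the spectrogram in place and also returns it.
import numpy as np
from audio_separator.separator.uvr_lib_v5 import spec_utils

spec = np.ones((2, 10, 4), dtype=np.complex64)            # (channels, bins, frames)
out = spec_utils.fft_lp_filter(spec, bin_start=4, bin_stop=8)
print(np.abs(out[0, :, 0]).round(2))                      # [1. 1. 1. 1. 0.75 0.5 0.25 0. 0. 0.]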
mp.param["pre_filter_start"] - 10 - input_high_end.shape[1] : mp.param["pre_filter_start"] - 10, :]), 1) + mi = np.multiply(mirror, input_high_end * 1.7) + + return np.where(np.abs(input_high_end) <= np.abs(mi), input_high_end, mi) + + +def adjust_aggr(mask, is_non_accom_stem, aggressiveness): + aggr = aggressiveness["value"] * 2 + + if aggr != 0: + if is_non_accom_stem: + aggr = 1 - aggr + + aggr = [aggr, aggr] + + if aggressiveness["aggr_correction"] is not None: + aggr[0] += aggressiveness["aggr_correction"]["left"] + aggr[1] += aggressiveness["aggr_correction"]["right"] + + for ch in range(2): + mask[ch, : aggressiveness["split_bin"]] = np.power(mask[ch, : aggressiveness["split_bin"]], 1 + aggr[ch] / 3) + mask[ch, aggressiveness["split_bin"] :] = np.power(mask[ch, aggressiveness["split_bin"] :], 1 + aggr[ch]) + + return mask + + +def stft(wave, nfft, hl): + wave_left = np.asfortranarray(wave[0]) + wave_right = np.asfortranarray(wave[1]) + spec_left = librosa.stft(wave_left, n_fft=nfft, hop_length=hl) + spec_right = librosa.stft(wave_right, n_fft=nfft, hop_length=hl) + spec = np.asfortranarray([spec_left, spec_right]) + + return spec + + +def istft(spec, hl): + spec_left = np.asfortranarray(spec[0]) + spec_right = np.asfortranarray(spec[1]) + wave_left = librosa.istft(spec_left, hop_length=hl) + wave_right = librosa.istft(spec_right, hop_length=hl) + wave = np.asfortranarray([wave_left, wave_right]) + + return wave + + +def spec_effects(wave, algorithm="Default", value=None): + spec = [stft(wave[0], 2048, 1024), stft(wave[1], 2048, 1024)] + if algorithm == "Min_Mag": + v_spec_m = np.where(np.abs(spec[1]) <= np.abs(spec[0]), spec[1], spec[0]) + wave = istft(v_spec_m, 1024) + elif algorithm == "Max_Mag": + v_spec_m = np.where(np.abs(spec[1]) >= np.abs(spec[0]), spec[1], spec[0]) + wave = istft(v_spec_m, 1024) + elif algorithm == "Default": + wave = (wave[1] * value) + (wave[0] * (1 - value)) + elif algorithm == "Invert_p": + X_mag = np.abs(spec[0]) + y_mag = np.abs(spec[1]) + max_mag = np.where(X_mag >= y_mag, X_mag, y_mag) + v_spec = spec[1] - max_mag * np.exp(1.0j * np.angle(spec[0])) + wave = istft(v_spec, 1024) + + return wave + + +def spectrogram_to_wave_no_mp(spec, n_fft=2048, hop_length=1024): + wave = librosa.istft(spec, n_fft=n_fft, hop_length=hop_length) + + if wave.ndim == 1: + wave = np.asfortranarray([wave, wave]) + + return wave + + +def wave_to_spectrogram_no_mp(wave): + + spec = librosa.stft(wave, n_fft=2048, hop_length=1024) + + if spec.ndim == 1: + spec = np.asfortranarray([spec, spec]) + + return spec + + +def invert_audio(specs, invert_p=True): + + ln = min([specs[0].shape[2], specs[1].shape[2]]) + specs[0] = specs[0][:, :, :ln] + specs[1] = specs[1][:, :, :ln] + + if invert_p: + X_mag = np.abs(specs[0]) + y_mag = np.abs(specs[1]) + max_mag = np.where(X_mag >= y_mag, X_mag, y_mag) + v_spec = specs[1] - max_mag * np.exp(1.0j * np.angle(specs[0])) + else: + specs[1] = reduce_vocal_aggressively(specs[0], specs[1], 0.2) + v_spec = specs[0] - specs[1] + + return v_spec + + +def invert_stem(mixture, stem): + mixture = wave_to_spectrogram_no_mp(mixture) + stem = wave_to_spectrogram_no_mp(stem) + output = spectrogram_to_wave_no_mp(invert_audio([mixture, stem])) + + return -output.T + + +def ensembling(a, inputs, is_wavs=False): + + for i in range(1, len(inputs)): + if i == 1: + input = inputs[0] + + if is_wavs: + ln = min([input.shape[1], inputs[i].shape[1]]) + input = input[:, :ln] + inputs[i] = inputs[i][:, :ln] + else: + ln = min([input.shape[2], inputs[i].shape[2]]) + 
input = input[:, :, :ln] + inputs[i] = inputs[i][:, :, :ln] + + if MIN_SPEC == a: + input = np.where(np.abs(inputs[i]) <= np.abs(input), inputs[i], input) + if MAX_SPEC == a: + input = np.where(np.abs(inputs[i]) >= np.abs(input), inputs[i], input) + + # linear_ensemble + # input = ensemble_wav(inputs, split_size=1) + + return input + + +def ensemble_for_align(waves): + + specs = [] + + for wav in waves: + spec = wave_to_spectrogram_no_mp(wav.T) + specs.append(spec) + + wav_aligned = spectrogram_to_wave_no_mp(ensembling(MIN_SPEC, specs)).T + wav_aligned = match_array_shapes(wav_aligned, waves[1], is_swap=True) + + return wav_aligned + + +def ensemble_inputs(audio_input, algorithm, is_normalization, wav_type_set, save_path, is_wave=False, is_array=False): + + wavs_ = [] + + if algorithm == AVERAGE: + output = average_audio(audio_input) + samplerate = 44100 + else: + specs = [] + + for i in range(len(audio_input)): + wave, samplerate = librosa.load(audio_input[i], mono=False, sr=44100) + wavs_.append(wave) + spec = wave if is_wave else wave_to_spectrogram_no_mp(wave) + specs.append(spec) + + wave_shapes = [w.shape[1] for w in wavs_] + target_shape = wavs_[wave_shapes.index(max(wave_shapes))] + + if is_wave: + output = ensembling(algorithm, specs, is_wavs=True) + else: + output = spectrogram_to_wave_no_mp(ensembling(algorithm, specs)) + + output = to_shape(output, target_shape.shape) + + sf.write(save_path, normalize(output.T, is_normalization), samplerate, subtype=wav_type_set) + + +def to_shape(x, target_shape): + padding_list = [] + for x_dim, target_dim in zip(x.shape, target_shape): + pad_value = target_dim - x_dim + pad_tuple = (0, pad_value) + padding_list.append(pad_tuple) + + return np.pad(x, tuple(padding_list), mode="constant") + + +def to_shape_minimize(x: np.ndarray, target_shape): + + padding_list = [] + for x_dim, target_dim in zip(x.shape, target_shape): + pad_value = target_dim - x_dim + pad_tuple = (0, pad_value) + padding_list.append(pad_tuple) + + return np.pad(x, tuple(padding_list), mode="constant") + + +def detect_leading_silence(audio, sr, silence_threshold=0.007, frame_length=1024): + """ + Detect silence at the beginning of an audio signal. + + :param audio: np.array, audio signal + :param sr: int, sample rate + :param silence_threshold: float, magnitude threshold below which is considered silence + :param frame_length: int, the number of samples to consider for each check + + :return: float, duration of the leading silence in milliseconds + """ + + if len(audio.shape) == 2: + # If stereo, pick the channel with more energy to determine the silence + channel = np.argmax(np.sum(np.abs(audio), axis=1)) + audio = audio[channel] + + for i in range(0, len(audio), frame_length): + if np.max(np.abs(audio[i : i + frame_length])) > silence_threshold: + return (i / sr) * 1000 + + return (len(audio) / sr) * 1000 + + +def adjust_leading_silence(target_audio, reference_audio, silence_threshold=0.01, frame_length=1024): + """ + Adjust the leading silence of the target_audio to match the leading silence of the reference_audio. 
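# Minimal sketch of spec_utils.to_shape() defined above: zero-pads the end of each axis so a
# shorter array matches a target shape (used when averaging or combining stems of different lengths).
import numpy as np
from audio_separator.separator.uvr_lib_v5 import spec_utils

short = np.ones((2, 100))
padded = spec_utils.to_shape(short, (2, 160))
print(padded.shape)          # (2, 160)
print(padded[0, 100:105])    # [0. 0. 0. 0. 0.] - the extra samples are zeros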
+ + :param target_audio: np.array, audio signal that will have its silence adjusted + :param reference_audio: np.array, audio signal used as a reference + :param sr: int, sample rate + :param silence_threshold: float, magnitude threshold below which is considered silence + :param frame_length: int, the number of samples to consider for each check + + :return: np.array, target_audio adjusted to have the same leading silence as reference_audio + """ + + def find_silence_end(audio): + if len(audio.shape) == 2: + # If stereo, pick the channel with more energy to determine the silence + channel = np.argmax(np.sum(np.abs(audio), axis=1)) + audio_mono = audio[channel] + else: + audio_mono = audio + + for i in range(0, len(audio_mono), frame_length): + if np.max(np.abs(audio_mono[i : i + frame_length])) > silence_threshold: + return i + return len(audio_mono) + + ref_silence_end = find_silence_end(reference_audio) + target_silence_end = find_silence_end(target_audio) + silence_difference = ref_silence_end - target_silence_end + + try: + ref_silence_end_p = (ref_silence_end / 44100) * 1000 + target_silence_end_p = (target_silence_end / 44100) * 1000 + silence_difference_p = ref_silence_end_p - target_silence_end_p + print("silence_difference: ", silence_difference_p) + except Exception as e: + pass + + if silence_difference > 0: # Add silence to target_audio + if len(target_audio.shape) == 2: # stereo + silence_to_add = np.zeros((target_audio.shape[0], silence_difference)) + else: # mono + silence_to_add = np.zeros(silence_difference) + return np.hstack((silence_to_add, target_audio)) + elif silence_difference < 0: # Remove silence from target_audio + if len(target_audio.shape) == 2: # stereo + return target_audio[:, -silence_difference:] + else: # mono + return target_audio[-silence_difference:] + else: # No adjustment needed + return target_audio + + +def match_array_shapes(array_1: np.ndarray, array_2: np.ndarray, is_swap=False): + + if is_swap: + array_1, array_2 = array_1.T, array_2.T + + # print("before", array_1.shape, array_2.shape) + if array_1.shape[1] > array_2.shape[1]: + array_1 = array_1[:, : array_2.shape[1]] + elif array_1.shape[1] < array_2.shape[1]: + padding = array_2.shape[1] - array_1.shape[1] + array_1 = np.pad(array_1, ((0, 0), (0, padding)), "constant", constant_values=0) + + # print("after", array_1.shape, array_2.shape) + + if is_swap: + array_1, array_2 = array_1.T, array_2.T + + return array_1 + + +def match_mono_array_shapes(array_1: np.ndarray, array_2: np.ndarray): + + if len(array_1) > len(array_2): + array_1 = array_1[: len(array_2)] + elif len(array_1) < len(array_2): + padding = len(array_2) - len(array_1) + array_1 = np.pad(array_1, (0, padding), "constant", constant_values=0) + + return array_1 + + +def change_pitch_semitones(y, sr, semitone_shift): + factor = 2 ** (semitone_shift / 12) # Convert semitone shift to factor for resampling + y_pitch_tuned = [] + for y_channel in y: + y_pitch_tuned.append(librosa.resample(y_channel, orig_sr=sr, target_sr=sr * factor, res_type=wav_resolution_float_resampling)) + y_pitch_tuned = np.array(y_pitch_tuned) + new_sr = sr * factor + return y_pitch_tuned, new_sr + + +def augment_audio(export_path, audio_file, rate, is_normalization, wav_type_set, save_format=None, is_pitch=False, is_time_correction=True): + + wav, sr = librosa.load(audio_file, sr=44100, mono=False) + + if wav.ndim == 1: + wav = np.asfortranarray([wav, wav]) + + if not is_time_correction: + wav_mix = change_pitch_semitones(wav, 44100, semitone_shift=-rate)[0] 
+ else: + if is_pitch: + wav_1 = pyrb.pitch_shift(wav[0], sr, rate, rbargs=None) + wav_2 = pyrb.pitch_shift(wav[1], sr, rate, rbargs=None) + else: + wav_1 = pyrb.time_stretch(wav[0], sr, rate, rbargs=None) + wav_2 = pyrb.time_stretch(wav[1], sr, rate, rbargs=None) + + if wav_1.shape > wav_2.shape: + wav_2 = to_shape(wav_2, wav_1.shape) + if wav_1.shape < wav_2.shape: + wav_1 = to_shape(wav_1, wav_2.shape) + + wav_mix = np.asfortranarray([wav_1, wav_2]) + + sf.write(export_path, normalize(wav_mix.T, is_normalization), sr, subtype=wav_type_set) + save_format(export_path) + + +def average_audio(audio): + + waves = [] + wave_shapes = [] + final_waves = [] + + for i in range(len(audio)): + wave = librosa.load(audio[i], sr=44100, mono=False) + waves.append(wave[0]) + wave_shapes.append(wave[0].shape[1]) + + wave_shapes_index = wave_shapes.index(max(wave_shapes)) + target_shape = waves[wave_shapes_index] + waves.pop(wave_shapes_index) + final_waves.append(target_shape) + + for n_array in waves: + wav_target = to_shape(n_array, target_shape.shape) + final_waves.append(wav_target) + + waves = sum(final_waves) + waves = waves / len(audio) + + return waves + + +def average_dual_sources(wav_1, wav_2, value): + + if wav_1.shape > wav_2.shape: + wav_2 = to_shape(wav_2, wav_1.shape) + if wav_1.shape < wav_2.shape: + wav_1 = to_shape(wav_1, wav_2.shape) + + wave = (wav_1 * value) + (wav_2 * (1 - value)) + + return wave + + +def reshape_sources(wav_1: np.ndarray, wav_2: np.ndarray): + + if wav_1.shape > wav_2.shape: + wav_2 = to_shape(wav_2, wav_1.shape) + if wav_1.shape < wav_2.shape: + ln = min([wav_1.shape[1], wav_2.shape[1]]) + wav_2 = wav_2[:, :ln] + + ln = min([wav_1.shape[1], wav_2.shape[1]]) + wav_1 = wav_1[:, :ln] + wav_2 = wav_2[:, :ln] + + return wav_2 + + +def reshape_sources_ref(wav_1_shape, wav_2: np.ndarray): + + if wav_1_shape > wav_2.shape: + wav_2 = to_shape(wav_2, wav_1_shape) + + return wav_2 + + +def combine_arrarys(audio_sources, is_swap=False): + source = np.zeros_like(max(audio_sources, key=np.size)) + + for v in audio_sources: + v = match_array_shapes(v, source, is_swap=is_swap) + source += v + + return source + + +def combine_audio(paths: list, audio_file_base=None, wav_type_set="FLOAT", save_format=None): + + source = combine_arrarys([load_audio(i) for i in paths]) + save_path = f"{audio_file_base}_combined.wav" + sf.write(save_path, source.T, 44100, subtype=wav_type_set) + save_format(save_path) + + +def reduce_mix_bv(inst_source, voc_source, reduction_rate=0.9): + # Reduce the volume + inst_source = inst_source * (1 - reduction_rate) + + mix_reduced = combine_arrarys([inst_source, voc_source], is_swap=True) + + return mix_reduced + + +def organize_inputs(inputs): + input_list = {"target": None, "reference": None, "reverb": None, "inst": None} + + for i in inputs: + if i.endswith("_(Vocals).wav"): + input_list["reference"] = i + elif "_RVC_" in i: + input_list["target"] = i + elif i.endswith("reverbed_stem.wav"): + input_list["reverb"] = i + elif i.endswith("_(Instrumental).wav"): + input_list["inst"] = i + + return input_list + + +def check_if_phase_inverted(wav1, wav2, is_mono=False): + # Load the audio files + if not is_mono: + wav1 = np.mean(wav1, axis=0) + wav2 = np.mean(wav2, axis=0) + + # Compute the correlation + correlation = np.corrcoef(wav1[:1000], wav2[:1000]) + + return correlation[0, 1] < 0 + + +def align_audio( + file1, + file2, + file2_aligned, + file_subtracted, + wav_type_set, + is_save_aligned, + command_Text, + save_format, + align_window: list, + 
align_intro_val: list, + db_analysis: tuple, + set_progress_bar, + phase_option, + phase_shifts, + is_match_silence, + is_spec_match, +): + + global progress_value + progress_value = 0 + is_mono = False + + def get_diff(a, b): + corr = np.correlate(a, b, "full") + diff = corr.argmax() - (b.shape[0] - 1) + + return diff + + def progress_bar(length): + global progress_value + progress_value += 1 + + if (0.90 / length * progress_value) >= 0.9: + length = progress_value + 1 + + set_progress_bar(0.1, (0.9 / length * progress_value)) + + # read tracks + + if file1.endswith(".mp3") and is_macos: + length1 = rerun_mp3(file1) + wav1, sr1 = librosa.load(file1, duration=length1, sr=44100, mono=False) + else: + wav1, sr1 = librosa.load(file1, sr=44100, mono=False) + + if file2.endswith(".mp3") and is_macos: + length2 = rerun_mp3(file2) + wav2, sr2 = librosa.load(file2, duration=length2, sr=44100, mono=False) + else: + wav2, sr2 = librosa.load(file2, sr=44100, mono=False) + + if wav1.ndim == 1 and wav2.ndim == 1: + is_mono = True + elif wav1.ndim == 1: + wav1 = np.asfortranarray([wav1, wav1]) + elif wav2.ndim == 1: + wav2 = np.asfortranarray([wav2, wav2]) + + # Check if phase is inverted + if phase_option == AUTO_PHASE: + if check_if_phase_inverted(wav1, wav2, is_mono=is_mono): + wav2 = -wav2 + elif phase_option == POSITIVE_PHASE: + wav2 = +wav2 + elif phase_option == NEGATIVE_PHASE: + wav2 = -wav2 + + if is_match_silence: + wav2 = adjust_leading_silence(wav2, wav1) + + wav1_length = int(librosa.get_duration(y=wav1, sr=44100)) + wav2_length = int(librosa.get_duration(y=wav2, sr=44100)) + + if not is_mono: + wav1 = wav1.transpose() + wav2 = wav2.transpose() + + wav2_org = wav2.copy() + + command_Text("Processing files... \n") + seconds_length = min(wav1_length, wav2_length) + + wav2_aligned_sources = [] + + for sec_len in align_intro_val: + # pick a position at 1 second in and get diff + sec_seg = 1 if sec_len == 1 else int(seconds_length // sec_len) + index = sr1 * sec_seg # 1 second in, assuming sr1 = sr2 = 44100 + + if is_mono: + samp1, samp2 = wav1[index : index + sr1], wav2[index : index + sr1] + diff = get_diff(samp1, samp2) + # print(f"Estimated difference: {diff}\n") + else: + index = sr1 * sec_seg # 1 second in, assuming sr1 = sr2 = 44100 + samp1, samp2 = wav1[index : index + sr1, 0], wav2[index : index + sr1, 0] + samp1_r, samp2_r = wav1[index : index + sr1, 1], wav2[index : index + sr1, 1] + diff, diff_r = get_diff(samp1, samp2), get_diff(samp1_r, samp2_r) + # print(f"Estimated difference Left Channel: {diff}\nEstimated difference Right Channel: {diff_r}\n") + + # make aligned track 2 + if diff > 0: + zeros_to_append = np.zeros(diff) if is_mono else np.zeros((diff, 2)) + wav2_aligned = np.append(zeros_to_append, wav2_org, axis=0) + elif diff < 0: + wav2_aligned = wav2_org[-diff:] + else: + wav2_aligned = wav2_org + # command_Text(f"Audio files already aligned.\n") + + if not any(np.array_equal(wav2_aligned, source) for source in wav2_aligned_sources): + wav2_aligned_sources.append(wav2_aligned) + + # print("Unique Sources: ", len(wav2_aligned_sources)) + + unique_sources = len(wav2_aligned_sources) + + sub_mapper_big_mapper = {} + + for s in wav2_aligned_sources: + wav2_aligned = match_mono_array_shapes(s, wav1) if is_mono else match_array_shapes(s, wav1, is_swap=True) + + if align_window: + wav_sub = time_correction( + wav1, wav2_aligned, seconds_length, align_window=align_window, db_analysis=db_analysis, progress_bar=progress_bar, unique_sources=unique_sources, phase_shifts=phase_shifts + ) 
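# Minimal sketch of the nested get_diff() helper used by align_audio() above (re-declared here
# for illustration, since it is defined inside the function): the peak of the full
# cross-correlation gives the lag of the second signal relative to the first.
import numpy as np

def get_diff(a, b):
    corr = np.correlate(a, b, "full")
    return corr.argmax() - (b.shape[0] - 1)

ref = np.zeros(1000)
ref[100] = 1.0               # impulse at sample 100
late = np.zeros(1000)
late[130] = 1.0              # same impulse, 30 samples later
print(get_diff(ref, late))   # -30: align_audio would trim 30 samples from the start of track 2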
+ wav_sub_size = np.abs(wav_sub).mean() + sub_mapper_big_mapper = {**sub_mapper_big_mapper, **{wav_sub_size: wav_sub}} + else: + wav2_aligned = wav2_aligned * np.power(10, db_analysis[0] / 20) + db_range = db_analysis[1] + + for db_adjustment in db_range: + # Adjust the dB of track2 + s_adjusted = wav2_aligned * (10 ** (db_adjustment / 20)) + wav_sub = wav1 - s_adjusted + wav_sub_size = np.abs(wav_sub).mean() + sub_mapper_big_mapper = {**sub_mapper_big_mapper, **{wav_sub_size: wav_sub}} + + # print(sub_mapper_big_mapper.keys(), min(sub_mapper_big_mapper.keys())) + + sub_mapper_value_list = list(sub_mapper_big_mapper.values()) + + if is_spec_match and len(sub_mapper_value_list) >= 2: + # print("using spec ensemble with align") + wav_sub = ensemble_for_align(list(sub_mapper_big_mapper.values())) + else: + # print("using linear ensemble with align") + wav_sub = ensemble_wav(list(sub_mapper_big_mapper.values())) + + # print(f"Mix Mean: {np.abs(wav1).mean()}\nInst Mean: {np.abs(wav2).mean()}") + # print('Final: ', np.abs(wav_sub).mean()) + wav_sub = np.clip(wav_sub, -1, +1) + + command_Text(f"Saving inverted track... ") + + if is_save_aligned or is_spec_match: + wav1 = match_mono_array_shapes(wav1, wav_sub) if is_mono else match_array_shapes(wav1, wav_sub, is_swap=True) + wav2_aligned = wav1 - wav_sub + + if is_spec_match: + if wav1.ndim == 1 and wav2.ndim == 1: + wav2_aligned = np.asfortranarray([wav2_aligned, wav2_aligned]).T + wav1 = np.asfortranarray([wav1, wav1]).T + + wav2_aligned = ensemble_for_align([wav2_aligned, wav1]) + wav_sub = wav1 - wav2_aligned + + if is_save_aligned: + sf.write(file2_aligned, wav2_aligned, sr1, subtype=wav_type_set) + save_format(file2_aligned) + + sf.write(file_subtracted, wav_sub, sr1, subtype=wav_type_set) + save_format(file_subtracted) + + +def phase_shift_hilbert(signal, degree): + analytic_signal = hilbert(signal) + return np.cos(np.radians(degree)) * analytic_signal.real - np.sin(np.radians(degree)) * analytic_signal.imag + + +def get_phase_shifted_tracks(track, phase_shift): + if phase_shift == 180: + return [track, -track] + + step = phase_shift + end = 180 - (180 % step) if 180 % step == 0 else 181 + phase_range = range(step, end, step) + + flipped_list = [track, -track] + for i in phase_range: + flipped_list.extend([phase_shift_hilbert(track, i), phase_shift_hilbert(track, -i)]) + + return flipped_list + + +def time_correction(mix: np.ndarray, instrumental: np.ndarray, seconds_length, align_window, db_analysis, sr=44100, progress_bar=None, unique_sources=None, phase_shifts=NONE_P): + # Function to align two tracks using cross-correlation + + def align_tracks(track1, track2): + # A dictionary to store each version of track2_shifted and its mean absolute value + shifted_tracks = {} + + # Loop to adjust dB of track2 + track2 = track2 * np.power(10, db_analysis[0] / 20) + db_range = db_analysis[1] + + if phase_shifts == 190: + track2_flipped = [track2] + else: + track2_flipped = get_phase_shifted_tracks(track2, phase_shifts) + + for db_adjustment in db_range: + for t in track2_flipped: + # Adjust the dB of track2 + track2_adjusted = t * (10 ** (db_adjustment / 20)) + corr = correlate(track1, track2_adjusted) + delay = np.argmax(np.abs(corr)) - (len(track1) - 1) + track2_shifted = np.roll(track2_adjusted, shift=delay) + + # Compute the mean absolute value of track2_shifted + track2_shifted_sub = track1 - track2_shifted + mean_abs_value = np.abs(track2_shifted_sub).mean() + + # Store track2_shifted and its mean absolute value in the dictionary + 
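# Minimal sketch of spec_utils.phase_shift_hilbert() defined above: it rotates a signal's phase
# via the analytic (Hilbert) signal, which get_phase_shifted_tracks() uses to try several phase
# candidates during alignment.
import numpy as np
from audio_separator.separator.uvr_lib_v5 import spec_utils

t = np.linspace(0, 1, 44100, endpoint=False)
tone = np.sin(2 * np.pi * 440 * t)

flipped = spec_utils.phase_shift_hilbert(tone, 180)    # polarity inversion
print(np.allclose(flipped, -tone))                     # True (up to float error)
quarter = spec_utils.phase_shift_hilbert(tone, 90)     # ~90 degree shift: the 440 Hz sine becomes roughly a cosine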
shifted_tracks[mean_abs_value] = track2_shifted + + # Return the version of track2_shifted with the smallest mean absolute value + + return shifted_tracks[min(shifted_tracks.keys())] + + # Make sure the audio files have the same shape + + assert mix.shape == instrumental.shape, f"Audio files must have the same shape - Mix: {mix.shape}, Inst: {instrumental.shape}" + + seconds_length = seconds_length // 2 + + sub_mapper = {} + + progress_update_interval = 120 + total_iterations = 0 + + if len(align_window) > 2: + progress_update_interval = 320 + + for secs in align_window: + step = secs / 2 + window_size = int(sr * secs) + step_size = int(sr * step) + + if len(mix.shape) == 1: + total_mono = (len(range(0, len(mix) - window_size, step_size)) // progress_update_interval) * unique_sources + total_iterations += total_mono + else: + total_stereo_ = len(range(0, len(mix[:, 0]) - window_size, step_size)) * 2 + total_stereo = (total_stereo_ // progress_update_interval) * unique_sources + total_iterations += total_stereo + + # print(total_iterations) + + for secs in align_window: + sub = np.zeros_like(mix) + divider = np.zeros_like(mix) + step = secs / 2 + window_size = int(sr * secs) + step_size = int(sr * step) + window = np.hanning(window_size) + + # For the mono case: + if len(mix.shape) == 1: + # The files are mono + counter = 0 + for i in range(0, len(mix) - window_size, step_size): + counter += 1 + if counter % progress_update_interval == 0: + progress_bar(total_iterations) + window_mix = mix[i : i + window_size] * window + window_instrumental = instrumental[i : i + window_size] * window + window_instrumental_aligned = align_tracks(window_mix, window_instrumental) + sub[i : i + window_size] += window_mix - window_instrumental_aligned + divider[i : i + window_size] += window + else: + # The files are stereo + counter = 0 + for ch in range(mix.shape[1]): + for i in range(0, len(mix[:, ch]) - window_size, step_size): + counter += 1 + if counter % progress_update_interval == 0: + progress_bar(total_iterations) + window_mix = mix[i : i + window_size, ch] * window + window_instrumental = instrumental[i : i + window_size, ch] * window + window_instrumental_aligned = align_tracks(window_mix, window_instrumental) + sub[i : i + window_size, ch] += window_mix - window_instrumental_aligned + divider[i : i + window_size, ch] += window + + # Normalize the result by the overlap count + sub = np.where(divider > 1e-6, sub / divider, sub) + sub_size = np.abs(sub).mean() + sub_mapper = {**sub_mapper, **{sub_size: sub}} + + # print("SUB_LEN", len(list(sub_mapper.values()))) + + sub = ensemble_wav(list(sub_mapper.values()), split_size=12) + + return sub + + +def ensemble_wav(waveforms, split_size=240): + # Create a dictionary to hold the thirds of each waveform and their mean absolute values + waveform_thirds = {i: np.array_split(waveform, split_size) for i, waveform in enumerate(waveforms)} + + # Initialize the final waveform + final_waveform = [] + + # For chunk + for third_idx in range(split_size): + # Compute the mean absolute value of each third from each waveform + means = [np.abs(waveform_thirds[i][third_idx]).mean() for i in range(len(waveforms))] + + # Find the index of the waveform with the lowest mean absolute value for this third + min_index = np.argmin(means) + + # Add the least noisy third to the final waveform + final_waveform.append(waveform_thirds[min_index][third_idx]) + + # Concatenate all the thirds to create the final waveform + final_waveform = np.concatenate(final_waveform) + + return 
final_waveform + + +def ensemble_wav_min(waveforms): + for i in range(1, len(waveforms)): + if i == 1: + wave = waveforms[0] + + ln = min(len(wave), len(waveforms[i])) + wave = wave[:ln] + waveforms[i] = waveforms[i][:ln] + + wave = np.where(np.abs(waveforms[i]) <= np.abs(wave), waveforms[i], wave) + + return wave + + +def align_audio_test(wav1, wav2, sr1=44100): + def get_diff(a, b): + corr = np.correlate(a, b, "full") + diff = corr.argmax() - (b.shape[0] - 1) + return diff + + # read tracks + wav1 = wav1.transpose() + wav2 = wav2.transpose() + + # print(f"Audio file shapes: {wav1.shape} / {wav2.shape}\n") + + wav2_org = wav2.copy() + + # pick a position at 1 second in and get diff + index = sr1 # *seconds_length # 1 second in, assuming sr1 = sr2 = 44100 + samp1 = wav1[index : index + sr1, 0] # currently use left channel + samp2 = wav2[index : index + sr1, 0] + diff = get_diff(samp1, samp2) + + # make aligned track 2 + if diff > 0: + wav2_aligned = np.append(np.zeros((diff, 1)), wav2_org, axis=0) + elif diff < 0: + wav2_aligned = wav2_org[-diff:] + else: + wav2_aligned = wav2_org + + return wav2_aligned + + +def load_audio(audio_file): + wav, sr = librosa.load(audio_file, sr=44100, mono=False) + + if wav.ndim == 1: + wav = np.asfortranarray([wav, wav]) + + return wav + + +def rerun_mp3(audio_file): + with audioread.audio_open(audio_file) as f: + track_length = int(f.duration) + + return track_length diff --git a/audio_separator/separator/stft.py b/audio_separator/separator/uvr_lib_v5/stft.py similarity index 93% rename from audio_separator/separator/stft.py rename to audio_separator/separator/uvr_lib_v5/stft.py index c1dd2c3..f440395 100644 --- a/audio_separator/separator/stft.py +++ b/audio_separator/separator/uvr_lib_v5/stft.py @@ -1,10 +1,13 @@ import torch -# These functions perform the Short-Time Fourier Transform (stft) and its inverse (istft). -# They are essential for converting the audio between the time domain and the frequency domain, -# which is a crucial aspect of audio processing in neural networks. class STFT: + """ + This class performs the Short-Time Fourier Transform (STFT) and its inverse (ISTFT). + These functions are essential for converting the audio between the time domain and the frequency domain, + which is a crucial aspect of audio processing in neural networks. + """ + def __init__(self, logger, n_fft, hop_length, dim_f, device): self.logger = logger self.n_fft = n_fft @@ -35,9 +38,7 @@ def __call__(self, input_tensor): reshaped_tensor = input_tensor.reshape([-1, time_dim]) # Perform the Short-Time Fourier Transform (STFT) on the reshaped tensor. - stft_output = torch.stft( - reshaped_tensor, n_fft=self.n_fft, hop_length=self.hop_length, window=stft_window, center=True, return_complex=False - ) + stft_output = torch.stft(reshaped_tensor, n_fft=self.n_fft, hop_length=self.hop_length, window=stft_window, center=True, return_complex=False) # Rearrange the dimensions of the STFT output to bring the frequency dimension forward. 
permuted_stft_output = stft_output.permute([0, 3, 1, 2]) diff --git a/audio_separator/separator/uvr_lib_v5/tfc_tdf_v3.py b/audio_separator/separator/uvr_lib_v5/tfc_tdf_v3.py new file mode 100644 index 0000000..eba006c --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/tfc_tdf_v3.py @@ -0,0 +1,253 @@ +import torch +import torch.nn as nn +from functools import partial + +class STFT: + def __init__(self, n_fft, hop_length, dim_f, device): + self.n_fft = n_fft + self.hop_length = hop_length + self.window = torch.hann_window(window_length=self.n_fft, periodic=True) + self.dim_f = dim_f + self.device = device + + def __call__(self, x): + + x_is_mps = not x.device.type in ["cuda", "cpu"] + if x_is_mps: + x = x.cpu() + + window = self.window.to(x.device) + batch_dims = x.shape[:-2] + c, t = x.shape[-2:] + x = x.reshape([-1, t]) + x = torch.stft(x, n_fft=self.n_fft, hop_length=self.hop_length, window=window, center=True,return_complex=False) + x = x.permute([0, 3, 1, 2]) + x = x.reshape([*batch_dims, c, 2, -1, x.shape[-1]]).reshape([*batch_dims, c * 2, -1, x.shape[-1]]) + + if x_is_mps: + x = x.to(self.device) + + return x[..., :self.dim_f, :] + + def inverse(self, x): + + x_is_mps = not x.device.type in ["cuda", "cpu"] + if x_is_mps: + x = x.cpu() + + window = self.window.to(x.device) + batch_dims = x.shape[:-3] + c, f, t = x.shape[-3:] + n = self.n_fft // 2 + 1 + f_pad = torch.zeros([*batch_dims, c, n - f, t]).to(x.device) + x = torch.cat([x, f_pad], -2) + x = x.reshape([*batch_dims, c // 2, 2, n, t]).reshape([-1, 2, n, t]) + x = x.permute([0, 2, 3, 1]) + x = x[..., 0] + x[..., 1] * 1.j + x = torch.istft(x, n_fft=self.n_fft, hop_length=self.hop_length, window=window, center=True) + x = x.reshape([*batch_dims, 2, -1]) + + if x_is_mps: + x = x.to(self.device) + + return x + +def get_norm(norm_type): + def norm(c, norm_type): + if norm_type == 'BatchNorm': + return nn.BatchNorm2d(c) + elif norm_type == 'InstanceNorm': + return nn.InstanceNorm2d(c, affine=True) + elif 'GroupNorm' in norm_type: + g = int(norm_type.replace('GroupNorm', '')) + return nn.GroupNorm(num_groups=g, num_channels=c) + else: + return nn.Identity() + + return partial(norm, norm_type=norm_type) + + +def get_act(act_type): + if act_type == 'gelu': + return nn.GELU() + elif act_type == 'relu': + return nn.ReLU() + elif act_type[:3] == 'elu': + alpha = float(act_type.replace('elu', '')) + return nn.ELU(alpha) + else: + raise Exception + + +class Upscale(nn.Module): + def __init__(self, in_c, out_c, scale, norm, act): + super().__init__() + self.conv = nn.Sequential( + norm(in_c), + act, + nn.ConvTranspose2d(in_channels=in_c, out_channels=out_c, kernel_size=scale, stride=scale, bias=False) + ) + + def forward(self, x): + return self.conv(x) + + +class Downscale(nn.Module): + def __init__(self, in_c, out_c, scale, norm, act): + super().__init__() + self.conv = nn.Sequential( + norm(in_c), + act, + nn.Conv2d(in_channels=in_c, out_channels=out_c, kernel_size=scale, stride=scale, bias=False) + ) + + def forward(self, x): + return self.conv(x) + + +class TFC_TDF(nn.Module): + def __init__(self, in_c, c, l, f, bn, norm, act): + super().__init__() + + self.blocks = nn.ModuleList() + for i in range(l): + block = nn.Module() + + block.tfc1 = nn.Sequential( + norm(in_c), + act, + nn.Conv2d(in_c, c, 3, 1, 1, bias=False), + ) + block.tdf = nn.Sequential( + norm(c), + act, + nn.Linear(f, f // bn, bias=False), + norm(c), + act, + nn.Linear(f // bn, f, bias=False), + ) + block.tfc2 = nn.Sequential( + norm(c), + act, + nn.Conv2d(c, c, 3, 1, 1, 
bias=False), + ) + block.shortcut = nn.Conv2d(in_c, c, 1, 1, 0, bias=False) + + self.blocks.append(block) + in_c = c + + def forward(self, x): + for block in self.blocks: + s = block.shortcut(x) + x = block.tfc1(x) + x = x + block.tdf(x) + x = block.tfc2(x) + x = x + s + return x + + +class TFC_TDF_net(nn.Module): + def __init__(self, config, device): + super().__init__() + self.config = config + self.device = device + + norm = get_norm(norm_type=config.model.norm) + act = get_act(act_type=config.model.act) + + self.num_target_instruments = 1 if config.training.target_instrument else len(config.training.instruments) + self.num_subbands = config.model.num_subbands + + dim_c = self.num_subbands * config.audio.num_channels * 2 + n = config.model.num_scales + scale = config.model.scale + l = config.model.num_blocks_per_scale + c = config.model.num_channels + g = config.model.growth + bn = config.model.bottleneck_factor + f = config.audio.dim_f // self.num_subbands + + self.first_conv = nn.Conv2d(dim_c, c, 1, 1, 0, bias=False) + + self.encoder_blocks = nn.ModuleList() + for i in range(n): + block = nn.Module() + block.tfc_tdf = TFC_TDF(c, c, l, f, bn, norm, act) + block.downscale = Downscale(c, c + g, scale, norm, act) + f = f // scale[1] + c += g + self.encoder_blocks.append(block) + + self.bottleneck_block = TFC_TDF(c, c, l, f, bn, norm, act) + + self.decoder_blocks = nn.ModuleList() + for i in range(n): + block = nn.Module() + block.upscale = Upscale(c, c - g, scale, norm, act) + f = f * scale[1] + c -= g + block.tfc_tdf = TFC_TDF(2 * c, c, l, f, bn, norm, act) + self.decoder_blocks.append(block) + + self.final_conv = nn.Sequential( + nn.Conv2d(c + dim_c, c, 1, 1, 0, bias=False), + act, + nn.Conv2d(c, self.num_target_instruments * dim_c, 1, 1, 0, bias=False) + ) + + self.stft = STFT(config.audio.n_fft, config.audio.hop_length, config.audio.dim_f, self.device) + + def cac2cws(self, x): + k = self.num_subbands + b, c, f, t = x.shape + x = x.reshape(b, c, k, f // k, t) + x = x.reshape(b, c * k, f // k, t) + return x + + def cws2cac(self, x): + k = self.num_subbands + b, c, f, t = x.shape + x = x.reshape(b, c // k, k, f, t) + x = x.reshape(b, c // k, f * k, t) + return x + + def forward(self, x): + + x = self.stft(x) + + mix = x = self.cac2cws(x) + + first_conv_out = x = self.first_conv(x) + + x = x.transpose(-1, -2) + + encoder_outputs = [] + for block in self.encoder_blocks: + x = block.tfc_tdf(x) + encoder_outputs.append(x) + x = block.downscale(x) + + x = self.bottleneck_block(x) + + for block in self.decoder_blocks: + x = block.upscale(x) + x = torch.cat([x, encoder_outputs.pop()], 1) + x = block.tfc_tdf(x) + + x = x.transpose(-1, -2) + + x = x * first_conv_out # reduce artifacts + + x = self.final_conv(torch.cat([mix, x], 1)) + + x = self.cws2cac(x) + + if self.num_target_instruments > 1: + b, c, f, t = x.shape + x = x.reshape(b, self.num_target_instruments, -1, f, t) + + x = self.stft.inverse(x) + + return x + + diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/__init__.py b/audio_separator/separator/uvr_lib_v5/vr_network/__init__.py new file mode 100644 index 0000000..361b708 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/__init__.py @@ -0,0 +1 @@ +# VR init. 
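The subband packing that TFC_TDF_net wraps around its convolutional stack (cac2cws before the first convolution, cws2cac after the final one) is just a pair of reshapes, so it can be sanity-checked in isolation. The sketch below is a standalone illustration with made-up shapes (k=4 subbands, a stereo real/imaginary spectrogram packed as 4 channels); it mirrors the two reshape methods above rather than importing the class, and confirms the round trip is lossless.

```
import torch


def cac2cws(x, k):
    # Pack k frequency subbands into the channel axis: (b, c, f, t) -> (b, c*k, f//k, t)
    b, c, f, t = x.shape
    return x.reshape(b, c, k, f // k, t).reshape(b, c * k, f // k, t)


def cws2cac(x, k):
    # Inverse reshape: unpack the subbands back onto the frequency axis
    b, c, f, t = x.shape
    return x.reshape(b, c // k, k, f, t).reshape(b, c // k, f * k, t)


k = 4                                 # num_subbands, an arbitrary value for this sketch
spec = torch.randn(1, 4, 256, 100)    # (batch, stereo real/imag as channels, dim_f, frames)

packed = cac2cws(spec, k)
restored = cws2cac(packed, k)

print(packed.shape)                   # torch.Size([1, 16, 64, 100])
assert torch.equal(restored, spec)    # pure reshapes, so the round trip is exact
```

Because both directions are plain reshapes over contiguous memory, no information is lost; the packing simply shortens the frequency axis seen by the 2D convolutions while keeping per-subband content separated across channels.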
diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/layers.py b/audio_separator/separator/uvr_lib_v5/vr_network/layers.py new file mode 100644 index 0000000..7526447 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/layers.py @@ -0,0 +1,294 @@ +import torch +from torch import nn +import torch.nn.functional as F + +from audio_separator.separator.uvr_lib_v5 import spec_utils + + +class Conv2DBNActiv(nn.Module): + """ + This class implements a convolutional layer followed by batch normalization and an activation function. + It is a common pattern in deep learning for processing images or feature maps. The convolutional layer + applies a set of learnable filters to the input. Batch normalization then normalizes the output of the + convolution, and finally, an activation function introduces non-linearity to the model, allowing it to + learn more complex patterns. + + Attributes: + conv (nn.Sequential): A sequential container of Conv2d, BatchNorm2d, and an activation layer. + + Args: + num_input_channels (int): Number of input channels. + num_output_channels (int): Number of output channels. + kernel_size (int, optional): Size of the kernel. Defaults to 3. + stride_length (int, optional): Stride of the convolution. Defaults to 1. + padding_size (int, optional): Padding added to all sides of the input. Defaults to 1. + dilation_rate (int, optional): Spacing between kernel elements. Defaults to 1. + activation_function (callable, optional): The activation function to use. Defaults to nn.ReLU. + """ + + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): + super(Conv2DBNActiv, self).__init__() + + # The nn.Sequential container allows us to stack the Conv2d, BatchNorm2d, and activation layers + # into a single module, simplifying the forward pass. + self.conv = nn.Sequential(nn.Conv2d(nin, nout, kernel_size=ksize, stride=stride, padding=pad, dilation=dilation, bias=False), nn.BatchNorm2d(nout), activ()) + + def __call__(self, input_tensor): + # Defines the computation performed at every call. + # Simply passes the input through the sequential container. + return self.conv(input_tensor) + + +class SeperableConv2DBNActiv(nn.Module): + """ + This class implements a separable convolutional layer followed by batch normalization and an activation function. + Separable convolutions are a type of convolution that splits the convolution operation into two simpler operations: + a depthwise convolution and a pointwise convolution. This can reduce the number of parameters and computational cost, + making the network more efficient while maintaining similar performance. + + The depthwise convolution applies a single filter per input channel (input depth). The pointwise convolution, + which follows, applies a 1x1 convolution to combine the outputs of the depthwise convolution across channels. + Batch normalization is then applied to stabilize learning and reduce internal covariate shift. Finally, + an activation function introduces non-linearity, allowing the network to learn complex patterns. + Attributes: + conv (nn.Sequential): A sequential container of depthwise Conv2d, pointwise Conv2d, BatchNorm2d, and an activation layer. + + Args: + num_input_channels (int): Number of input channels. + num_output_channels (int): Number of output channels. + kernel_size (int, optional): Size of the kernel for the depthwise convolution. Defaults to 3. + stride_length (int, optional): Stride of the convolution. Defaults to 1. 
+ padding_size (int, optional): Padding added to all sides of the input for the depthwise convolution. Defaults to 1. + dilation_rate (int, optional): Spacing between kernel elements for the depthwise convolution. Defaults to 1. + activation_function (callable, optional): The activation function to use. Defaults to nn.ReLU. + """ + + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): + super(SeperableConv2DBNActiv, self).__init__() + + # Initialize the sequential container with the depthwise convolution. + # The number of groups in the depthwise convolution is set to num_input_channels, which means each input channel is treated separately. + # The pointwise convolution then combines these separate channels into num_output_channels channels. + # Batch normalization is applied to the output of the pointwise convolution. + # Finally, the activation function is applied to introduce non-linearity. + self.conv = nn.Sequential( + nn.Conv2d( + nin, + nin, # For depthwise convolution, in_channels = out_channels = num_input_channels + kernel_size=ksize, + stride=stride, + padding=pad, + dilation=dilation, + groups=nin, # This makes it a depthwise convolution + bias=False, # Bias is not used because it will be handled by BatchNorm2d + ), + nn.Conv2d( + nin, + nout, # Pointwise convolution to combine channels + kernel_size=1, # Kernel size of 1 for pointwise convolution + bias=False, # Bias is not used because it will be handled by BatchNorm2d + ), + nn.BatchNorm2d(nout), # Normalize the output of the pointwise convolution + activ(), # Apply the activation function + ) + + def __call__(self, input_tensor): + # Pass the input through the sequential container. + # This performs the depthwise convolution, followed by the pointwise convolution, + # batch normalization, and finally applies the activation function. + return self.conv(input_tensor) + + +class Encoder(nn.Module): + """ + The Encoder class is a part of the neural network architecture that is responsible for processing the input data. + It consists of two convolutional layers, each followed by batch normalization and an activation function. + The purpose of the Encoder is to transform the input data into a higher-level, abstract representation. + This is achieved by applying filters (through convolutions) that can capture patterns or features in the data. + The Encoder can be thought of as a feature extractor that prepares the data for further processing by the network. + Attributes: + conv1 (Conv2DBNActiv): The first convolutional layer in the encoder. + conv2 (Conv2DBNActiv): The second convolutional layer in the encoder. + + Args: + number_of_input_channels (int): Number of input channels for the first convolutional layer. + number_of_output_channels (int): Number of output channels for the convolutional layers. + kernel_size (int): Kernel size for the convolutional layers. + stride_length (int): Stride for the convolutional operations. + padding_size (int): Padding added to all sides of the input for the convolutional layers. + activation_function (callable): The activation function to use after each convolutional layer. + """ + + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): + super(Encoder, self).__init__() + + # The first convolutional layer takes the input and applies a convolution, + # followed by batch normalization and an activation function specified by `activation_function`. + # This layer is responsible for capturing the initial set of features from the input data. 
+ self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) + + # The second convolutional layer further processes the output from the first layer, + # applying another set of convolution, batch normalization, and activation. + # This layer helps in capturing more complex patterns in the data by building upon the initial features extracted by conv1. + self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) + + def __call__(self, input_tensor): + # The input data `input_tensor` is passed through the first convolutional layer. + # The output of this layer serves as a 'skip connection' that can be used later in the network to preserve spatial information. + skip = self.conv1(input_tensor) + + # The output from the first layer is then passed through the second convolutional layer. + # This processed data `hidden` is the final output of the Encoder, representing the abstracted features of the input. + hidden = self.conv2(skip) + + # The Encoder returns two outputs: `hidden`, the abstracted feature representation, and `skip`, the intermediate representation from conv1. + return hidden, skip + + +class Decoder(nn.Module): + """ + The Decoder class is part of the neural network architecture, specifically designed to perform the inverse operation of an encoder. + Its main role is to reconstruct or generate data from encoded representations, which is crucial in tasks like image segmentation or audio processing. + This class uses upsampling, convolution, optional dropout for regularization, and concatenation of skip connections to achieve its goal. + + Attributes: + convolution (Conv2DBNActiv): A convolutional layer with batch normalization and activation function. + dropout_layer (nn.Dropout2d): An optional dropout layer for regularization to prevent overfitting. + + Args: + input_channels (int): Number of input channels for the convolutional layer. + output_channels (int): Number of output channels for the convolutional layer. + kernel_size (int): Kernel size for the convolutional layer. + stride (int): Stride for the convolutional operations. + padding (int): Padding added to all sides of the input for the convolutional layer. + activation_function (callable): The activation function to use after the convolutional layer. + include_dropout (bool): Whether to include a dropout layer for regularization. + """ + + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): + super(Decoder, self).__init__() + + # Initialize the convolutional layer with specified parameters. + self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) + + # Initialize the dropout layer if include_dropout is set to True + self.dropout = nn.Dropout2d(0.1) if dropout else None + + def __call__(self, input_tensor, skip=None): + # Upsample the input tensor to a higher resolution using bilinear interpolation. + input_tensor = F.interpolate(input_tensor, scale_factor=2, mode="bilinear", align_corners=True) + # If a skip connection is provided, crop it to match the size of input_tensor and concatenate them along the channel dimension. + if skip is not None: + skip = spec_utils.crop_center(skip, input_tensor) # Crop skip_connection to match input_tensor's dimensions. + input_tensor = torch.cat([input_tensor, skip], dim=1) # Concatenate input_tensor and skip_connection along the channel dimension. + + # Pass the concatenated tensor (or just input_tensor if no skip_connection is provided) through the convolutional layer. 
+ output_tensor = self.conv(input_tensor) + + # If dropout is enabled, apply it to the output of the convolutional layer. + if self.dropout is not None: + output_tensor = self.dropout(output_tensor) + + # Return the final output tensor. + return output_tensor + + +class ASPPModule(nn.Module): + """ + Atrous Spatial Pyramid Pooling (ASPP) Module is designed for capturing multi-scale context by applying + atrous convolution at multiple rates. This is particularly useful in segmentation tasks where capturing + objects at various scales is beneficial. The module applies several parallel dilated convolutions with + different dilation rates to the input feature map, allowing it to efficiently capture information at + multiple scales. + + Attributes: + conv1 (nn.Sequential): Applies adaptive average pooling followed by a 1x1 convolution. + nn_architecture (int): Identifier for the neural network architecture being used. + six_layer (list): List containing architecture identifiers that require six layers. + seven_layer (list): List containing architecture identifiers that require seven layers. + conv2-conv7 (nn.Module): Convolutional layers with varying dilation rates for multi-scale feature extraction. + bottleneck (nn.Sequential): A 1x1 convolutional layer that combines all features followed by dropout for regularization. + """ + + def __init__(self, nn_architecture, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): + """ + Initializes the ASPP module with specified parameters. + + Args: + nn_architecture (int): Identifier for the neural network architecture. + input_channels (int): Number of input channels. + output_channels (int): Number of output channels. + dilations (tuple): Tuple of dilation rates for the atrous convolutions. + activation (callable): Activation function to use after convolutional layers. + """ + super(ASPPModule, self).__init__() + + # Adaptive average pooling reduces the spatial dimensions to 1x1, focusing on global context, + # followed by a 1x1 convolution to project back to the desired channel dimension. + self.conv1 = nn.Sequential(nn.AdaptiveAvgPool2d((1, None)), Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)) + + self.nn_architecture = nn_architecture + # Architecture identifiers for models requiring additional layers. + self.six_layer = [129605] + self.seven_layer = [537238, 537227, 33966] + + # Extra convolutional layer used for six and seven layer configurations. + extra_conv = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + + # Standard 1x1 convolution for channel reduction. + self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) + + # Separable convolutions with different dilation rates for multi-scale feature extraction. + self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) + self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) + self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + + # Depending on the architecture, include the extra convolutional layers. + if self.nn_architecture in self.six_layer: + self.conv6 = extra_conv + nin_x = 6 + elif self.nn_architecture in self.seven_layer: + self.conv6 = extra_conv + self.conv7 = extra_conv + nin_x = 7 + else: + nin_x = 5 + + # Bottleneck layer combines all the multi-scale features into the desired number of output channels. 
+ self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * nin_x, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)) + + def forward(self, input_tensor): + """ + Forward pass of the ASPP module. + + Args: + input_tensor (Tensor): Input tensor. + + Returns: + Tensor: Output tensor after applying ASPP. + """ + _, _, h, w = input_tensor.size() + + # Apply the first convolutional sequence and upsample to the original resolution. + feat1 = F.interpolate(self.conv1(input_tensor), size=(h, w), mode="bilinear", align_corners=True) + + # Apply the remaining convolutions directly on the input. + feat2 = self.conv2(input_tensor) + feat3 = self.conv3(input_tensor) + feat4 = self.conv4(input_tensor) + feat5 = self.conv5(input_tensor) + + # Concatenate features from all layers. Depending on the architecture, include the extra features. + if self.nn_architecture in self.six_layer: + feat6 = self.conv6(input_tensor) + out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6), dim=1) + elif self.nn_architecture in self.seven_layer: + feat6 = self.conv6(input_tensor) + feat7 = self.conv7(input_tensor) + out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) + else: + out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) + + # Apply the bottleneck layer to combine and reduce the channel dimensions. + bottleneck_output = self.bottleneck(out) + return bottleneck_output diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/layers_new.py b/audio_separator/separator/uvr_lib_v5/vr_network/layers_new.py new file mode 100644 index 0000000..56b7d45 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/layers_new.py @@ -0,0 +1,149 @@ +import torch +from torch import nn +import torch.nn.functional as F + +from audio_separator.separator.uvr_lib_v5 import spec_utils + + +class Conv2DBNActiv(nn.Module): + """ + Conv2DBNActiv Class: + This class implements a convolutional layer followed by batch normalization and an activation function. + It is a fundamental building block for constructing neural networks, especially useful in image and audio processing tasks. + The class encapsulates the pattern of applying a convolution, normalizing the output, and then applying a non-linear activation. + """ + + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): + super(Conv2DBNActiv, self).__init__() + + # Sequential model combining Conv2D, BatchNorm, and activation function into a single module + self.conv = nn.Sequential(nn.Conv2d(nin, nout, kernel_size=ksize, stride=stride, padding=pad, dilation=dilation, bias=False), nn.BatchNorm2d(nout), activ()) + + def __call__(self, input_tensor): + # Forward pass through the sequential model + return self.conv(input_tensor) + + +class Encoder(nn.Module): + """ + Encoder Class: + This class defines an encoder module typically used in autoencoder architectures. + It consists of two convolutional layers, each followed by batch normalization and an activation function. 
+ """ + + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): + super(Encoder, self).__init__() + + # First convolutional layer of the encoder + self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ) + # Second convolutional layer of the encoder + self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) + + def __call__(self, input_tensor): + # Applying the first and then the second convolutional layers + hidden = self.conv1(input_tensor) + hidden = self.conv2(hidden) + + return hidden + + +class Decoder(nn.Module): + """ + Decoder Class: + This class defines a decoder module, which is the counterpart of the Encoder class in autoencoder architectures. + It applies a convolutional layer followed by batch normalization and an activation function, with an optional dropout layer for regularization. + """ + + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): + super(Decoder, self).__init__() + # Convolutional layer with optional dropout for regularization + self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) + # self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) + self.dropout = nn.Dropout2d(0.1) if dropout else None + + def __call__(self, input_tensor, skip=None): + # Forward pass through the convolutional layer and optional dropout + input_tensor = F.interpolate(input_tensor, scale_factor=2, mode="bilinear", align_corners=True) + + if skip is not None: + skip = spec_utils.crop_center(skip, input_tensor) + input_tensor = torch.cat([input_tensor, skip], dim=1) + + hidden = self.conv1(input_tensor) + # hidden = self.conv2(hidden) + + if self.dropout is not None: + hidden = self.dropout(hidden) + + return hidden + + +class ASPPModule(nn.Module): + """ + ASPPModule Class: + This class implements the Atrous Spatial Pyramid Pooling (ASPP) module, which is useful for semantic image segmentation tasks. + It captures multi-scale contextual information by applying convolutions at multiple dilation rates. 
+ """ + + def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False): + super(ASPPModule, self).__init__() + + # Global context convolution captures the overall context + self.conv1 = nn.Sequential(nn.AdaptiveAvgPool2d((1, None)), Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ)) + self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ) + self.conv3 = Conv2DBNActiv(nin, nout, 3, 1, dilations[0], dilations[0], activ=activ) + self.conv4 = Conv2DBNActiv(nin, nout, 3, 1, dilations[1], dilations[1], activ=activ) + self.conv5 = Conv2DBNActiv(nin, nout, 3, 1, dilations[2], dilations[2], activ=activ) + self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ) + self.dropout = nn.Dropout2d(0.1) if dropout else None + + def forward(self, input_tensor): + _, _, h, w = input_tensor.size() + + # Upsample global context to match input size and combine with local and multi-scale features + feat1 = F.interpolate(self.conv1(input_tensor), size=(h, w), mode="bilinear", align_corners=True) + feat2 = self.conv2(input_tensor) + feat3 = self.conv3(input_tensor) + feat4 = self.conv4(input_tensor) + feat5 = self.conv5(input_tensor) + out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) + out = self.bottleneck(out) + + if self.dropout is not None: + out = self.dropout(out) + + return out + + +class LSTMModule(nn.Module): + """ + LSTMModule Class: + This class defines a module that combines convolutional feature extraction with a bidirectional LSTM for sequence modeling. + It is useful for tasks that require understanding temporal dynamics in data, such as speech and audio processing. + """ + + def __init__(self, nin_conv, nin_lstm, nout_lstm): + super(LSTMModule, self).__init__() + # Convolutional layer for initial feature extraction + self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0) + + # Bidirectional LSTM for capturing temporal dynamics + self.lstm = nn.LSTM(input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True) + + # Dense layer for output dimensionality matching + self.dense = nn.Sequential(nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU()) + + def forward(self, input_tensor): + N, _, nbins, nframes = input_tensor.size() + + # Extract features and prepare for LSTM + hidden = self.conv(input_tensor)[:, 0] # N, nbins, nframes + hidden = hidden.permute(2, 0, 1) # nframes, N, nbins + h, _ = self.lstm(h) + + # Apply dense layer and reshape to match expected output format + hidden = self.dense(h.reshape(-1, hidden.size()[-1])) # nframes * N, nbins + hidden = hidden.reshape(nframes, N, 1, nbins) + hidden = hidden.permute(1, 2, 3, 0) + + return hidden diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/model_param_init.py b/audio_separator/separator/uvr_lib_v5/vr_network/model_param_init.py new file mode 100644 index 0000000..8bba702 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/model_param_init.py @@ -0,0 +1,71 @@ +import json + +default_param = {} +default_param["bins"] = -1 +default_param["unstable_bins"] = -1 # training only +default_param["stable_bins"] = -1 # training only +default_param["sr"] = 44100 +default_param["pre_filter_start"] = -1 +default_param["pre_filter_stop"] = -1 +default_param["band"] = {} + +N_BINS = "n_bins" + + +def int_keys(d): + """ + Converts string keys that represent integers into actual integer keys in a list. + + This function is particularly useful when dealing with JSON data that may represent + integer keys as strings due to the nature of JSON encoding. 
By converting these keys + back to integers, it ensures that the data can be used in a manner consistent with + its original representation, especially in contexts where the distinction between + string and integer keys is important. + + Args: + input_list (list of tuples): A list of (key, value) pairs where keys are strings + that may represent integers. + + Returns: + dict: A dictionary with keys converted to integers where applicable. + """ + # Initialize an empty dictionary to hold the converted key-value pairs. + result_dict = {} + # Iterate through each key-value pair in the input list. + for key, value in d: + # Check if the key is a digit (i.e., represents an integer). + if key.isdigit(): + # Convert the key from a string to an integer. + key = int(key) + result_dict[key] = value + return result_dict + + +class ModelParameters(object): + """ + A class to manage model parameters, including loading from a configuration file. + + Attributes: + param (dict): Dictionary holding all parameters for the model. + """ + + def __init__(self, config_path=""): + """ + Initializes the ModelParameters object by loading parameters from a JSON configuration file. + + Args: + config_path (str): Path to the JSON configuration file. + """ + + # Load parameters from the given configuration file path. + with open(config_path, "r") as f: + self.param = json.loads(f.read(), object_pairs_hook=int_keys) + + # Ensure certain parameters are set to False if not specified in the configuration. + for k in ["mid_side", "mid_side_b", "mid_side_b2", "stereo_w", "stereo_n", "reverse"]: + if not k in self.param: + self.param[k] = False + + # If 'n_bins' is specified in the parameters, it's used as the value for 'bins'. + if N_BINS in self.param: + self.param["bins"] = self.param[N_BINS] diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr16000_hl512.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr16000_hl512.json new file mode 100644 index 0000000..72cb449 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr16000_hl512.json @@ -0,0 +1,19 @@ +{ + "bins": 1024, + "unstable_bins": 0, + "reduction_bins": 0, + "band": { + "1": { + "sr": 16000, + "hl": 512, + "n_fft": 2048, + "crop_start": 0, + "crop_stop": 1024, + "hpf_start": -1, + "res_type": "sinc_best" + } + }, + "sr": 16000, + "pre_filter_start": 1023, + "pre_filter_stop": 1024 +} \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr32000_hl512.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr32000_hl512.json new file mode 100644 index 0000000..3c00ecf --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr32000_hl512.json @@ -0,0 +1,19 @@ +{ + "bins": 1024, + "unstable_bins": 0, + "reduction_bins": 0, + "band": { + "1": { + "sr": 32000, + "hl": 512, + "n_fft": 2048, + "crop_start": 0, + "crop_stop": 1024, + "hpf_start": -1, + "res_type": "kaiser_fast" + } + }, + "sr": 32000, + "pre_filter_start": 1000, + "pre_filter_stop": 1021 +} \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr33075_hl384.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr33075_hl384.json new file mode 100644 index 0000000..55666ac --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr33075_hl384.json @@ -0,0 +1,19 @@ +{ + "bins": 1024, + "unstable_bins": 0, + "reduction_bins": 
0, + "band": { + "1": { + "sr": 33075, + "hl": 384, + "n_fft": 2048, + "crop_start": 0, + "crop_stop": 1024, + "hpf_start": -1, + "res_type": "sinc_best" + } + }, + "sr": 33075, + "pre_filter_start": 1000, + "pre_filter_stop": 1021 +} \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl1024.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl1024.json new file mode 100644 index 0000000..665abe2 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl1024.json @@ -0,0 +1,19 @@ +{ + "bins": 1024, + "unstable_bins": 0, + "reduction_bins": 0, + "band": { + "1": { + "sr": 44100, + "hl": 1024, + "n_fft": 2048, + "crop_start": 0, + "crop_stop": 1024, + "hpf_start": -1, + "res_type": "sinc_best" + } + }, + "sr": 44100, + "pre_filter_start": 1023, + "pre_filter_stop": 1024 +} \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl256.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl256.json new file mode 100644 index 0000000..0e8b16f --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl256.json @@ -0,0 +1,19 @@ +{ + "bins": 256, + "unstable_bins": 0, + "reduction_bins": 0, + "band": { + "1": { + "sr": 44100, + "hl": 256, + "n_fft": 512, + "crop_start": 0, + "crop_stop": 256, + "hpf_start": -1, + "res_type": "sinc_best" + } + }, + "sr": 44100, + "pre_filter_start": 256, + "pre_filter_stop": 256 +} \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl512.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl512.json new file mode 100644 index 0000000..3b38fca --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl512.json @@ -0,0 +1,19 @@ +{ + "bins": 1024, + "unstable_bins": 0, + "reduction_bins": 0, + "band": { + "1": { + "sr": 44100, + "hl": 512, + "n_fft": 2048, + "crop_start": 0, + "crop_stop": 1024, + "hpf_start": -1, + "res_type": "sinc_best" + } + }, + "sr": 44100, + "pre_filter_start": 1023, + "pre_filter_stop": 1024 +} \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl512_cut.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl512_cut.json new file mode 100644 index 0000000..630df35 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl512_cut.json @@ -0,0 +1,19 @@ +{ + "bins": 1024, + "unstable_bins": 0, + "reduction_bins": 0, + "band": { + "1": { + "sr": 44100, + "hl": 512, + "n_fft": 2048, + "crop_start": 0, + "crop_stop": 700, + "hpf_start": -1, + "res_type": "sinc_best" + } + }, + "sr": 44100, + "pre_filter_start": 1023, + "pre_filter_stop": 700 +} \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl512_nf1024.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl512_nf1024.json new file mode 100644 index 0000000..120ef1a --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl512_nf1024.json @@ -0,0 +1,19 @@ +{ + "bins": 512, + "unstable_bins": 0, + "reduction_bins": 0, + "band": { + "1": { + "sr": 44100, + "hl": 512, + "n_fft": 1024, + "crop_start": 0, + "crop_stop": 512, + "hpf_start": -1, + "res_type": 
"sinc_best" + } + }, + "sr": 44100, + "pre_filter_start": 511, + "pre_filter_stop": 512 +} \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/2band_32000.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/2band_32000.json new file mode 100644 index 0000000..ab9cf11 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/2band_32000.json @@ -0,0 +1,30 @@ +{ + "bins": 768, + "unstable_bins": 7, + "reduction_bins": 705, + "band": { + "1": { + "sr": 6000, + "hl": 66, + "n_fft": 512, + "crop_start": 0, + "crop_stop": 240, + "lpf_start": 60, + "lpf_stop": 118, + "res_type": "sinc_fastest" + }, + "2": { + "sr": 32000, + "hl": 352, + "n_fft": 1024, + "crop_start": 22, + "crop_stop": 505, + "hpf_start": 44, + "hpf_stop": 23, + "res_type": "sinc_medium" + } + }, + "sr": 32000, + "pre_filter_start": 710, + "pre_filter_stop": 731 +} diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/2band_44100_lofi.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/2band_44100_lofi.json new file mode 100644 index 0000000..7faa216 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/2band_44100_lofi.json @@ -0,0 +1,30 @@ +{ + "bins": 512, + "unstable_bins": 7, + "reduction_bins": 510, + "band": { + "1": { + "sr": 11025, + "hl": 160, + "n_fft": 768, + "crop_start": 0, + "crop_stop": 192, + "lpf_start": 41, + "lpf_stop": 139, + "res_type": "sinc_fastest" + }, + "2": { + "sr": 44100, + "hl": 640, + "n_fft": 1024, + "crop_start": 10, + "crop_stop": 320, + "hpf_start": 47, + "hpf_stop": 15, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 510, + "pre_filter_stop": 512 +} diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/2band_48000.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/2band_48000.json new file mode 100644 index 0000000..be075f5 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/2band_48000.json @@ -0,0 +1,30 @@ +{ + "bins": 768, + "unstable_bins": 7, + "reduction_bins": 705, + "band": { + "1": { + "sr": 6000, + "hl": 66, + "n_fft": 512, + "crop_start": 0, + "crop_stop": 240, + "lpf_start": 60, + "lpf_stop": 240, + "res_type": "sinc_fastest" + }, + "2": { + "sr": 48000, + "hl": 528, + "n_fft": 1536, + "crop_start": 22, + "crop_stop": 505, + "hpf_start": 82, + "hpf_stop": 22, + "res_type": "sinc_medium" + } + }, + "sr": 48000, + "pre_filter_start": 710, + "pre_filter_stop": 731 +} \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/3band_44100.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/3band_44100.json new file mode 100644 index 0000000..d99e239 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/3band_44100.json @@ -0,0 +1,42 @@ +{ + "bins": 768, + "unstable_bins": 5, + "reduction_bins": 733, + "band": { + "1": { + "sr": 11025, + "hl": 128, + "n_fft": 768, + "crop_start": 0, + "crop_stop": 278, + "lpf_start": 28, + "lpf_stop": 140, + "res_type": "polyphase" + }, + "2": { + "sr": 22050, + "hl": 256, + "n_fft": 768, + "crop_start": 14, + "crop_stop": 322, + "hpf_start": 70, + "hpf_stop": 14, + "lpf_start": 283, + "lpf_stop": 314, + "res_type": "polyphase" + }, + "3": { + "sr": 44100, + "hl": 512, + "n_fft": 768, + "crop_start": 131, + "crop_stop": 313, + "hpf_start": 154, + "hpf_stop": 141, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 757, + 
"pre_filter_stop": 768 +} diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/3band_44100_mid.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/3band_44100_mid.json new file mode 100644 index 0000000..fc2c487 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/3band_44100_mid.json @@ -0,0 +1,43 @@ +{ + "mid_side": true, + "bins": 768, + "unstable_bins": 5, + "reduction_bins": 733, + "band": { + "1": { + "sr": 11025, + "hl": 128, + "n_fft": 768, + "crop_start": 0, + "crop_stop": 278, + "lpf_start": 28, + "lpf_stop": 140, + "res_type": "polyphase" + }, + "2": { + "sr": 22050, + "hl": 256, + "n_fft": 768, + "crop_start": 14, + "crop_stop": 322, + "hpf_start": 70, + "hpf_stop": 14, + "lpf_start": 283, + "lpf_stop": 314, + "res_type": "polyphase" + }, + "3": { + "sr": 44100, + "hl": 512, + "n_fft": 768, + "crop_start": 131, + "crop_stop": 313, + "hpf_start": 154, + "hpf_stop": 141, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 757, + "pre_filter_stop": 768 +} diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/3band_44100_msb2.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/3band_44100_msb2.json new file mode 100644 index 0000000..33b0877 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/3band_44100_msb2.json @@ -0,0 +1,43 @@ +{ + "mid_side_b2": true, + "bins": 640, + "unstable_bins": 7, + "reduction_bins": 565, + "band": { + "1": { + "sr": 11025, + "hl": 108, + "n_fft": 1024, + "crop_start": 0, + "crop_stop": 187, + "lpf_start": 92, + "lpf_stop": 186, + "res_type": "polyphase" + }, + "2": { + "sr": 22050, + "hl": 216, + "n_fft": 768, + "crop_start": 0, + "crop_stop": 212, + "hpf_start": 68, + "hpf_stop": 34, + "lpf_start": 174, + "lpf_stop": 209, + "res_type": "polyphase" + }, + "3": { + "sr": 44100, + "hl": 432, + "n_fft": 640, + "crop_start": 66, + "crop_stop": 307, + "hpf_start": 86, + "hpf_stop": 72, + "res_type": "kaiser_fast" + } + }, + "sr": 44100, + "pre_filter_start": 639, + "pre_filter_stop": 640 +} diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100.json new file mode 100644 index 0000000..4ae850a --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100.json @@ -0,0 +1,54 @@ +{ + "bins": 768, + "unstable_bins": 7, + "reduction_bins": 668, + "band": { + "1": { + "sr": 11025, + "hl": 128, + "n_fft": 1024, + "crop_start": 0, + "crop_stop": 186, + "lpf_start": 37, + "lpf_stop": 73, + "res_type": "polyphase" + }, + "2": { + "sr": 11025, + "hl": 128, + "n_fft": 512, + "crop_start": 4, + "crop_stop": 185, + "hpf_start": 36, + "hpf_stop": 18, + "lpf_start": 93, + "lpf_stop": 185, + "res_type": "polyphase" + }, + "3": { + "sr": 22050, + "hl": 256, + "n_fft": 512, + "crop_start": 46, + "crop_stop": 186, + "hpf_start": 93, + "hpf_stop": 46, + "lpf_start": 164, + "lpf_stop": 186, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 512, + "n_fft": 768, + "crop_start": 121, + "crop_stop": 382, + "hpf_start": 138, + "hpf_stop": 123, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 740, + "pre_filter_stop": 768 +} diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100_mid.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100_mid.json new file mode 100644 index 0000000..6346701 --- /dev/null +++ 
b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100_mid.json @@ -0,0 +1,55 @@ +{ + "bins": 768, + "unstable_bins": 7, + "mid_side": true, + "reduction_bins": 668, + "band": { + "1": { + "sr": 11025, + "hl": 128, + "n_fft": 1024, + "crop_start": 0, + "crop_stop": 186, + "lpf_start": 37, + "lpf_stop": 73, + "res_type": "polyphase" + }, + "2": { + "sr": 11025, + "hl": 128, + "n_fft": 512, + "crop_start": 4, + "crop_stop": 185, + "hpf_start": 36, + "hpf_stop": 18, + "lpf_start": 93, + "lpf_stop": 185, + "res_type": "polyphase" + }, + "3": { + "sr": 22050, + "hl": 256, + "n_fft": 512, + "crop_start": 46, + "crop_stop": 186, + "hpf_start": 93, + "hpf_stop": 46, + "lpf_start": 164, + "lpf_stop": 186, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 512, + "n_fft": 768, + "crop_start": 121, + "crop_stop": 382, + "hpf_start": 138, + "hpf_stop": 123, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 740, + "pre_filter_stop": 768 +} diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100_msb.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100_msb.json new file mode 100644 index 0000000..0bf4771 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100_msb.json @@ -0,0 +1,55 @@ +{ + "mid_side_b": true, + "bins": 768, + "unstable_bins": 7, + "reduction_bins": 668, + "band": { + "1": { + "sr": 11025, + "hl": 128, + "n_fft": 1024, + "crop_start": 0, + "crop_stop": 186, + "lpf_start": 37, + "lpf_stop": 73, + "res_type": "polyphase" + }, + "2": { + "sr": 11025, + "hl": 128, + "n_fft": 512, + "crop_start": 4, + "crop_stop": 185, + "hpf_start": 36, + "hpf_stop": 18, + "lpf_start": 93, + "lpf_stop": 185, + "res_type": "polyphase" + }, + "3": { + "sr": 22050, + "hl": 256, + "n_fft": 512, + "crop_start": 46, + "crop_stop": 186, + "hpf_start": 93, + "hpf_stop": 46, + "lpf_start": 164, + "lpf_stop": 186, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 512, + "n_fft": 768, + "crop_start": 121, + "crop_stop": 382, + "hpf_start": 138, + "hpf_stop": 123, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 740, + "pre_filter_stop": 768 +} \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100_msb2.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100_msb2.json new file mode 100644 index 0000000..0bf4771 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100_msb2.json @@ -0,0 +1,55 @@ +{ + "mid_side_b": true, + "bins": 768, + "unstable_bins": 7, + "reduction_bins": 668, + "band": { + "1": { + "sr": 11025, + "hl": 128, + "n_fft": 1024, + "crop_start": 0, + "crop_stop": 186, + "lpf_start": 37, + "lpf_stop": 73, + "res_type": "polyphase" + }, + "2": { + "sr": 11025, + "hl": 128, + "n_fft": 512, + "crop_start": 4, + "crop_stop": 185, + "hpf_start": 36, + "hpf_stop": 18, + "lpf_start": 93, + "lpf_stop": 185, + "res_type": "polyphase" + }, + "3": { + "sr": 22050, + "hl": 256, + "n_fft": 512, + "crop_start": 46, + "crop_stop": 186, + "hpf_start": 93, + "hpf_stop": 46, + "lpf_start": 164, + "lpf_stop": 186, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 512, + "n_fft": 768, + "crop_start": 121, + "crop_stop": 382, + "hpf_start": 138, + "hpf_stop": 123, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 740, + "pre_filter_stop": 768 +} \ No newline at end of file diff --git 
a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100_reverse.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100_reverse.json new file mode 100644 index 0000000..779a1c9 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100_reverse.json @@ -0,0 +1,55 @@ +{ + "reverse": true, + "bins": 768, + "unstable_bins": 7, + "reduction_bins": 668, + "band": { + "1": { + "sr": 11025, + "hl": 128, + "n_fft": 1024, + "crop_start": 0, + "crop_stop": 186, + "lpf_start": 37, + "lpf_stop": 73, + "res_type": "polyphase" + }, + "2": { + "sr": 11025, + "hl": 128, + "n_fft": 512, + "crop_start": 4, + "crop_stop": 185, + "hpf_start": 36, + "hpf_stop": 18, + "lpf_start": 93, + "lpf_stop": 185, + "res_type": "polyphase" + }, + "3": { + "sr": 22050, + "hl": 256, + "n_fft": 512, + "crop_start": 46, + "crop_stop": 186, + "hpf_start": 93, + "hpf_stop": 46, + "lpf_start": 164, + "lpf_stop": 186, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 512, + "n_fft": 768, + "crop_start": 121, + "crop_stop": 382, + "hpf_start": 138, + "hpf_stop": 123, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 740, + "pre_filter_stop": 768 +} \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100_sw.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100_sw.json new file mode 100644 index 0000000..1fefd4a --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100_sw.json @@ -0,0 +1,55 @@ +{ + "stereo_w": true, + "bins": 768, + "unstable_bins": 7, + "reduction_bins": 668, + "band": { + "1": { + "sr": 11025, + "hl": 128, + "n_fft": 1024, + "crop_start": 0, + "crop_stop": 186, + "lpf_start": 37, + "lpf_stop": 73, + "res_type": "polyphase" + }, + "2": { + "sr": 11025, + "hl": 128, + "n_fft": 512, + "crop_start": 4, + "crop_stop": 185, + "hpf_start": 36, + "hpf_stop": 18, + "lpf_start": 93, + "lpf_stop": 185, + "res_type": "polyphase" + }, + "3": { + "sr": 22050, + "hl": 256, + "n_fft": 512, + "crop_start": 46, + "crop_stop": 186, + "hpf_start": 93, + "hpf_stop": 46, + "lpf_start": 164, + "lpf_stop": 186, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 512, + "n_fft": 768, + "crop_start": 121, + "crop_stop": 382, + "hpf_start": 138, + "hpf_stop": 123, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 740, + "pre_filter_stop": 768 +} \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_v2.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_v2.json new file mode 100644 index 0000000..af79810 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_v2.json @@ -0,0 +1,54 @@ +{ + "bins": 672, + "unstable_bins": 8, + "reduction_bins": 637, + "band": { + "1": { + "sr": 7350, + "hl": 80, + "n_fft": 640, + "crop_start": 0, + "crop_stop": 85, + "lpf_start": 25, + "lpf_stop": 53, + "res_type": "polyphase" + }, + "2": { + "sr": 7350, + "hl": 80, + "n_fft": 320, + "crop_start": 4, + "crop_stop": 87, + "hpf_start": 25, + "hpf_stop": 12, + "lpf_start": 31, + "lpf_stop": 62, + "res_type": "polyphase" + }, + "3": { + "sr": 14700, + "hl": 160, + "n_fft": 512, + "crop_start": 17, + "crop_stop": 216, + "hpf_start": 48, + "hpf_stop": 24, + "lpf_start": 139, + "lpf_stop": 210, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 480, + "n_fft": 960, + "crop_start": 78, + 
"crop_stop": 383, + "hpf_start": 130, + "hpf_stop": 86, + "res_type": "kaiser_fast" + } + }, + "sr": 44100, + "pre_filter_start": 668, + "pre_filter_stop": 672 +} \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_v2_sn.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_v2_sn.json new file mode 100644 index 0000000..319b998 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_v2_sn.json @@ -0,0 +1,55 @@ +{ + "bins": 672, + "unstable_bins": 8, + "reduction_bins": 637, + "band": { + "1": { + "sr": 7350, + "hl": 80, + "n_fft": 640, + "crop_start": 0, + "crop_stop": 85, + "lpf_start": 25, + "lpf_stop": 53, + "res_type": "polyphase" + }, + "2": { + "sr": 7350, + "hl": 80, + "n_fft": 320, + "crop_start": 4, + "crop_stop": 87, + "hpf_start": 25, + "hpf_stop": 12, + "lpf_start": 31, + "lpf_stop": 62, + "res_type": "polyphase" + }, + "3": { + "sr": 14700, + "hl": 160, + "n_fft": 512, + "crop_start": 17, + "crop_stop": 216, + "hpf_start": 48, + "hpf_stop": 24, + "lpf_start": 139, + "lpf_stop": 210, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 480, + "n_fft": 960, + "crop_start": 78, + "crop_stop": 383, + "hpf_start": 130, + "hpf_stop": 86, + "convert_channels": "stereo_n", + "res_type": "kaiser_fast" + } + }, + "sr": 44100, + "pre_filter_start": 668, + "pre_filter_stop": 672 +} \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_v3.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_v3.json new file mode 100644 index 0000000..2a73bc9 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_v3.json @@ -0,0 +1,54 @@ +{ + "bins": 672, + "unstable_bins": 8, + "reduction_bins": 530, + "band": { + "1": { + "sr": 7350, + "hl": 80, + "n_fft": 640, + "crop_start": 0, + "crop_stop": 85, + "lpf_start": 25, + "lpf_stop": 53, + "res_type": "polyphase" + }, + "2": { + "sr": 7350, + "hl": 80, + "n_fft": 320, + "crop_start": 4, + "crop_stop": 87, + "hpf_start": 25, + "hpf_stop": 12, + "lpf_start": 31, + "lpf_stop": 62, + "res_type": "polyphase" + }, + "3": { + "sr": 14700, + "hl": 160, + "n_fft": 512, + "crop_start": 17, + "crop_stop": 216, + "hpf_start": 48, + "hpf_stop": 24, + "lpf_start": 139, + "lpf_stop": 210, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 480, + "n_fft": 960, + "crop_start": 78, + "crop_stop": 383, + "hpf_start": 130, + "hpf_stop": 86, + "res_type": "kaiser_fast" + } + }, + "sr": 44100, + "pre_filter_start": 668, + "pre_filter_stop": 672 +} \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_v3_sn.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_v3_sn.json new file mode 100644 index 0000000..6680a06 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_v3_sn.json @@ -0,0 +1,55 @@ +{ + "n_bins": 672, + "unstable_bins": 8, + "stable_bins": 530, + "band": { + "1": { + "sr": 7350, + "hl": 80, + "n_fft": 640, + "crop_start": 0, + "crop_stop": 85, + "lpf_start": 25, + "lpf_stop": 53, + "res_type": "polyphase" + }, + "2": { + "sr": 7350, + "hl": 80, + "n_fft": 320, + "crop_start": 4, + "crop_stop": 87, + "hpf_start": 25, + "hpf_stop": 12, + "lpf_start": 31, + "lpf_stop": 62, + "res_type": "polyphase" + }, + "3": { + "sr": 14700, + "hl": 160, + "n_fft": 512, + "crop_start": 17, + "crop_stop": 216, + "hpf_start": 48, + "hpf_stop": 24, + 
"lpf_start": 139, + "lpf_stop": 210, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 480, + "n_fft": 960, + "crop_start": 78, + "crop_stop": 383, + "hpf_start": 130, + "hpf_stop": 86, + "convert_channels": "stereo_n", + "res_type": "kaiser_fast" + } + }, + "sr": 44100, + "pre_filter_start": 668, + "pre_filter_stop": 672 +} \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/ensemble.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/ensemble.json new file mode 100644 index 0000000..ca96bf1 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/ensemble.json @@ -0,0 +1,43 @@ +{ + "mid_side_b2": true, + "bins": 1280, + "unstable_bins": 7, + "reduction_bins": 565, + "band": { + "1": { + "sr": 11025, + "hl": 108, + "n_fft": 2048, + "crop_start": 0, + "crop_stop": 374, + "lpf_start": 92, + "lpf_stop": 186, + "res_type": "polyphase" + }, + "2": { + "sr": 22050, + "hl": 216, + "n_fft": 1536, + "crop_start": 0, + "crop_stop": 424, + "hpf_start": 68, + "hpf_stop": 34, + "lpf_start": 348, + "lpf_stop": 418, + "res_type": "polyphase" + }, + "3": { + "sr": 44100, + "hl": 432, + "n_fft": 1280, + "crop_start": 132, + "crop_stop": 614, + "hpf_start": 172, + "hpf_stop": 144, + "res_type": "polyphase" + } + }, + "sr": 44100, + "pre_filter_start": 1280, + "pre_filter_stop": 1280 +} \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/nets.py b/audio_separator/separator/uvr_lib_v5/vr_network/nets.py new file mode 100644 index 0000000..5e1cfbc --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/nets.py @@ -0,0 +1,175 @@ +import torch +from torch import nn +import torch.nn.functional as F + +from . import layers + + +class BaseASPPNet(nn.Module): + """ + BaseASPPNet Class: + This class defines the base architecture for an Atrous Spatial Pyramid Pooling (ASPP) network. + It is designed to extract features from input data at multiple scales by using dilated convolutions. + This is particularly useful for tasks that benefit from understanding context at different resolutions, + such as semantic segmentation. The network consists of a series of encoder layers for downsampling and feature extraction, + followed by an ASPP module for multi-scale feature extraction, and finally a series of decoder layers for upsampling. + """ + + def __init__(self, nn_architecture, nin, ch, dilations=(4, 8, 16)): + super(BaseASPPNet, self).__init__() + self.nn_architecture = nn_architecture + + # Encoder layers progressively increase the number of channels while reducing spatial dimensions. + self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) + self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) + self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) + self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) + + # Depending on the network architecture, an additional encoder layer and a specific ASPP module are initialized. + if self.nn_architecture == 129605: + self.enc5 = layers.Encoder(ch * 8, ch * 16, 3, 2, 1) + self.aspp = layers.ASPPModule(nn_architecture, ch * 16, ch * 32, dilations) + self.dec5 = layers.Decoder(ch * (16 + 32), ch * 16, 3, 1, 1) + else: + self.aspp = layers.ASPPModule(nn_architecture, ch * 8, ch * 16, dilations) + + # Decoder layers progressively decrease the number of channels while increasing spatial dimensions. 
+ self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) + self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) + self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) + self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) + + def __call__(self, input_tensor): + # The input tensor is passed through a series of encoder layers. + hidden_state, encoder_output1 = self.enc1(input_tensor) + hidden_state, encoder_output2 = self.enc2(hidden_state) + hidden_state, encoder_output3 = self.enc3(hidden_state) + hidden_state, encoder_output4 = self.enc4(hidden_state) + + # Depending on the network architecture, the hidden state is processed by an additional encoder layer and the ASPP module. + if self.nn_architecture == 129605: + hidden_state, encoder_output5 = self.enc5(hidden_state) + hidden_state = self.aspp(hidden_state) + # The decoder layers use skip connections from the encoder layers for better feature integration. + hidden_state = self.dec5(hidden_state, encoder_output5) + else: + hidden_state = self.aspp(hidden_state) + + # The hidden state is further processed by the decoder layers, using skip connections for feature integration. + hidden_state = self.dec4(hidden_state, encoder_output4) + hidden_state = self.dec3(hidden_state, encoder_output3) + hidden_state = self.dec2(hidden_state, encoder_output2) + hidden_state = self.dec1(hidden_state, encoder_output1) + + return hidden_state + + +def determine_model_capacity(n_fft_bins, nn_architecture): + """ + The determine_model_capacity function is designed to select the appropriate model configuration + based on the frequency bins and network architecture. It maps specific architectures to predefined + model capacities, which dictate the structure and parameters of the CascadedASPPNet model. + """ + + # Predefined model architectures categorized by their precision level. + sp_model_arch = [31191, 33966, 129605] + hp_model_arch = [123821, 123812] + hp2_model_arch = [537238, 537227] + + # Mapping network architectures to their corresponding model capacity data. + if nn_architecture in sp_model_arch: + model_capacity_data = [(2, 16), (2, 16), (18, 8, 1, 1, 0), (8, 16), (34, 16, 1, 1, 0), (16, 32), (32, 2, 1), (16, 2, 1), (16, 2, 1)] + + if nn_architecture in hp_model_arch: + model_capacity_data = [(2, 32), (2, 32), (34, 16, 1, 1, 0), (16, 32), (66, 32, 1, 1, 0), (32, 64), (64, 2, 1), (32, 2, 1), (32, 2, 1)] + + if nn_architecture in hp2_model_arch: + model_capacity_data = [(2, 64), (2, 64), (66, 32, 1, 1, 0), (32, 64), (130, 64, 1, 1, 0), (64, 128), (128, 2, 1), (64, 2, 1), (64, 2, 1)] + + # Initializing the CascadedASPPNet model with the selected model capacity data. + cascaded = CascadedASPPNet + model = cascaded(n_fft_bins, model_capacity_data, nn_architecture) + + return model + + +class CascadedASPPNet(nn.Module): + """ + CascadedASPPNet Class: + This class implements a cascaded version of the ASPP network, designed for processing audio signals + for tasks such as vocal removal. It consists of multiple stages, each with its own ASPP network, + to process different frequency bands of the input signal. This allows the model to effectively + handle the full spectrum of audio frequencies by focusing on different frequency bands separately. + """ + + def __init__(self, n_fft, model_capacity_data, nn_architecture): + super(CascadedASPPNet, self).__init__() + # The first stage processes the low and high frequency bands separately. 
+ self.stg1_low_band_net = BaseASPPNet(nn_architecture, *model_capacity_data[0]) + self.stg1_high_band_net = BaseASPPNet(nn_architecture, *model_capacity_data[1]) + + # Bridge layers connect different stages of the network. + self.stg2_bridge = layers.Conv2DBNActiv(*model_capacity_data[2]) + self.stg2_full_band_net = BaseASPPNet(nn_architecture, *model_capacity_data[3]) + + self.stg3_bridge = layers.Conv2DBNActiv(*model_capacity_data[4]) + self.stg3_full_band_net = BaseASPPNet(nn_architecture, *model_capacity_data[5]) + + # Output layers for the final mask prediction and auxiliary outputs. + self.out = nn.Conv2d(*model_capacity_data[6], bias=False) + self.aux1_out = nn.Conv2d(*model_capacity_data[7], bias=False) + self.aux2_out = nn.Conv2d(*model_capacity_data[8], bias=False) + + # Parameters for handling the frequency bins of the input signal. + self.max_bin = n_fft // 2 + self.output_bin = n_fft // 2 + 1 + + self.offset = 128 + + def forward(self, input_tensor): + # The forward pass processes the input tensor through each stage of the network, + # combining the outputs of different frequency bands and stages to produce the final mask. + mix = input_tensor.detach() + input_tensor = input_tensor.clone() + + # Preparing the input tensor by cropping it to the maximum frequency bin. + input_tensor = input_tensor[:, :, : self.max_bin] + + # Processing the low and high frequency bands separately in the first stage. + bandwidth = input_tensor.size()[2] // 2 + aux1 = torch.cat([self.stg1_low_band_net(input_tensor[:, :, :bandwidth]), self.stg1_high_band_net(input_tensor[:, :, bandwidth:])], dim=2) + + # Combining the outputs of the first stage and passing through the second stage. + hidden_state = torch.cat([input_tensor, aux1], dim=1) + aux2 = self.stg2_full_band_net(self.stg2_bridge(hidden_state)) + + # Further processing the combined outputs through the third stage. + hidden_state = torch.cat([input_tensor, aux1, aux2], dim=1) + hidden_state = self.stg3_full_band_net(self.stg3_bridge(hidden_state)) + + # Applying the final output layer to produce the mask. + mask = torch.sigmoid(self.out(hidden_state)) + + # Padding the mask to match the output frequency bin size. + mask = F.pad(input=mask, pad=(0, 0, 0, self.output_bin - mask.size()[2]), mode="replicate") + + # During training, auxiliary outputs are also produced and padded accordingly. + if self.training: + aux1 = torch.sigmoid(self.aux1_out(aux1)) + aux1 = F.pad(input=aux1, pad=(0, 0, 0, self.output_bin - aux1.size()[2]), mode="replicate") + aux2 = torch.sigmoid(self.aux2_out(aux2)) + aux2 = F.pad(input=aux2, pad=(0, 0, 0, self.output_bin - aux2.size()[2]), mode="replicate") + return mask * mix, aux1 * mix, aux2 * mix + else: + return mask # * mix + + def predict_mask(self, input_tensor): + # This method predicts the mask for the input tensor by calling the forward method + # and applying any necessary padding adjustments. + mask = self.forward(input_tensor) + + # Adjusting the mask by removing padding offsets if present. + if self.offset > 0: + mask = mask[:, :, :, self.offset : -self.offset] + + return mask diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/nets_new.py b/audio_separator/separator/uvr_lib_v5/vr_network/nets_new.py new file mode 100644 index 0000000..f49065f --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/nets_new.py @@ -0,0 +1,160 @@ +import torch +from torch import nn +import torch.nn.functional as F +from .
import layers_new as layers + + +class BaseNet(nn.Module): + """ + BaseNet Class: + This class defines the base network architecture for vocal removal. It includes a series of encoders for feature extraction, + an ASPP module for capturing multi-scale context, and a series of decoders for reconstructing the output. Additionally, + it incorporates an LSTM module for capturing temporal dependencies. + """ + + def __init__(self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6))): + super(BaseNet, self).__init__() + # Initialize the encoder layers with increasing output channels for hierarchical feature extraction. + self.enc1 = layers.Conv2DBNActiv(nin, nout, 3, 1, 1) + self.enc2 = layers.Encoder(nout, nout * 2, 3, 2, 1) + self.enc3 = layers.Encoder(nout * 2, nout * 4, 3, 2, 1) + self.enc4 = layers.Encoder(nout * 4, nout * 6, 3, 2, 1) + self.enc5 = layers.Encoder(nout * 6, nout * 8, 3, 2, 1) + + # ASPP module for capturing multi-scale features with different dilation rates. + self.aspp = layers.ASPPModule(nout * 8, nout * 8, dilations, dropout=True) + + # Decoder layers for upscaling and merging features from different levels of the encoder and ASPP module. + self.dec4 = layers.Decoder(nout * (6 + 8), nout * 6, 3, 1, 1) + self.dec3 = layers.Decoder(nout * (4 + 6), nout * 4, 3, 1, 1) + self.dec2 = layers.Decoder(nout * (2 + 4), nout * 2, 3, 1, 1) + + # LSTM module for capturing temporal dependencies in the sequence of features. + self.lstm_dec2 = layers.LSTMModule(nout * 2, nin_lstm, nout_lstm) + self.dec1 = layers.Decoder(nout * (1 + 2) + 1, nout * 1, 3, 1, 1) + + def __call__(self, input_tensor): + # Sequentially pass the input through the encoder layers. + encoded1 = self.enc1(input_tensor) + encoded2 = self.enc2(encoded1) + encoded3 = self.enc3(encoded2) + encoded4 = self.enc4(encoded3) + encoded5 = self.enc5(encoded4) + + # Pass the deepest encoder output through the ASPP module. + bottleneck = self.aspp(encoded5) + + # Sequentially upscale and merge the features using the decoder layers. + bottleneck = self.dec4(bottleneck, encoded4) + bottleneck = self.dec3(bottleneck, encoded3) + bottleneck = self.dec2(bottleneck, encoded2) + # Concatenate the LSTM module output for temporal feature enhancement. + bottleneck = torch.cat([bottleneck, self.lstm_dec2(bottleneck)], dim=1) + bottleneck = self.dec1(bottleneck, encoded1) + + return bottleneck + + +class CascadedNet(nn.Module): + """ + CascadedNet Class: + This class defines a cascaded network architecture that processes input in multiple stages, each stage focusing on different frequency bands. + It utilizes the BaseNet for processing, and combines outputs from different stages to produce the final mask for vocal removal. + """ + + def __init__(self, n_fft, nn_arch_size=51000, nout=32, nout_lstm=128): + super(CascadedNet, self).__init__() + # Calculate frequency bins based on FFT size. + self.max_bin = n_fft // 2 + self.output_bin = n_fft // 2 + 1 + self.nin_lstm = self.max_bin // 2 + self.offset = 64 + # Adjust output channels based on the architecture size. + nout = 64 if nn_arch_size == 218409 else nout + + # print(nout, nout_lstm, n_fft) + + # Initialize the network stages, each focusing on different frequency bands and progressively refining the output. 
+ self.stg1_low_band_net = nn.Sequential(BaseNet(2, nout // 2, self.nin_lstm // 2, nout_lstm), layers.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0)) + self.stg1_high_band_net = BaseNet(2, nout // 4, self.nin_lstm // 2, nout_lstm // 2) + + self.stg2_low_band_net = nn.Sequential(BaseNet(nout // 4 + 2, nout, self.nin_lstm // 2, nout_lstm), layers.Conv2DBNActiv(nout, nout // 2, 1, 1, 0)) + self.stg2_high_band_net = BaseNet(nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2) + + self.stg3_full_band_net = BaseNet(3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm) + + # Output layer for generating the final mask. + self.out = nn.Conv2d(nout, 2, 1, bias=False) + # Auxiliary output layer for intermediate supervision during training. + self.aux_out = nn.Conv2d(3 * nout // 4, 2, 1, bias=False) + + def forward(self, input_tensor): + # Preprocess input tensor to match the maximum frequency bin. + input_tensor = input_tensor[:, :, : self.max_bin] + + # Split the input into low and high frequency bands. + bandw = input_tensor.size()[2] // 2 + l1_in = input_tensor[:, :, :bandw] + h1_in = input_tensor[:, :, bandw:] + + # Process each band through the first stage networks. + l1 = self.stg1_low_band_net(l1_in) + h1 = self.stg1_high_band_net(h1_in) + + # Combine the outputs for auxiliary supervision. + aux1 = torch.cat([l1, h1], dim=2) + + # Prepare inputs for the second stage by concatenating the original and processed bands. + l2_in = torch.cat([l1_in, l1], dim=1) + h2_in = torch.cat([h1_in, h1], dim=1) + + # Process through the second stage networks. + l2 = self.stg2_low_band_net(l2_in) + h2 = self.stg2_high_band_net(h2_in) + + # Combine the outputs for auxiliary supervision. + aux2 = torch.cat([l2, h2], dim=2) + + # Prepare input for the third stage by concatenating all previous outputs with the original input. + f3_in = torch.cat([input_tensor, aux1, aux2], dim=1) + + # Process through the third stage network. + f3 = self.stg3_full_band_net(f3_in) + + # Apply the output layer to generate the final mask and apply sigmoid for normalization. + mask = torch.sigmoid(self.out(f3)) + + # Pad the mask to match the output frequency bin size. + mask = F.pad(input=mask, pad=(0, 0, 0, self.output_bin - mask.size()[2]), mode="replicate") + + # During training, generate and pad the auxiliary output for additional supervision. + if self.training: + aux = torch.cat([aux1, aux2], dim=1) + aux = torch.sigmoid(self.aux_out(aux)) + aux = F.pad(input=aux, pad=(0, 0, 0, self.output_bin - aux.size()[2]), mode="replicate") + return mask, aux + else: + return mask + + # Method for predicting the mask given an input tensor. + def predict_mask(self, input_tensor): + mask = self.forward(input_tensor) + + # If an offset is specified, crop the mask to remove edge artifacts. + if self.offset > 0: + mask = mask[:, :, :, self.offset : -self.offset] + assert mask.size()[3] > 0 + + return mask + + # Method for applying the predicted mask to the input tensor to obtain the predicted magnitude. + def predict(self, input_tensor): + mask = self.forward(input_tensor) + pred_mag = input_tensor * mask + + # If an offset is specified, crop the predicted magnitude to remove edge artifacts.
+ if self.offset > 0: + pred_mag = pred_mag[:, :, :, self.offset : -self.offset] + assert pred_mag.size()[3] > 0 + + return pred_mag diff --git a/audio_separator/utils/cli.py b/audio_separator/utils/cli.py index 060e7cc..c8b4a35 100755 --- a/audio_separator/utils/cli.py +++ b/audio_separator/utils/cli.py @@ -1,6 +1,7 @@ #!/usr/bin/env python import argparse import logging +import json from importlib import metadata @@ -11,106 +12,59 @@ def main(): log_handler.setFormatter(log_formatter) logger.addHandler(log_handler) - parser = argparse.ArgumentParser( - description="Separate audio file into different stems.", - formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, max_help_position=45), - ) + parser = argparse.ArgumentParser(description="Separate audio file into different stems.", formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, max_help_position=45)) parser.add_argument("audio_file", nargs="?", help="The audio file path to separate, in any common format.", default=argparse.SUPPRESS) package_version = metadata.distribution("audio-separator").version parser.add_argument("-v", "--version", action="version", version=f"%(prog)s {package_version}") - parser.add_argument( - "--log_level", - default="info", - help="Optional: logging level, e.g. info, debug, warning (default: %(default)s). Example: --log_level=debug", - ) + parser.add_argument("--log_level", default="info", help="Optional: logging level, e.g. info, debug, warning (default: %(default)s). Example: --log_level=debug") - parser.add_argument( - "--model_name", - default="UVR-MDX-NET-Inst_HQ_3", - help="Optional: model name to be used for separation (default: %(default)s). Example: --model_name=UVR_MDXNET_KARA_2", - ) + parser.add_argument("--list_models", action="store_true", help="List all supported models and exit.") parser.add_argument( - "--model_file_dir", - default="/tmp/audio-separator-models/", - help="Optional: model files directory (default: %(default)s). Example: --model_file_dir=/app/models", + "--model_filename", default="2_HP-UVR.pth", help="Optional: model filename to be used for separation (default: %(default)s). Example: --model_filename=UVR_MDXNET_KARA_2.onnx" ) - parser.add_argument( - "--output_dir", - default=None, - help="Optional: directory to write output files (default: ). Example: --output_dir=/app/separated", - ) + parser.add_argument("--model_file_dir", default="/tmp/audio-separator-models/", help="Optional: model files directory (default: %(default)s). Example: --model_file_dir=/app/models") - parser.add_argument( - "--output_format", - default="FLAC", - help="Optional: output format for separated files, any common format (default: %(default)s). Example: --output_format=MP3", - ) + parser.add_argument("--output_dir", default=None, help="Optional: directory to write output files (default: ). Example: --output_dir=/app/separated") - parser.add_argument( - "--denoise", - type=lambda x: (str(x).lower() == "true"), - default=False, - help="Optional: enable or disable denoising during separation (default: %(default)s). Example: --denoise=True", - ) + parser.add_argument("--output_format", default="FLAC", help="Optional: output format for separated files, any common format (default: %(default)s). Example: --output_format=MP3") parser.add_argument( - "--normalization_threshold", - type=float, - default=0.9, - help="Optional: max peak amplitude to normalize input and output audio to (default: %(default)s). 
Example: --normalization_threshold=0.7", + "--denoise", type=lambda x: (str(x).lower() == "true"), default=False, help="Optional: enable or disable denoising during separation (default: %(default)s). Example: --denoise=True" ) parser.add_argument( - "--single_stem", - default=None, - help="Optional: output only single stem, either instrumental or vocals. Example: --single_stem=instrumental", + "--normalization_threshold", type=float, default=0.9, help="Optional: max peak amplitude to normalize input and output audio to (default: %(default)s). Example: --normalization_threshold=0.7" ) - parser.add_argument( - "--invert_spect", - type=lambda x: (str(x).lower() == "true"), - default=False, - help="Optional: invert secondary stem using spectogram (default: %(default)s). Example: --invert_spect=True", - ) + parser.add_argument("--single_stem", default=None, help="Optional: output only single stem, either instrumental or vocals. Example: --single_stem=instrumental") parser.add_argument( - "--sample_rate", - type=int, - default=44100, - help="Optional: sample_rate (default: %(default)s). Example: --sample_rate=44100", + "--invert_spect", type=lambda x: (str(x).lower() == "true"), default=False, help="Optional: invert secondary stem using spectogram (default: %(default)s). Example: --invert_spect=True" ) - parser.add_argument( - "--hop_length", - type=int, - default=1024, - help="Optional: hop_length (default: %(default)s). Example: --hop_length=1024", - ) + parser.add_argument("--sample_rate", type=int, default=44100, help="Optional: sample_rate (default: %(default)s). Example: --sample_rate=44100") - parser.add_argument( - "--segment_size", - type=int, - default=256, - help="Optional: segment_size (default: %(default)s). Example: --segment_size=256", - ) + parser.add_argument("--mdx_hop_length", type=int, default=1024, help="Optional: mdx_hop_length (default: %(default)s). Example: --mdx_hop_length=1024") + parser.add_argument("--mdx_segment_size", type=int, default=256, help="Optional: mdx_segment_size (default: %(default)s). Example: --mdx_segment_size=256") + parser.add_argument("--mdx_overlap", type=float, default=0.25, help="Optional: mdx_overlap (default: %(default)s). Example: --mdx_overlap=0.25") + parser.add_argument("--mdx_batch_size", type=int, default=1, help="Optional: mdx_batch_size (default: %(default)s). Example: --mdx_batch_size=4") + + parser.add_argument("--vr_batch_size", type=int, default=4, help="Optional: vr_batch_size (default: %(default)s). Example: --vr_batch_size=16") + parser.add_argument("--vr_window_size", type=int, default=512, help="Optional: vr_window_size (default: %(default)s). Example: --vr_window_size=256") + parser.add_argument("--vr_aggression", type=int, default=5, help="Optional: vr_aggression (default: %(default)s). Example: --vr_aggression=2") + parser.add_argument("--vr_enable_tta", type=lambda x: (str(x).lower() == "true"), default=False, help="Optional: vr_enable_tta (default: %(default)s). Example: --vr_enable_tta=True") parser.add_argument( - "--overlap", - type=float, - default=0.25, - help="Optional: overlap (default: %(default)s). Example: --overlap=0.25", + "--vr_enable_post_process", type=lambda x: (str(x).lower() == "true"), default=False, help="Optional: vr_enable_post_process (default: %(default)s). Example: --vr_enable_post_process=True" ) - + parser.add_argument("--vr_post_process_threshold", type=float, default=0.2, help="Optional: vr_post_process_threshold (default: %(default)s). 
Example: --vr_post_process_threshold=0.1") parser.add_argument( - "--batch_size", - type=int, - default=1, - help="Optional: batch_size (default: %(default)s). Example: --batch_size=1", + "--vr_high_end_process", type=lambda x: (str(x).lower() == "true"), default=False, help="Optional: vr_high_end_process (default: %(default)s). Example: --vr_high_end_process=True" ) args = parser.parse_args() @@ -118,6 +72,13 @@ def main(): log_level = getattr(logging, args.log_level.upper()) logger.setLevel(log_level) + if args.list_models: + from audio_separator.separator import Separator + + separator = Separator() + print(json.dumps(separator.list_supported_model_files(), indent=4, sort_keys=True)) + exit(0) + if not hasattr(args, "audio_file"): parser.print_help() exit(1) @@ -133,18 +94,24 @@ def main(): model_file_dir=args.model_file_dir, output_dir=args.output_dir, output_format=args.output_format, - denoise_enabled=args.denoise, + enable_denoise=args.denoise, normalization_threshold=args.normalization_threshold, output_single_stem=args.single_stem, invert_using_spec=args.invert_spect, sample_rate=args.sample_rate, - hop_length=args.hop_length, - segment_size=args.segment_size, - overlap=args.overlap, - batch_size=args.batch_size, - ) - - separator.load_model(args.model_name) + mdx_params={"hop_length": args.mdx_hop_length, "segment_size": args.mdx_segment_size, "overlap": args.mdx_overlap, "batch_size": args.mdx_batch_size}, + vr_params={ + "batch_size": args.vr_batch_size, + "window_size": args.vr_window_size, + "aggression": args.vr_aggression, + "enable_tta": args.vr_enable_tta, + "enable_post_process": args.vr_enable_post_process, + "post_process_threshold": args.vr_post_process_threshold, + "high_end_process": args.vr_high_end_process, + }, + ) + + separator.load_model(args.model_filename) output_files = separator.separate(args.audio_file) diff --git a/poetry.lock b/poetry.lock index 30be610..ed0356b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -62,13 +62,13 @@ uvloop = ["uvloop (>=0.15.2)"] [[package]] name = "certifi" -version = "2023.11.17" +version = "2024.2.2" description = "Python package for providing Mozilla's CA Bundle." 
optional = false python-versions = ">=3.6" files = [ - {file = "certifi-2023.11.17-py3-none-any.whl", hash = "sha256:e036ab49d5b79556f99cfc2d9320b34cfbe5be05c5871b51de9329f0603b0474"}, - {file = "certifi-2023.11.17.tar.gz", hash = "sha256:9b469f3a900bf28dc19b8cfbf8019bf47f7fdd1a65a1d4ffb98fc14166beb4d1"}, + {file = "certifi-2024.2.2-py3-none-any.whl", hash = "sha256:dc383c07b76109f368f6106eee2b593b04a011ea4d55f652c6ca24a754d1cdd1"}, + {file = "certifi-2024.2.2.tar.gz", hash = "sha256:0569859f95fc761b18b45ef421b1290a0f65f147e92a1e5eb3e635f9a5e4e66f"}, ] [[package]] @@ -542,104 +542,101 @@ tests = ["matplotlib (>=3.3.0)", "packaging (>=20.0)", "pytest", "pytest-cov", " [[package]] name = "llvmlite" -version = "0.41.1" +version = "0.42.0" description = "lightweight wrapper around basic LLVM functionality" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "llvmlite-0.41.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c1e1029d47ee66d3a0c4d6088641882f75b93db82bd0e6178f7bd744ebce42b9"}, - {file = "llvmlite-0.41.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:150d0bc275a8ac664a705135e639178883293cf08c1a38de3bbaa2f693a0a867"}, - {file = "llvmlite-0.41.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1eee5cf17ec2b4198b509272cf300ee6577229d237c98cc6e63861b08463ddc6"}, - {file = "llvmlite-0.41.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0dd0338da625346538f1173a17cabf21d1e315cf387ca21b294ff209d176e244"}, - {file = "llvmlite-0.41.1-cp310-cp310-win32.whl", hash = "sha256:fa1469901a2e100c17eb8fe2678e34bd4255a3576d1a543421356e9c14d6e2ae"}, - {file = "llvmlite-0.41.1-cp310-cp310-win_amd64.whl", hash = "sha256:2b76acee82ea0e9304be6be9d4b3840208d050ea0dcad75b1635fa06e949a0ae"}, - {file = "llvmlite-0.41.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:210e458723436b2469d61b54b453474e09e12a94453c97ea3fbb0742ba5a83d8"}, - {file = "llvmlite-0.41.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:855f280e781d49e0640aef4c4af586831ade8f1a6c4df483fb901cbe1a48d127"}, - {file = "llvmlite-0.41.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b67340c62c93a11fae482910dc29163a50dff3dfa88bc874872d28ee604a83be"}, - {file = "llvmlite-0.41.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2181bb63ef3c607e6403813421b46982c3ac6bfc1f11fa16a13eaafb46f578e6"}, - {file = "llvmlite-0.41.1-cp311-cp311-win_amd64.whl", hash = "sha256:9564c19b31a0434f01d2025b06b44c7ed422f51e719ab5d24ff03b7560066c9a"}, - {file = "llvmlite-0.41.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5940bc901fb0325970415dbede82c0b7f3e35c2d5fd1d5e0047134c2c46b3281"}, - {file = "llvmlite-0.41.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:8b0a9a47c28f67a269bb62f6256e63cef28d3c5f13cbae4fab587c3ad506778b"}, - {file = "llvmlite-0.41.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f8afdfa6da33f0b4226af8e64cfc2b28986e005528fbf944d0a24a72acfc9432"}, - {file = "llvmlite-0.41.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8454c1133ef701e8c050a59edd85d238ee18bb9a0eb95faf2fca8b909ee3c89a"}, - {file = "llvmlite-0.41.1-cp38-cp38-win32.whl", hash = "sha256:2d92c51e6e9394d503033ffe3292f5bef1566ab73029ec853861f60ad5c925d0"}, - {file = "llvmlite-0.41.1-cp38-cp38-win_amd64.whl", hash = "sha256:df75594e5a4702b032684d5481db3af990b69c249ccb1d32687b8501f0689432"}, - {file = "llvmlite-0.41.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:04725975e5b2af416d685ea0769f4ecc33f97be541e301054c9f741003085802"}, - {file = "llvmlite-0.41.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:bf14aa0eb22b58c231243dccf7e7f42f7beec48970f2549b3a6acc737d1a4ba4"}, - {file = "llvmlite-0.41.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:92c32356f669e036eb01016e883b22add883c60739bc1ebee3a1cc0249a50828"}, - {file = "llvmlite-0.41.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:24091a6b31242bcdd56ae2dbea40007f462260bc9bdf947953acc39dffd54f8f"}, - {file = "llvmlite-0.41.1-cp39-cp39-win32.whl", hash = "sha256:880cb57ca49e862e1cd077104375b9d1dfdc0622596dfa22105f470d7bacb309"}, - {file = "llvmlite-0.41.1-cp39-cp39-win_amd64.whl", hash = "sha256:92f093986ab92e71c9ffe334c002f96defc7986efda18397d0f08534f3ebdc4d"}, - {file = "llvmlite-0.41.1.tar.gz", hash = "sha256:f19f767a018e6ec89608e1f6b13348fa2fcde657151137cb64e56d48598a92db"}, + {file = "llvmlite-0.42.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:3366938e1bf63d26c34fbfb4c8e8d2ded57d11e0567d5bb243d89aab1eb56098"}, + {file = "llvmlite-0.42.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c35da49666a21185d21b551fc3caf46a935d54d66969d32d72af109b5e7d2b6f"}, + {file = "llvmlite-0.42.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:70f44ccc3c6220bd23e0ba698a63ec2a7d3205da0d848804807f37fc243e3f77"}, + {file = "llvmlite-0.42.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:763f8d8717a9073b9e0246998de89929071d15b47f254c10eef2310b9aac033d"}, + {file = "llvmlite-0.42.0-cp310-cp310-win_amd64.whl", hash = "sha256:8d90edf400b4ceb3a0e776b6c6e4656d05c7187c439587e06f86afceb66d2be5"}, + {file = "llvmlite-0.42.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ae511caed28beaf1252dbaf5f40e663f533b79ceb408c874c01754cafabb9cbf"}, + {file = "llvmlite-0.42.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81e674c2fe85576e6c4474e8c7e7aba7901ac0196e864fe7985492b737dbab65"}, + {file = "llvmlite-0.42.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb3975787f13eb97629052edb5017f6c170eebc1c14a0433e8089e5db43bcce6"}, + {file = "llvmlite-0.42.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c5bece0cdf77f22379f19b1959ccd7aee518afa4afbd3656c6365865f84903f9"}, + {file = "llvmlite-0.42.0-cp311-cp311-win_amd64.whl", hash = "sha256:7e0c4c11c8c2aa9b0701f91b799cb9134a6a6de51444eff5a9087fc7c1384275"}, + {file = "llvmlite-0.42.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:08fa9ab02b0d0179c688a4216b8939138266519aaa0aa94f1195a8542faedb56"}, + {file = "llvmlite-0.42.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b2fce7d355068494d1e42202c7aff25d50c462584233013eb4470c33b995e3ee"}, + {file = "llvmlite-0.42.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ebe66a86dc44634b59a3bc860c7b20d26d9aaffcd30364ebe8ba79161a9121f4"}, + {file = "llvmlite-0.42.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d47494552559e00d81bfb836cf1c4d5a5062e54102cc5767d5aa1e77ccd2505c"}, + {file = "llvmlite-0.42.0-cp312-cp312-win_amd64.whl", hash = "sha256:05cb7e9b6ce69165ce4d1b994fbdedca0c62492e537b0cc86141b6e2c78d5888"}, + {file = "llvmlite-0.42.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bdd3888544538a94d7ec99e7c62a0cdd8833609c85f0c23fcb6c5c591aec60ad"}, + {file = "llvmlite-0.42.0-cp39-cp39-macosx_11_0_arm64.whl", hash = 
"sha256:d0936c2067a67fb8816c908d5457d63eba3e2b17e515c5fe00e5ee2bace06040"}, + {file = "llvmlite-0.42.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a78ab89f1924fc11482209f6799a7a3fc74ddc80425a7a3e0e8174af0e9e2301"}, + {file = "llvmlite-0.42.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d7599b65c7af7abbc978dbf345712c60fd596aa5670496561cc10e8a71cebfb2"}, + {file = "llvmlite-0.42.0-cp39-cp39-win_amd64.whl", hash = "sha256:43d65cc4e206c2e902c1004dd5418417c4efa6c1d04df05c6c5675a27e8ca90e"}, + {file = "llvmlite-0.42.0.tar.gz", hash = "sha256:f92b09243c0cc3f457da8b983f67bd8e1295d0f5b3746c7a1861d7a99403854a"}, ] [[package]] name = "markupsafe" -version = "2.1.4" +version = "2.1.5" description = "Safely add untrusted strings to HTML/XML markup." optional = false python-versions = ">=3.7" files = [ - {file = "MarkupSafe-2.1.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:de8153a7aae3835484ac168a9a9bdaa0c5eee4e0bc595503c95d53b942879c84"}, - {file = "MarkupSafe-2.1.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e888ff76ceb39601c59e219f281466c6d7e66bd375b4ec1ce83bcdc68306796b"}, - {file = "MarkupSafe-2.1.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0b838c37ba596fcbfca71651a104a611543077156cb0a26fe0c475e1f152ee8"}, - {file = "MarkupSafe-2.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dac1ebf6983148b45b5fa48593950f90ed6d1d26300604f321c74a9ca1609f8e"}, - {file = "MarkupSafe-2.1.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0fbad3d346df8f9d72622ac71b69565e621ada2ce6572f37c2eae8dacd60385d"}, - {file = "MarkupSafe-2.1.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d5291d98cd3ad9a562883468c690a2a238c4a6388ab3bd155b0c75dd55ece858"}, - {file = "MarkupSafe-2.1.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a7cc49ef48a3c7a0005a949f3c04f8baa5409d3f663a1b36f0eba9bfe2a0396e"}, - {file = "MarkupSafe-2.1.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b83041cda633871572f0d3c41dddd5582ad7d22f65a72eacd8d3d6d00291df26"}, - {file = "MarkupSafe-2.1.4-cp310-cp310-win32.whl", hash = "sha256:0c26f67b3fe27302d3a412b85ef696792c4a2386293c53ba683a89562f9399b0"}, - {file = "MarkupSafe-2.1.4-cp310-cp310-win_amd64.whl", hash = "sha256:a76055d5cb1c23485d7ddae533229039b850db711c554a12ea64a0fd8a0129e2"}, - {file = "MarkupSafe-2.1.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9e9e3c4020aa2dc62d5dd6743a69e399ce3de58320522948af6140ac959ab863"}, - {file = "MarkupSafe-2.1.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0042d6a9880b38e1dd9ff83146cc3c9c18a059b9360ceae207805567aacccc69"}, - {file = "MarkupSafe-2.1.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:55d03fea4c4e9fd0ad75dc2e7e2b6757b80c152c032ea1d1de487461d8140efc"}, - {file = "MarkupSafe-2.1.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ab3a886a237f6e9c9f4f7d272067e712cdb4efa774bef494dccad08f39d8ae6"}, - {file = "MarkupSafe-2.1.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:abf5ebbec056817057bfafc0445916bb688a255a5146f900445d081db08cbabb"}, - {file = "MarkupSafe-2.1.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e1a0d1924a5013d4f294087e00024ad25668234569289650929ab871231668e7"}, - {file = "MarkupSafe-2.1.4-cp311-cp311-musllinux_1_1_i686.whl", hash = 
"sha256:e7902211afd0af05fbadcc9a312e4cf10f27b779cf1323e78d52377ae4b72bea"}, - {file = "MarkupSafe-2.1.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c669391319973e49a7c6230c218a1e3044710bc1ce4c8e6eb71f7e6d43a2c131"}, - {file = "MarkupSafe-2.1.4-cp311-cp311-win32.whl", hash = "sha256:31f57d64c336b8ccb1966d156932f3daa4fee74176b0fdc48ef580be774aae74"}, - {file = "MarkupSafe-2.1.4-cp311-cp311-win_amd64.whl", hash = "sha256:54a7e1380dfece8847c71bf7e33da5d084e9b889c75eca19100ef98027bd9f56"}, - {file = "MarkupSafe-2.1.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:a76cd37d229fc385738bd1ce4cba2a121cf26b53864c1772694ad0ad348e509e"}, - {file = "MarkupSafe-2.1.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:987d13fe1d23e12a66ca2073b8d2e2a75cec2ecb8eab43ff5624ba0ad42764bc"}, - {file = "MarkupSafe-2.1.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5244324676254697fe5c181fc762284e2c5fceeb1c4e3e7f6aca2b6f107e60dc"}, - {file = "MarkupSafe-2.1.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78bc995e004681246e85e28e068111a4c3f35f34e6c62da1471e844ee1446250"}, - {file = "MarkupSafe-2.1.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a4d176cfdfde84f732c4a53109b293d05883e952bbba68b857ae446fa3119b4f"}, - {file = "MarkupSafe-2.1.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:f9917691f410a2e0897d1ef99619fd3f7dd503647c8ff2475bf90c3cf222ad74"}, - {file = "MarkupSafe-2.1.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:f06e5a9e99b7df44640767842f414ed5d7bedaaa78cd817ce04bbd6fd86e2dd6"}, - {file = "MarkupSafe-2.1.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:396549cea79e8ca4ba65525470d534e8a41070e6b3500ce2414921099cb73e8d"}, - {file = "MarkupSafe-2.1.4-cp312-cp312-win32.whl", hash = "sha256:f6be2d708a9d0e9b0054856f07ac7070fbe1754be40ca8525d5adccdbda8f475"}, - {file = "MarkupSafe-2.1.4-cp312-cp312-win_amd64.whl", hash = "sha256:5045e892cfdaecc5b4c01822f353cf2c8feb88a6ec1c0adef2a2e705eef0f656"}, - {file = "MarkupSafe-2.1.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:7a07f40ef8f0fbc5ef1000d0c78771f4d5ca03b4953fc162749772916b298fc4"}, - {file = "MarkupSafe-2.1.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d18b66fe626ac412d96c2ab536306c736c66cf2a31c243a45025156cc190dc8a"}, - {file = "MarkupSafe-2.1.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:698e84142f3f884114ea8cf83e7a67ca8f4ace8454e78fe960646c6c91c63bfa"}, - {file = "MarkupSafe-2.1.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:49a3b78a5af63ec10d8604180380c13dcd870aba7928c1fe04e881d5c792dc4e"}, - {file = "MarkupSafe-2.1.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:15866d7f2dc60cfdde12ebb4e75e41be862348b4728300c36cdf405e258415ec"}, - {file = "MarkupSafe-2.1.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:6aa5e2e7fc9bc042ae82d8b79d795b9a62bd8f15ba1e7594e3db243f158b5565"}, - {file = "MarkupSafe-2.1.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:54635102ba3cf5da26eb6f96c4b8c53af8a9c0d97b64bdcb592596a6255d8518"}, - {file = "MarkupSafe-2.1.4-cp37-cp37m-win32.whl", hash = "sha256:3583a3a3ab7958e354dc1d25be74aee6228938312ee875a22330c4dc2e41beb0"}, - {file = "MarkupSafe-2.1.4-cp37-cp37m-win_amd64.whl", hash = "sha256:d6e427c7378c7f1b2bef6a344c925b8b63623d3321c09a237b7cc0e77dd98ceb"}, - {file = "MarkupSafe-2.1.4-cp38-cp38-macosx_10_9_universal2.whl", hash 
= "sha256:bf1196dcc239e608605b716e7b166eb5faf4bc192f8a44b81e85251e62584bd2"}, - {file = "MarkupSafe-2.1.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:4df98d4a9cd6a88d6a585852f56f2155c9cdb6aec78361a19f938810aa020954"}, - {file = "MarkupSafe-2.1.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b835aba863195269ea358cecc21b400276747cc977492319fd7682b8cd2c253d"}, - {file = "MarkupSafe-2.1.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23984d1bdae01bee794267424af55eef4dfc038dc5d1272860669b2aa025c9e3"}, - {file = "MarkupSafe-2.1.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1c98c33ffe20e9a489145d97070a435ea0679fddaabcafe19982fe9c971987d5"}, - {file = "MarkupSafe-2.1.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:9896fca4a8eb246defc8b2a7ac77ef7553b638e04fbf170bff78a40fa8a91474"}, - {file = "MarkupSafe-2.1.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:b0fe73bac2fed83839dbdbe6da84ae2a31c11cfc1c777a40dbd8ac8a6ed1560f"}, - {file = "MarkupSafe-2.1.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:c7556bafeaa0a50e2fe7dc86e0382dea349ebcad8f010d5a7dc6ba568eaaa789"}, - {file = "MarkupSafe-2.1.4-cp38-cp38-win32.whl", hash = "sha256:fc1a75aa8f11b87910ffd98de62b29d6520b6d6e8a3de69a70ca34dea85d2a8a"}, - {file = "MarkupSafe-2.1.4-cp38-cp38-win_amd64.whl", hash = "sha256:3a66c36a3864df95e4f62f9167c734b3b1192cb0851b43d7cc08040c074c6279"}, - {file = "MarkupSafe-2.1.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:765f036a3d00395a326df2835d8f86b637dbaf9832f90f5d196c3b8a7a5080cb"}, - {file = "MarkupSafe-2.1.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:21e7af8091007bf4bebf4521184f4880a6acab8df0df52ef9e513d8e5db23411"}, - {file = "MarkupSafe-2.1.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d5c31fe855c77cad679b302aabc42d724ed87c043b1432d457f4976add1c2c3e"}, - {file = "MarkupSafe-2.1.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7653fa39578957bc42e5ebc15cf4361d9e0ee4b702d7d5ec96cdac860953c5b4"}, - {file = "MarkupSafe-2.1.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:47bb5f0142b8b64ed1399b6b60f700a580335c8e1c57f2f15587bd072012decc"}, - {file = "MarkupSafe-2.1.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:fe8512ed897d5daf089e5bd010c3dc03bb1bdae00b35588c49b98268d4a01e00"}, - {file = "MarkupSafe-2.1.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:36d7626a8cca4d34216875aee5a1d3d654bb3dac201c1c003d182283e3205949"}, - {file = "MarkupSafe-2.1.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:b6f14a9cd50c3cb100eb94b3273131c80d102e19bb20253ac7bd7336118a673a"}, - {file = "MarkupSafe-2.1.4-cp39-cp39-win32.whl", hash = "sha256:c8f253a84dbd2c63c19590fa86a032ef3d8cc18923b8049d91bcdeeb2581fbf6"}, - {file = "MarkupSafe-2.1.4-cp39-cp39-win_amd64.whl", hash = "sha256:8b570a1537367b52396e53325769608f2a687ec9a4363647af1cded8928af959"}, - {file = "MarkupSafe-2.1.4.tar.gz", hash = "sha256:3aae9af4cac263007fd6309c64c6ab4506dd2b79382d9d19a1994f9240b8db4f"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a17a92de5231666cfbe003f0e4b9b3a7ae3afb1ec2845aadc2bacc93ff85febc"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72b6be590cc35924b02c78ef34b467da4ba07e4e0f0454a2c5907f473fc50ce5"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:e61659ba32cf2cf1481e575d0462554625196a1f2fc06a1c777d3f48e8865d46"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae2ad8ae6ebee9d2d94b17fb62763125f3f374c25618198f40cbb8b525411900"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:075202fa5b72c86ad32dc7d0b56024ebdbcf2048c0ba09f1cde31bfdd57bcfff"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:598e3276b64aff0e7b3451b72e94fa3c238d452e7ddcd893c3ab324717456bad"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fce659a462a1be54d2ffcacea5e3ba2d74daa74f30f5f143fe0c58636e355fdd"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-win32.whl", hash = "sha256:d9fad5155d72433c921b782e58892377c44bd6252b5af2f67f16b194987338a4"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-win_amd64.whl", hash = "sha256:bf50cd79a75d181c9181df03572cdce0fbb75cc353bc350712073108cba98de5"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:629ddd2ca402ae6dbedfceeba9c46d5f7b2a61d9749597d4307f943ef198fc1f"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5b7b716f97b52c5a14bffdf688f971b2d5ef4029127f1ad7a513973cfd818df2"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ec585f69cec0aa07d945b20805be741395e28ac1627333b1c5b0105962ffced"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b91c037585eba9095565a3556f611e3cbfaa42ca1e865f7b8015fe5c7336d5a5"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7502934a33b54030eaf1194c21c692a534196063db72176b0c4028e140f8f32c"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0e397ac966fdf721b2c528cf028494e86172b4feba51d65f81ffd65c63798f3f"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c061bb86a71b42465156a3ee7bd58c8c2ceacdbeb95d05a99893e08b8467359a"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3a57fdd7ce31c7ff06cdfbf31dafa96cc533c21e443d57f5b1ecc6cdc668ec7f"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-win32.whl", hash = "sha256:397081c1a0bfb5124355710fe79478cdbeb39626492b15d399526ae53422b906"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-win_amd64.whl", hash = "sha256:2b7c57a4dfc4f16f7142221afe5ba4e093e09e728ca65c51f5620c9aaeb9a617"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:8dec4936e9c3100156f8a2dc89c4b88d5c435175ff03413b443469c7c8c5f4d1"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:3c6b973f22eb18a789b1460b4b91bf04ae3f0c4234a0a6aa6b0a92f6f7b951d4"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ac07bad82163452a6884fe8fa0963fb98c2346ba78d779ec06bd7a6262132aee"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f5dfb42c4604dddc8e4305050aa6deb084540643ed5804d7455b5df8fe16f5e5"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:ea3d8a3d18833cf4304cd2fc9cbb1efe188ca9b5efef2bdac7adc20594a0e46b"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d050b3361367a06d752db6ead6e7edeb0009be66bc3bae0ee9d97fb326badc2a"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bec0a414d016ac1a18862a519e54b2fd0fc8bbfd6890376898a6c0891dd82e9f"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:58c98fee265677f63a4385256a6d7683ab1832f3ddd1e66fe948d5880c21a169"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-win32.whl", hash = "sha256:8590b4ae07a35970728874632fed7bd57b26b0102df2d2b233b6d9d82f6c62ad"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-win_amd64.whl", hash = "sha256:823b65d8706e32ad2df51ed89496147a42a2a6e01c13cfb6ffb8b1e92bc910bb"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c8b29db45f8fe46ad280a7294f5c3ec36dbac9491f2d1c17345be8e69cc5928f"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec6a563cff360b50eed26f13adc43e61bc0c04d94b8be985e6fb24b81f6dcfdf"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a549b9c31bec33820e885335b451286e2969a2d9e24879f83fe904a5ce59d70a"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4f11aa001c540f62c6166c7726f71f7573b52c68c31f014c25cc7901deea0b52"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:7b2e5a267c855eea6b4283940daa6e88a285f5f2a67f2220203786dfa59b37e9"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:2d2d793e36e230fd32babe143b04cec8a8b3eb8a3122d2aceb4a371e6b09b8df"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ce409136744f6521e39fd8e2a24c53fa18ad67aa5bc7c2cf83645cce5b5c4e50"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-win32.whl", hash = "sha256:4096e9de5c6fdf43fb4f04c26fb114f61ef0bf2e5604b6ee3019d51b69e8c371"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-win_amd64.whl", hash = "sha256:4275d846e41ecefa46e2015117a9f491e57a71ddd59bbead77e904dc02b1bed2"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:656f7526c69fac7f600bd1f400991cc282b417d17539a1b228617081106feb4a"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:97cafb1f3cbcd3fd2b6fbfb99ae11cdb14deea0736fc2b0952ee177f2b813a46"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f3fbcb7ef1f16e48246f704ab79d79da8a46891e2da03f8783a5b6fa41a9532"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa9db3f79de01457b03d4f01b34cf91bc0048eb2c3846ff26f66687c2f6d16ab"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffee1f21e5ef0d712f9033568f8344d5da8cc2869dbd08d87c84656e6a2d2f68"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:5dedb4db619ba5a2787a94d877bc8ffc0566f92a01c0ef214865e54ecc9ee5e0"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:30b600cf0a7ac9234b2638fbc0fb6158ba5bdcdf46aeb631ead21248b9affbc4"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8dd717634f5a044f860435c1d8c16a270ddf0ef8588d4887037c5028b859b0c3"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-win32.whl", hash = 
"sha256:daa4ee5a243f0f20d528d939d06670a298dd39b1ad5f8a72a4275124a7819eff"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-win_amd64.whl", hash = "sha256:619bc166c4f2de5caa5a633b8b7326fbe98e0ccbfacabd87268a2b15ff73a029"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7a68b554d356a91cce1236aa7682dc01df0edba8d043fd1ce607c49dd3c1edcf"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:db0b55e0f3cc0be60c1f19efdde9a637c32740486004f20d1cff53c3c0ece4d2"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e53af139f8579a6d5f7b76549125f0d94d7e630761a2111bc431fd820e163b8"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17b950fccb810b3293638215058e432159d2b71005c74371d784862b7e4683f3"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c31f53cdae6ecfa91a77820e8b151dba54ab528ba65dfd235c80b086d68a465"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:bff1b4290a66b490a2f4719358c0cdcd9bafb6b8f061e45c7a2460866bf50c2e"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bc1667f8b83f48511b94671e0e441401371dfd0f0a795c7daa4a3cd1dde55bea"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5049256f536511ee3f7e1b3f87d1d1209d327e818e6ae1365e8653d7e3abb6a6"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-win32.whl", hash = "sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-win_amd64.whl", hash = "sha256:fa173ec60341d6bb97a89f5ea19c85c5643c1e7dedebc22f5181eb73573142c5"}, + {file = "MarkupSafe-2.1.5.tar.gz", hash = "sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b"}, ] [[package]] @@ -755,36 +752,36 @@ test = ["pytest (>=7.2)", "pytest-cov (>=4.0)"] [[package]] name = "numba" -version = "0.58.1" +version = "0.59.0" description = "compiling Python code using LLVM" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "numba-0.58.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:07f2fa7e7144aa6f275f27260e73ce0d808d3c62b30cff8906ad1dec12d87bbe"}, - {file = "numba-0.58.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7bf1ddd4f7b9c2306de0384bf3854cac3edd7b4d8dffae2ec1b925e4c436233f"}, - {file = "numba-0.58.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:bc2d904d0319d7a5857bd65062340bed627f5bfe9ae4a495aef342f072880d50"}, - {file = "numba-0.58.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4e79b6cc0d2bf064a955934a2e02bf676bc7995ab2db929dbbc62e4c16551be6"}, - {file = "numba-0.58.1-cp310-cp310-win_amd64.whl", hash = "sha256:81fe5b51532478149b5081311b0fd4206959174e660c372b94ed5364cfb37c82"}, - {file = "numba-0.58.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:bcecd3fb9df36554b342140a4d77d938a549be635d64caf8bd9ef6c47a47f8aa"}, - {file = "numba-0.58.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a1eaa744f518bbd60e1f7ccddfb8002b3d06bd865b94a5d7eac25028efe0e0ff"}, - {file = "numba-0.58.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:bf68df9c307fb0aa81cacd33faccd6e419496fdc621e83f1efce35cdc5e79cac"}, - {file = "numba-0.58.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:55a01e1881120e86d54efdff1be08381886fe9f04fc3006af309c602a72bc44d"}, - 
{file = "numba-0.58.1-cp311-cp311-win_amd64.whl", hash = "sha256:811305d5dc40ae43c3ace5b192c670c358a89a4d2ae4f86d1665003798ea7a1a"}, - {file = "numba-0.58.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ea5bfcf7d641d351c6a80e8e1826eb4a145d619870016eeaf20bbd71ef5caa22"}, - {file = "numba-0.58.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e63d6aacaae1ba4ef3695f1c2122b30fa3d8ba039c8f517784668075856d79e2"}, - {file = "numba-0.58.1-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6fe7a9d8e3bd996fbe5eac0683227ccef26cba98dae6e5cee2c1894d4b9f16c1"}, - {file = "numba-0.58.1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:898af055b03f09d33a587e9425500e5be84fc90cd2f80b3fb71c6a4a17a7e354"}, - {file = "numba-0.58.1-cp38-cp38-win_amd64.whl", hash = "sha256:d3e2fe81fe9a59fcd99cc572002101119059d64d31eb6324995ee8b0f144a306"}, - {file = "numba-0.58.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5c765aef472a9406a97ea9782116335ad4f9ef5c9f93fc05fd44aab0db486954"}, - {file = "numba-0.58.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9e9356e943617f5e35a74bf56ff6e7cc83e6b1865d5e13cee535d79bf2cae954"}, - {file = "numba-0.58.1-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:240e7a1ae80eb6b14061dc91263b99dc8d6af9ea45d310751b780888097c1aaa"}, - {file = "numba-0.58.1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:45698b995914003f890ad839cfc909eeb9c74921849c712a05405d1a79c50f68"}, - {file = "numba-0.58.1-cp39-cp39-win_amd64.whl", hash = "sha256:bd3dda77955be03ff366eebbfdb39919ce7c2620d86c906203bed92124989032"}, - {file = "numba-0.58.1.tar.gz", hash = "sha256:487ded0633efccd9ca3a46364b40006dbdaca0f95e99b8b83e778d1195ebcbaa"}, + {file = "numba-0.59.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8d061d800473fb8fef76a455221f4ad649a53f5e0f96e3f6c8b8553ee6fa98fa"}, + {file = "numba-0.59.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c086a434e7d3891ce5dfd3d1e7ee8102ac1e733962098578b507864120559ceb"}, + {file = "numba-0.59.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9e20736bf62e61f8353fb71b0d3a1efba636c7a303d511600fc57648b55823ed"}, + {file = "numba-0.59.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e86e6786aec31d2002122199486e10bbc0dc40f78d76364cded375912b13614c"}, + {file = "numba-0.59.0-cp310-cp310-win_amd64.whl", hash = "sha256:0307ee91b24500bb7e64d8a109848baf3a3905df48ce142b8ac60aaa406a0400"}, + {file = "numba-0.59.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d540f69a8245fb714419c2209e9af6104e568eb97623adc8943642e61f5d6d8e"}, + {file = "numba-0.59.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1192d6b2906bf3ff72b1d97458724d98860ab86a91abdd4cfd9328432b661e31"}, + {file = "numba-0.59.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:90efb436d3413809fcd15298c6d395cb7d98184350472588356ccf19db9e37c8"}, + {file = "numba-0.59.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cd3dac45e25d927dcb65d44fb3a973994f5add2b15add13337844afe669dd1ba"}, + {file = "numba-0.59.0-cp311-cp311-win_amd64.whl", hash = "sha256:753dc601a159861808cc3207bad5c17724d3b69552fd22768fddbf302a817a4c"}, + {file = "numba-0.59.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ce62bc0e6dd5264e7ff7f34f41786889fa81a6b860662f824aa7532537a7bee0"}, + {file = "numba-0.59.0-cp312-cp312-macosx_11_0_arm64.whl", hash = 
"sha256:8cbef55b73741b5eea2dbaf1b0590b14977ca95a13a07d200b794f8f6833a01c"}, + {file = "numba-0.59.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:70d26ba589f764be45ea8c272caa467dbe882b9676f6749fe6f42678091f5f21"}, + {file = "numba-0.59.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e125f7d69968118c28ec0eed9fbedd75440e64214b8d2eac033c22c04db48492"}, + {file = "numba-0.59.0-cp312-cp312-win_amd64.whl", hash = "sha256:4981659220b61a03c1e557654027d271f56f3087448967a55c79a0e5f926de62"}, + {file = "numba-0.59.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fe4d7562d1eed754a7511ed7ba962067f198f86909741c5c6e18c4f1819b1f47"}, + {file = "numba-0.59.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6feb1504bb432280f900deaf4b1dadcee68812209500ed3f81c375cbceab24dc"}, + {file = "numba-0.59.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:944faad25ee23ea9dda582bfb0189fb9f4fc232359a80ab2a028b94c14ce2b1d"}, + {file = "numba-0.59.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5516a469514bfae52a9d7989db4940653a5cbfac106f44cb9c50133b7ad6224b"}, + {file = "numba-0.59.0-cp39-cp39-win_amd64.whl", hash = "sha256:32bd0a41525ec0b1b853da244808f4e5333867df3c43c30c33f89cf20b9c2b63"}, + {file = "numba-0.59.0.tar.gz", hash = "sha256:12b9b064a3e4ad00e2371fc5212ef0396c80f41caec9b5ec391c8b04b6eaf2a8"}, ] [package.dependencies] -llvmlite = "==0.41.*" +llvmlite = "==0.42.*" numpy = ">=1.22,<1.27" [[package]] @@ -943,12 +940,12 @@ nvidia-nvjitlink-cu12 = "*" [[package]] name = "nvidia-nccl-cu12" -version = "2.18.1" +version = "2.19.3" description = "NVIDIA Collective Communication Library (NCCL) Runtime" optional = false python-versions = ">=3" files = [ - {file = "nvidia_nccl_cu12-2.18.1-py3-none-manylinux1_x86_64.whl", hash = "sha256:1a6c4acefcbebfa6de320f412bf7866de856e786e0462326ba1bac40de0b5e71"}, + {file = "nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl", hash = "sha256:a9734707a2c96443331c1e48c717024aa6678a0e2a4cb66b2c364d18cee6b48d"}, ] [[package]] @@ -1036,35 +1033,36 @@ dev = ["Pillow", "black", "googledrivedownloader", "isort", "onnxruntime", "pre- [[package]] name = "onnxruntime" -version = "1.16.3" +version = "1.17.0" description = "ONNX Runtime is a runtime accelerator for Machine Learning models" optional = true python-versions = "*" files = [ - {file = "onnxruntime-1.16.3-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:3bc41f323ac77acfed190be8ffdc47a6a75e4beeb3473fbf55eeb075ccca8df2"}, - {file = "onnxruntime-1.16.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:212741b519ee61a4822c79c47147d63a8b0ffde25cd33988d3d7be9fbd51005d"}, - {file = "onnxruntime-1.16.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f91f5497fe3df4ceee2f9e66c6148d9bfeb320cd6a71df361c66c5b8bac985a"}, - {file = "onnxruntime-1.16.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ef2b1fc269cabd27f129fb9058917d6fdc89b188c49ed8700f300b945c81f889"}, - {file = "onnxruntime-1.16.3-cp310-cp310-win32.whl", hash = "sha256:f36b56a593b49a3c430be008c2aea6658d91a3030115729609ec1d5ffbaab1b6"}, - {file = "onnxruntime-1.16.3-cp310-cp310-win_amd64.whl", hash = "sha256:3c467eaa3d2429c026b10c3d17b78b7f311f718ef9d2a0d6938e5c3c2611b0cf"}, - {file = "onnxruntime-1.16.3-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:a225bb683991001d111f75323d355b3590e75e16b5e0f07a0401e741a0143ea1"}, - {file = "onnxruntime-1.16.3-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:9aded21fe3d898edd86be8aa2eb995aa375e800ad3dfe4be9f618a20b8ee3630"}, - {file = "onnxruntime-1.16.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:00cccc37a5195c8fca5011b9690b349db435986bd508eb44c9fce432da9228a4"}, - {file = "onnxruntime-1.16.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e253e572021563226a86f1c024f8f70cdae28f2fb1cc8c3a9221e8b1ce37db5"}, - {file = "onnxruntime-1.16.3-cp311-cp311-win32.whl", hash = "sha256:a82a8f0b4c978d08f9f5c7a6019ae51151bced9fd91e5aaa0c20a9e4ac7a60b6"}, - {file = "onnxruntime-1.16.3-cp311-cp311-win_amd64.whl", hash = "sha256:78d81d9af457a1dc90db9a7da0d09f3ccb1288ea1236c6ab19f0ca61f3eee2d3"}, - {file = "onnxruntime-1.16.3-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:04ebcd29c20473596a1412e471524b2fb88d55e6301c40b98dd2407b5911595f"}, - {file = "onnxruntime-1.16.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9996bab0f202a6435ab867bc55598f15210d0b72794d5de83712b53d564084ae"}, - {file = "onnxruntime-1.16.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b8f5083f903408238883821dd8c775f8120cb4a604166dbdabe97f4715256d5"}, - {file = "onnxruntime-1.16.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c2dcf1b70f8434abb1116fe0975c00e740722aaf321997195ea3618cc00558e"}, - {file = "onnxruntime-1.16.3-cp38-cp38-win32.whl", hash = "sha256:d4a0151e1accd04da6711f6fd89024509602f82c65a754498e960b032359b02d"}, - {file = "onnxruntime-1.16.3-cp38-cp38-win_amd64.whl", hash = "sha256:e8aa5bba78afbd4d8a2654b14ec7462ff3ce4a6aad312a3c2d2c2b65009f2541"}, - {file = "onnxruntime-1.16.3-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:6829dc2a79d48c911fedaf4c0f01e03c86297d32718a3fdee7a282766dfd282a"}, - {file = "onnxruntime-1.16.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:76f876c53bfa912c6c242fc38213a6f13f47612d4360bc9d599bd23753e53161"}, - {file = "onnxruntime-1.16.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4137e5d443e2dccebe5e156a47f1d6d66f8077b03587c35f11ee0c7eda98b533"}, - {file = "onnxruntime-1.16.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c56695c1a343c7c008b647fff3df44da63741fbe7b6003ef576758640719be7b"}, - {file = "onnxruntime-1.16.3-cp39-cp39-win32.whl", hash = "sha256:985a029798744ce4743fcf8442240fed35c8e4d4d30ec7d0c2cdf1388cd44408"}, - {file = "onnxruntime-1.16.3-cp39-cp39-win_amd64.whl", hash = "sha256:28ff758b17ce3ca6bcad3d936ec53bd7f5482e7630a13f6dcae518eba8f71d85"}, + {file = "onnxruntime-1.17.0-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:d2b22a25a94109cc983443116da8d9805ced0256eb215c5e6bc6dcbabefeab96"}, + {file = "onnxruntime-1.17.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b4c87d83c6f58d1af2675fc99e3dc810f2dbdb844bcefd0c1b7573632661f6fc"}, + {file = "onnxruntime-1.17.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dba55723bf9b835e358f48c98a814b41692c393eb11f51e02ece0625c756b797"}, + {file = "onnxruntime-1.17.0-cp310-cp310-win32.whl", hash = "sha256:ee48422349cc500273beea7607e33c2237909f58468ae1d6cccfc4aecd158565"}, + {file = "onnxruntime-1.17.0-cp310-cp310-win_amd64.whl", hash = "sha256:f34cc46553359293854e38bdae2ab1be59543aad78a6317e7746d30e311110c3"}, + {file = "onnxruntime-1.17.0-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:16d26badd092c8c257fa57c458bb600d96dc15282c647ccad0ed7b2732e6c03b"}, + {file = 
"onnxruntime-1.17.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6f1273bebcdb47ed932d076c85eb9488bc4768fcea16d5f2747ca692fad4f9d3"}, + {file = "onnxruntime-1.17.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cb60fd3c2c1acd684752eb9680e89ae223e9801a9b0e0dc7b28adabe45a2e380"}, + {file = "onnxruntime-1.17.0-cp311-cp311-win32.whl", hash = "sha256:4b038324586bc905299e435f7c00007e6242389c856b82fe9357fdc3b1ef2bdc"}, + {file = "onnxruntime-1.17.0-cp311-cp311-win_amd64.whl", hash = "sha256:93d39b3fa1ee01f034f098e1c7769a811a21365b4883f05f96c14a2b60c6028b"}, + {file = "onnxruntime-1.17.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:90c0890e36f880281c6c698d9bc3de2afbeee2f76512725ec043665c25c67d21"}, + {file = "onnxruntime-1.17.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7466724e809a40e986b1637cba156ad9fc0d1952468bc00f79ef340bc0199552"}, + {file = "onnxruntime-1.17.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d47bee7557a8b99c8681b6882657a515a4199778d6d5e24e924d2aafcef55b0a"}, + {file = "onnxruntime-1.17.0-cp312-cp312-win32.whl", hash = "sha256:bb1bf1ee575c665b8bbc3813ab906e091a645a24ccc210be7932154b8260eca1"}, + {file = "onnxruntime-1.17.0-cp312-cp312-win_amd64.whl", hash = "sha256:ac2f286da3494b29b4186ca193c7d4e6a2c1f770c4184c7192c5da142c3dec28"}, + {file = "onnxruntime-1.17.0-cp38-cp38-macosx_11_0_universal2.whl", hash = "sha256:1ec485643b93e0a3896c655eb2426decd63e18a278bb7ccebc133b340723624f"}, + {file = "onnxruntime-1.17.0-cp38-cp38-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:83c35809cda898c5a11911c69ceac8a2ac3925911854c526f73bad884582f911"}, + {file = "onnxruntime-1.17.0-cp38-cp38-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fa464aa4d81df818375239e481887b656e261377d5b6b9a4692466f5f3261edc"}, + {file = "onnxruntime-1.17.0-cp38-cp38-win32.whl", hash = "sha256:b7b337cd0586f7836601623cbd30a443df9528ef23965860d11c753ceeb009f2"}, + {file = "onnxruntime-1.17.0-cp38-cp38-win_amd64.whl", hash = "sha256:fbb9faaf51d01aa2c147ef52524d9326744c852116d8005b9041809a71838878"}, + {file = "onnxruntime-1.17.0-cp39-cp39-macosx_11_0_universal2.whl", hash = "sha256:5a06ab84eaa350bf64b1d747b33ccf10da64221ed1f38f7287f15eccbec81603"}, + {file = "onnxruntime-1.17.0-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5d3d11db2c8242766212a68d0b139745157da7ce53bd96ba349a5c65e5a02357"}, + {file = "onnxruntime-1.17.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5632077c3ab8b0cd4f74b0af9c4e924be012b1a7bcd7daa845763c6c6bf14b7d"}, + {file = "onnxruntime-1.17.0-cp39-cp39-win32.whl", hash = "sha256:61a12732cba869b3ad2d4e29ab6cb62c7a96f61b8c213f7fcb961ba412b70b37"}, + {file = "onnxruntime-1.17.0-cp39-cp39-win_amd64.whl", hash = "sha256:461fa0fc7d9c392c352b6cccdedf44d818430f3d6eacd924bb804fdea2dcfd02"}, ] [package.dependencies] @@ -1077,19 +1075,21 @@ sympy = "*" [[package]] name = "onnxruntime-gpu" -version = "1.16.3" +version = "1.17.0" description = "ONNX Runtime is a runtime accelerator for Machine Learning models" optional = true python-versions = "*" files = [ - {file = "onnxruntime_gpu-1.16.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c14bc735ad2b2286be9eadeea09bc190df38e8bce17e37b601761019cc7cc24f"}, - {file = "onnxruntime_gpu-1.16.3-cp310-cp310-win_amd64.whl", hash = "sha256:8de5ccfc005ea5ec50fbd104b7210c97623a9f8c13de6e64ce559b55956b757f"}, - {file = 
"onnxruntime_gpu-1.16.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5703454521a9c080ff3ac79b5d266e959cc735d442a1d8796763c7f92d6069dc"}, - {file = "onnxruntime_gpu-1.16.3-cp311-cp311-win_amd64.whl", hash = "sha256:48bb615aed61f5620d1ad46b9005614e1a14de60f8218a1448cc9a643f23d399"}, - {file = "onnxruntime_gpu-1.16.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2811c8ea209aaedcc2600ca828025279c1b1242344af603122d28c2ea8ab26a4"}, - {file = "onnxruntime_gpu-1.16.3-cp38-cp38-win_amd64.whl", hash = "sha256:2e5a92770c9232776739f378804bf6fea20bae02878a50b7fe0f81e77a47ee92"}, - {file = "onnxruntime_gpu-1.16.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9305c7fc5981d7e04ad2afef1a403475fb84d658898567c91aa5a41c20ead356"}, - {file = "onnxruntime_gpu-1.16.3-cp39-cp39-win_amd64.whl", hash = "sha256:d3ad8e7fbb22493267c23d61e997a6b2ac6236a08aa6b58a3a91848124c9b037"}, + {file = "onnxruntime_gpu-1.17.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:1f2a4e0468ac0bd8246996c3d5dbba92cbbaca874bcd7f9cee4e99ce6eb27f5b"}, + {file = "onnxruntime_gpu-1.17.0-cp310-cp310-win_amd64.whl", hash = "sha256:0721b7930d7abed3730b2335e639e60d94ec411bb4d35a0347cc9c8b52c34540"}, + {file = "onnxruntime_gpu-1.17.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:be0314afe399943904de7c1ca797cbcc63e6fad60eb85d3df6422f81dd94e79e"}, + {file = "onnxruntime_gpu-1.17.0-cp311-cp311-win_amd64.whl", hash = "sha256:52125c24b21406d1431e43de1c98cea29c21e0cceba80db530b7e4c9216d86ea"}, + {file = "onnxruntime_gpu-1.17.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:bb802d8033885c412269f8bc8877d8779b0dc874df6fb9df8b796cba7276ad66"}, + {file = "onnxruntime_gpu-1.17.0-cp312-cp312-win_amd64.whl", hash = "sha256:8c43533e3e5335eaa78059fb86b849a4faded513a00c1feaaa205ca5af51c40f"}, + {file = "onnxruntime_gpu-1.17.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:1d461455bba160836d6c11c648c8fd4e4500d5c17096a13e6c2c9d22a4abd436"}, + {file = "onnxruntime_gpu-1.17.0-cp38-cp38-win_amd64.whl", hash = "sha256:1e4398f2175a92f4b35d95279a6294a89c462f24de058a2736ee1d498bab5a16"}, + {file = "onnxruntime_gpu-1.17.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:1d0e3805cd1c024aba7f4ae576fd08545fc27530a2aaad2b3c8ac0ee889fbd05"}, + {file = "onnxruntime_gpu-1.17.0-cp39-cp39-win_amd64.whl", hash = "sha256:fc1da5b93363ee600b5b220b04eeec51ad2c2b3e96f0b7615b16b8a173c88001"}, ] [package.dependencies] @@ -1230,18 +1230,18 @@ xmp = ["defusedxml"] [[package]] name = "platformdirs" -version = "4.1.0" +version = "4.2.0" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." 
optional = false python-versions = ">=3.8" files = [ - {file = "platformdirs-4.1.0-py3-none-any.whl", hash = "sha256:11c8f37bcca40db96d8144522d925583bdb7a31f7b0e37e3ed4318400a8e2380"}, - {file = "platformdirs-4.1.0.tar.gz", hash = "sha256:906d548203468492d432bcb294d4bc2fff751bf84971fbb2c10918cc206ee420"}, + {file = "platformdirs-4.2.0-py3-none-any.whl", hash = "sha256:0614df2a2f37e1a662acbd8e2b25b92ccf8632929bc6d43467e17fe89c75e068"}, + {file = "platformdirs-4.2.0.tar.gz", hash = "sha256:ef0cc731df711022c174543cb70a9b5bd22e5a9337c8624ef2c2ceb8ddad8768"}, ] [package.extras] -docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.1)", "sphinx-autodoc-typehints (>=1.24)"] -test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)"] +docs = ["furo (>=2023.9.10)", "proselint (>=0.13)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.25.2)"] +test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4.3)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)"] [[package]] name = "pluggy" @@ -1615,31 +1615,36 @@ files = [ [[package]] name = "torch" -version = "2.1.2" +version = "2.2.0" description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration" optional = false python-versions = ">=3.8.0" files = [ - {file = "torch-2.1.2-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:3a871edd6c02dae77ad810335c0833391c1a4ce49af21ea8cf0f6a5d2096eea8"}, - {file = "torch-2.1.2-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:bef6996c27d8f6e92ea4e13a772d89611da0e103b48790de78131e308cf73076"}, - {file = "torch-2.1.2-cp310-cp310-win_amd64.whl", hash = "sha256:0e13034fd5fb323cbbc29e56d0637a3791e50dd589616f40c79adfa36a5a35a1"}, - {file = "torch-2.1.2-cp310-none-macosx_10_9_x86_64.whl", hash = "sha256:d9b535cad0df3d13997dbe8bd68ac33e0e3ae5377639c9881948e40794a61403"}, - {file = "torch-2.1.2-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:f9a55d55af02826ebfbadf4e9b682f0f27766bc33df8236b48d28d705587868f"}, - {file = "torch-2.1.2-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:a6ebbe517097ef289cc7952783588c72de071d4b15ce0f8b285093f0916b1162"}, - {file = "torch-2.1.2-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:8f32ce591616a30304f37a7d5ea80b69ca9e1b94bba7f308184bf616fdaea155"}, - {file = "torch-2.1.2-cp311-cp311-win_amd64.whl", hash = "sha256:e0ee6cf90c8970e05760f898d58f9ac65821c37ffe8b04269ec787aa70962b69"}, - {file = "torch-2.1.2-cp311-none-macosx_10_9_x86_64.whl", hash = "sha256:76d37967c31c99548ad2c4d3f2cf191db48476f2e69b35a0937137116da356a1"}, - {file = "torch-2.1.2-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:e2d83f07b4aac983453ea5bf8f9aa9dacf2278a8d31247f5d9037f37befc60e4"}, - {file = "torch-2.1.2-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:f41fe0c7ecbf903a568c73486139a75cfab287a0f6c17ed0698fdea7a1e8641d"}, - {file = "torch-2.1.2-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:e3225f47d50bb66f756fe9196a768055d1c26b02154eb1f770ce47a2578d3aa7"}, - {file = "torch-2.1.2-cp38-cp38-win_amd64.whl", hash = "sha256:33d59cd03cb60106857f6c26b36457793637512998666ee3ce17311f217afe2b"}, - {file = "torch-2.1.2-cp38-none-macosx_10_9_x86_64.whl", hash = "sha256:8e221deccd0def6c2badff6be403e0c53491805ed9915e2c029adbcdb87ab6b5"}, - {file = "torch-2.1.2-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:05b18594f60a911a0c4f023f38a8bda77131fba5fd741bda626e97dcf5a3dd0a"}, - {file = "torch-2.1.2-cp39-cp39-manylinux1_x86_64.whl", hash = 
"sha256:9ca96253b761e9aaf8e06fb30a66ee301aecbf15bb5a303097de1969077620b6"}, - {file = "torch-2.1.2-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:d93ba70f67b08c2ae5598ee711cbc546a1bc8102cef938904b8c85c2089a51a0"}, - {file = "torch-2.1.2-cp39-cp39-win_amd64.whl", hash = "sha256:255b50bc0608db177e6a3cc118961d77de7e5105f07816585fa6f191f33a9ff3"}, - {file = "torch-2.1.2-cp39-none-macosx_10_9_x86_64.whl", hash = "sha256:6984cd5057c0c977b3c9757254e989d3f1124f4ce9d07caa6cb637783c71d42a"}, - {file = "torch-2.1.2-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:bc195d7927feabc0eb7c110e457c955ed2ab616f3c7c28439dd4188cf589699f"}, + {file = "torch-2.2.0-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:d366158d6503a3447e67f8c0ad1328d54e6c181d88572d688a625fac61b13a97"}, + {file = "torch-2.2.0-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:707f2f80402981e9f90d0038d7d481678586251e6642a7a6ef67fc93511cb446"}, + {file = "torch-2.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:15c8f0a105c66b28496092fca1520346082e734095f8eaf47b5786bac24b8a31"}, + {file = "torch-2.2.0-cp310-none-macosx_10_9_x86_64.whl", hash = "sha256:0ca4df4b728515ad009b79f5107b00bcb2c63dc202d991412b9eb3b6a4f24349"}, + {file = "torch-2.2.0-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:3d3eea2d5969b9a1c9401429ca79efc668120314d443d3463edc3289d7f003c7"}, + {file = "torch-2.2.0-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:0d1c580e379c0d48f0f0a08ea28d8e373295aa254de4f9ad0631f9ed8bc04c24"}, + {file = "torch-2.2.0-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:9328e3c1ce628a281d2707526b4d1080eae7c4afab4f81cea75bde1f9441dc78"}, + {file = "torch-2.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:03c8e660907ac1b8ee07f6d929c4e15cd95be2fb764368799cca02c725a212b8"}, + {file = "torch-2.2.0-cp311-none-macosx_10_9_x86_64.whl", hash = "sha256:da0cefe7f84ece3e3b56c11c773b59d1cb2c0fd83ddf6b5f7f1fd1a987b15c3e"}, + {file = "torch-2.2.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:f81d23227034221a4a4ff8ef24cc6cec7901edd98d9e64e32822778ff01be85e"}, + {file = "torch-2.2.0-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:dcbfb2192ac41ca93c756ebe9e2af29df0a4c14ee0e7a0dd78f82c67a63d91d4"}, + {file = "torch-2.2.0-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:9eeb42971619e24392c9088b5b6d387d896e267889d41d267b1fec334f5227c5"}, + {file = "torch-2.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:c718b2ca69a6cac28baa36d86d8c0ec708b102cebd1ceb1b6488e404cd9be1d1"}, + {file = "torch-2.2.0-cp312-none-macosx_10_9_x86_64.whl", hash = "sha256:f11d18fceb4f9ecb1ac680dde7c463c120ed29056225d75469c19637e9f98d12"}, + {file = "torch-2.2.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:ee1da852bfd4a7e674135a446d6074c2da7194c1b08549e31eae0b3138c6b4d2"}, + {file = "torch-2.2.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:0d819399819d0862268ac531cf12a501c253007df4f9e6709ede8a0148f1a7b8"}, + {file = "torch-2.2.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:08f53ccc38c49d839bc703ea1b20769cc8a429e0c4b20b56921a9f64949bf325"}, + {file = "torch-2.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:93bffe3779965a71dab25fc29787538c37c5d54298fd2f2369e372b6fb137d41"}, + {file = "torch-2.2.0-cp38-none-macosx_10_9_x86_64.whl", hash = "sha256:c17ec323da778efe8dad49d8fb534381479ca37af1bfc58efdbb8607a9d263a3"}, + {file = "torch-2.2.0-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:c02685118008834e878f676f81eab3a952b7936fa31f474ef8a5ff4b5c78b36d"}, + {file = "torch-2.2.0-cp39-cp39-manylinux1_x86_64.whl", hash = 
"sha256:d9f39d6f53cec240a0e3baa82cb697593340f9d4554cee6d3d6ca07925c2fac0"}, + {file = "torch-2.2.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:51770c065206250dc1222ea7c0eff3f88ab317d3e931cca2aee461b85fbc2472"}, + {file = "torch-2.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:008e4c6ad703de55af760c73bf937ecdd61a109f9b08f2bbb9c17e7c7017f194"}, + {file = "torch-2.2.0-cp39-none-macosx_10_9_x86_64.whl", hash = "sha256:de8680472dd14e316f42ceef2a18a301461a9058cd6e99a1f1b20f78f11412f1"}, + {file = "torch-2.2.0-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:99e1dcecb488e3fd25bcaac56e48cdb3539842904bdc8588b0b255fde03a254c"}, ] [package.dependencies] @@ -1656,78 +1661,101 @@ nvidia-cufft-cu12 = {version = "11.0.2.54", markers = "platform_system == \"Linu nvidia-curand-cu12 = {version = "10.3.2.106", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} nvidia-cusolver-cu12 = {version = "11.4.5.107", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} nvidia-cusparse-cu12 = {version = "12.1.0.106", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-nccl-cu12 = {version = "2.18.1", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-nccl-cu12 = {version = "2.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} nvidia-nvtx-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} sympy = "*" -triton = {version = "2.1.0", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -typing-extensions = "*" +triton = {version = "2.2.0", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +typing-extensions = ">=4.8.0" [package.extras] -dynamo = ["jinja2"] opt-einsum = ["opt-einsum (>=3.3)"] +optree = ["optree (>=0.9.1)"] [[package]] name = "torchvision" -version = "0.16.2" +version = "0.17.0" description = "image and video datasets and models for torch deep learning" optional = false python-versions = ">=3.8" files = [ - {file = "torchvision-0.16.2-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:bc86f2800cb2c0c1a09c581409cdd6bff66e62f103dc83fc63f73346264c3756"}, - {file = "torchvision-0.16.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b024bd412df6d3a007dcebf311a894eb3c5c21e1af80d12be382bbcb097a7c3a"}, - {file = "torchvision-0.16.2-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:e89f10f3c8351972b6e3fda95bc3e479ea8dbfc9dfcfd2c32902dbad4ba5cfc5"}, - {file = "torchvision-0.16.2-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:96c7583700112a410bdc4e1e4f118c429dab49c29c9a31a2cc3579bc9b08b19d"}, - {file = "torchvision-0.16.2-cp310-cp310-win_amd64.whl", hash = "sha256:9f4032ebb3277fb07ff6a9b818d50a547fb8fcd89d958cfd9e773322454bb688"}, - {file = "torchvision-0.16.2-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:67b1aaf8b8cb02ce75dd445f291a27c8036a502f8c0aa76e28c37a0faac2e153"}, - {file = "torchvision-0.16.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bef30d03e1d1c629761f4dca51d3b7d8a0dc0acce6f4068ab2a1634e8e7b64e0"}, - {file = "torchvision-0.16.2-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:e59cc7b2bd1ab5c0ce4ae382e4e37be8f1c174e8b5de2f6a23c170de9ae28495"}, - {file = "torchvision-0.16.2-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:e130b08cc9b3cc73a6c59d6edf032394a322f9579bfd21d14bc2e1d0999aa758"}, - {file = "torchvision-0.16.2-cp311-cp311-win_amd64.whl", hash = 
"sha256:8692ab1e48807e9604046a6f4beeb67b523294cee1b00828654bb0df2cfce2b2"}, - {file = "torchvision-0.16.2-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:b82732dcf876a37c852772342aa6ee3480c03bb3e2a802ae109fc5f7e28d26e9"}, - {file = "torchvision-0.16.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4b065143d1a720fe8a9077fd4be35d491f98819ec80b3dbbc3ec64d0b707a906"}, - {file = "torchvision-0.16.2-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:bc5f274e4ecd1b86062063cdf4fd385a1d39d147a3a2685fbbde9ff08bb720b8"}, - {file = "torchvision-0.16.2-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:335959c43b371c0474af34c1ef2a52efdc7603c45700d29e4475eeb02984170c"}, - {file = "torchvision-0.16.2-cp38-cp38-win_amd64.whl", hash = "sha256:7fd22d86e08eba321af70cad291020c2cdeac069b00ce88b923ca52e06174769"}, - {file = "torchvision-0.16.2-cp39-cp39-macosx_10_13_x86_64.whl", hash = "sha256:56115268b37f0b75364e3654e47ad9abc66ac34c1f9e5e3dfa89a22d6a40017a"}, - {file = "torchvision-0.16.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:82805f8445b094f9d1e770390ee6cc86855e89955e08ce34af2e2274fc0e5c45"}, - {file = "torchvision-0.16.2-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:3f4bd5fcbc361476e2e78016636ac7d5509e59d9962521f06eb98e6803898182"}, - {file = "torchvision-0.16.2-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:8199acdf8ab066a28b84a5b6f4d97b58976d9e164b1acc3a9d14fccfaf74bb3a"}, - {file = "torchvision-0.16.2-cp39-cp39-win_amd64.whl", hash = "sha256:41dd4fa9f176d563fe9f1b9adef3b7e582cdfb60ce8c9bc51b094a025be687c9"}, + {file = "torchvision-0.17.0-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:153882cd8ff8e3dbef5c5054fdd15df64e85420546805a90c0b2221f2f119c4a"}, + {file = "torchvision-0.17.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c55c2f86e3f3a21ddd92739a972366244e9b17916e836ec47167b0a0c083c65f"}, + {file = "torchvision-0.17.0-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:605950cdcefe6c5aef85709ade17b1525bcf171e122cce1df09e666d96525b90"}, + {file = "torchvision-0.17.0-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:3d86c212fc6379e9bec3ac647d062e34c2cf36c26b98840b66573eb9fbe1f1d9"}, + {file = "torchvision-0.17.0-cp310-cp310-win_amd64.whl", hash = "sha256:71b314813faf13cecb09a4a635b5e4b274e8df0b1921681038d491c529555bb6"}, + {file = "torchvision-0.17.0-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:10d276821f115fb369e6cf1f1b77b2cca60cda12cbb39a41513a9d3d0f2a93ae"}, + {file = "torchvision-0.17.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a3eef2daddadb5c21e802e0550dd7e3ee3d98c430f4aed212ae3ba0358558be1"}, + {file = "torchvision-0.17.0-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:acc0d098ab8c295a750f0218bf5bf7bfc2f2c21f9c2fe3fc30b695cd94f4c759"}, + {file = "torchvision-0.17.0-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:3d2e9552d72e4037f2db6f7d97989a2e2f95763aa1861963a3faf521bb1610c4"}, + {file = "torchvision-0.17.0-cp311-cp311-win_amd64.whl", hash = "sha256:f8e542cf71e1294fcb5635038eae6702df543dc90706f0836ec80e75efc511fc"}, + {file = "torchvision-0.17.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:816ae1a4506b1cb0f638e1827cae7ab768c731369ab23e86839f177926197143"}, + {file = "torchvision-0.17.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:be39874c239215a39b3c431c7016501f1a45bfbbebf2fe8e11d8339b5ea23bca"}, + {file = "torchvision-0.17.0-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:8fe14d580557aef2c45dd462c069ff936b6507b215c4b496f30973ae8cff917d"}, + {file = "torchvision-0.17.0-cp312-cp312-manylinux2014_aarch64.whl", hash = 
"sha256:4608ba3246c45c968ede40e7640e4eed64556210faa154cf1ffccb1cadabe445"}, + {file = "torchvision-0.17.0-cp312-cp312-win_amd64.whl", hash = "sha256:b755d6d3e021239d2408bf3794d0d3dcffbc629f1fd808c43d8b346045a098c4"}, + {file = "torchvision-0.17.0-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:870d7cda57420e44d20eb07bfe37bf5344a06434a7a6195b4c7f3dd55838587d"}, + {file = "torchvision-0.17.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:477f6e64a9d798c0f5adefc300acc220da6f17ef5c1e110d20108f66554fee4d"}, + {file = "torchvision-0.17.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:a54a15bd6f3dbb04ebd36c5a87530b2e090ee4b9b15eb89eda558ab3e50396a0"}, + {file = "torchvision-0.17.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:e041ce3336364413bab051a3966d884bab25c200f98ca8a065f0abe758c3005e"}, + {file = "torchvision-0.17.0-cp38-cp38-win_amd64.whl", hash = "sha256:7887f767670c72aa20f5237042d0ca1462da18f66a3ea8c36b6ba67ce26b82fc"}, + {file = "torchvision-0.17.0-cp39-cp39-macosx_10_13_x86_64.whl", hash = "sha256:b1ced438b81ef662a71c8c81debaf0c80455b35b811ca55a4c3c593d721b560a"}, + {file = "torchvision-0.17.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b53569c52bd4bd1176a1e49d8ea55883bcf57e1614cb97e2e8ce372768299b70"}, + {file = "torchvision-0.17.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:7f373507afcd9022ebd9f50b31da8dbac1ea6783ffb77d1f1ab8806425c0a83b"}, + {file = "torchvision-0.17.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:085251ab36340206dc7e1be59a15fa5e307d45ccd66889f5d7bf1ba5e7ecdc57"}, + {file = "torchvision-0.17.0-cp39-cp39-win_amd64.whl", hash = "sha256:4c0d4c0af58af2752aad235150bd794d0f324e6eeac5cd13c440bda5dce622d3"}, ] [package.dependencies] numpy = "*" pillow = ">=5.3.0,<8.3.dev0 || >=8.4.dev0" requests = "*" -torch = "2.1.2" +torch = "2.2.0" [package.extras] scipy = ["scipy"] +[[package]] +name = "tqdm" +version = "4.66.1" +description = "Fast, Extensible Progress Meter" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tqdm-4.66.1-py3-none-any.whl", hash = "sha256:d302b3c5b53d47bce91fea46679d9c3c6508cf6332229aa1e7d8653723793386"}, + {file = "tqdm-4.66.1.tar.gz", hash = "sha256:d88e651f9db8d8551a62556d3cff9e3034274ca5d66e93197cf2490e2dcb69c7"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[package.extras] +dev = ["pytest (>=6)", "pytest-cov", "pytest-timeout", "pytest-xdist"] +notebook = ["ipywidgets (>=6)"] +slack = ["slack-sdk"] +telegram = ["requests"] + [[package]] name = "triton" -version = "2.1.0" +version = "2.2.0" description = "A language and compiler for custom Deep Learning operations" optional = false python-versions = "*" files = [ - {file = "triton-2.1.0-0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:66439923a30d5d48399b08a9eae10370f6c261a5ec864a64983bae63152d39d7"}, - {file = "triton-2.1.0-0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:919b06453f0033ea52c13eaf7833de0e57db3178d23d4e04f9fc71c4f2c32bf8"}, - {file = "triton-2.1.0-0-cp37-cp37m-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ae4bb8a91de790e1866405211c4d618379781188f40d5c4c399766914e84cd94"}, - {file = "triton-2.1.0-0-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:39f6fb6bdccb3e98f3152e3fbea724f1aeae7d749412bbb1fa9c441d474eba26"}, - {file = "triton-2.1.0-0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:21544e522c02005a626c8ad63d39bdff2f31d41069592919ef281e964ed26446"}, - 
{file = "triton-2.1.0-0-pp37-pypy37_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:143582ca31dd89cd982bd3bf53666bab1c7527d41e185f9e3d8a3051ce1b663b"}, - {file = "triton-2.1.0-0-pp38-pypy38_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:82fc5aeeedf6e36be4e4530cbdcba81a09d65c18e02f52dc298696d45721f3bd"}, - {file = "triton-2.1.0-0-pp39-pypy39_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:81a96d110a738ff63339fc892ded095b31bd0d205e3aace262af8400d40b6fa8"}, + {file = "triton-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a2294514340cfe4e8f4f9e5c66c702744c4a117d25e618bd08469d0bfed1e2e5"}, + {file = "triton-2.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da58a152bddb62cafa9a857dd2bc1f886dbf9f9c90a2b5da82157cd2b34392b0"}, + {file = "triton-2.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0af58716e721460a61886668b205963dc4d1e4ac20508cc3f623aef0d70283d5"}, + {file = "triton-2.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8fe46d3ab94a8103e291bd44c741cc294b91d1d81c1a2888254cbf7ff846dab"}, + {file = "triton-2.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8ce26093e539d727e7cf6f6f0d932b1ab0574dc02567e684377630d86723ace"}, + {file = "triton-2.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:227cc6f357c5efcb357f3867ac2a8e7ecea2298cd4606a8ba1e931d1d5a947df"}, ] [package.dependencies] filelock = "*" [package.extras] -build = ["cmake (>=3.18)", "lit"] -tests = ["autopep8", "flake8", "isort", "numpy", "pytest", "scipy (>=1.7.1)"] -tutorials = ["matplotlib", "pandas", "tabulate"] +build = ["cmake (>=3.20)", "lit"] +tests = ["autopep8", "flake8", "isort", "numpy", "pytest", "scipy (>=1.7.1)", "torch"] +tutorials = ["matplotlib", "pandas", "tabulate", "torch"] [[package]] name = "typing-extensions" @@ -1742,17 +1770,18 @@ files = [ [[package]] name = "urllib3" -version = "2.1.0" +version = "2.2.0" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.8" files = [ - {file = "urllib3-2.1.0-py3-none-any.whl", hash = "sha256:55901e917a5896a349ff771be919f8bd99aff50b79fe58fec595eb37bbc56bb3"}, - {file = "urllib3-2.1.0.tar.gz", hash = "sha256:df7aa8afb0148fa78488e7899b2c59b5f4ffcfa82e6c54ccb9dd37c1d7b52d54"}, + {file = "urllib3-2.2.0-py3-none-any.whl", hash = "sha256:ce3711610ddce217e6d113a2732fafad960a03fd0318c91faa79481e35c11224"}, + {file = "urllib3-2.2.0.tar.gz", hash = "sha256:051d961ad0c62a94e50ecf1af379c3aba230c66c710493493560c0c223c49f20"}, ] [package.extras] brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] +h2 = ["h2 (>=4,<5)"] socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] @@ -1764,4 +1793,4 @@ silicon = ["onnxruntime-silicon"] [metadata] lock-version = "2.0" python-versions = ">=3.9" -content-hash = "3939b79300af24dbd367bef1fde0327eaadd6051bf12db2f067c43305102f4a1" +content-hash = "5880967d7da3b018ae3a7d221cf038c3c01e75e59c7cad081892895e5f189624" diff --git a/pyproject.toml b/pyproject.toml index 9aa33a9..a1540d5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "audio-separator" -version = "0.13.1" +version = "0.14.0" description = "Easy to use vocal separation, using MDX-Net models from UVR trained by @Anjok07" authors = ["Andrew Beveridge "] license = "MIT" diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py index f28ceee..5ecf240 100644 --- a/tests/unit/test_cli.py +++ b/tests/unit/test_cli.py @@ -15,15 +15,13 @@ def common_expected_args(): "model_file_dir": "/tmp/audio-separator-models/", "output_dir": None, "output_format": "FLAC", - "denoise_enabled": False, + "enable_denoise": False, "normalization_threshold": 0.9, "output_single_stem": None, "invert_using_spec": False, "sample_rate": 44100, - "hop_length": 1024, - "segment_size": 256, - "overlap": 0.25, - "batch_size": 1, + "mdx_params": {"hop_length": 1024, "segment_size": 256, "overlap": 0.25, "batch_size": 1}, + "vr_params": {"batch_size": 4, "window_size": 512, "aggression": 5, "enable_tta": False, "enable_post_process": False, "post_process_threshold": 0.2, "high_end_process": False}, } @@ -83,8 +81,8 @@ def test_cli_invalid_log_level(): # Test using model name argument -def test_cli_model_name_argument(common_expected_args): - test_args = ["cli.py", "test_audio.mp3", "--model_name=Custom_Model"] +def test_cli_model_filename_argument(common_expected_args): + test_args = ["cli.py", "test_audio.mp3", "--model_filename=Custom_Model.onnx"] with patch("sys.argv", test_args): with patch("audio_separator.separator.Separator") as mock_separator: mock_separator_instance = mock_separator.return_value @@ -93,7 +91,7 @@ def test_cli_model_name_argument(common_expected_args): # Assertions mock_separator.assert_called_once_with(**common_expected_args) - mock_separator_instance.load_model.assert_called_once_with("Custom_Model") + mock_separator_instance.load_model.assert_called_once_with("Custom_Model.onnx") # Test using output directory argument @@ -138,7 +136,7 @@ def test_cli_denoise_argument(common_expected_args): main() # Update expected args for this specific test - common_expected_args["denoise_enabled"] = True + common_expected_args["enable_denoise"] = True # Assertions mock_separator.assert_called_once_with(**common_expected_args) diff --git a/tests/unit/test_spec_utils.py b/tests/unit/test_spec_utils.py deleted file mode 100644 index 4996f5b..0000000 --- a/tests/unit/test_spec_utils.py +++ /dev/null @@ -1,41 +0,0 @@ 
-import unittest -import numpy as np -from audio_separator.separator.spec_utils import crop_center, preprocess, make_padding, wave_to_spectrogram, wave_to_spectrogram_mt - - -class TestSpecUtils(unittest.TestCase): - def test_preprocess(self): - X_spec = np.random.rand(10, 10) + 1j * np.random.rand(10, 10) - X_mag, X_phase = preprocess(X_spec) - self.assertEqual(X_mag.shape, X_spec.shape) - self.assertEqual(X_phase.shape, X_spec.shape) - - def test_make_padding(self): - width, cropsize, offset = 100, 50, 10 - left, right, roi_size = make_padding(width, cropsize, offset) - self.assertEqual(left, 10) - self.assertTrue(right >= left) - self.assertEqual(roi_size, 30) - - def test_preprocess_values(self): - X_spec = np.random.rand(10, 10) + 1j * np.random.rand(10, 10) - X_mag, X_phase = preprocess(X_spec) - self.assertTrue((X_mag >= 0).all()) - self.assertTrue((X_phase >= -np.pi).all() and (X_phase <= np.pi).all()) - - def test_make_padding_values(self): - width, cropsize, offset = 100, 50, 10 - left, right, roi_size = make_padding(width, cropsize, offset) - self.assertTrue(left >= 0) - self.assertTrue(right >= 0) - self.assertTrue(roi_size > 0) - - def test_preprocess_magnitude_phase(self): - X_spec = np.random.rand(5, 5) + 1j * np.random.rand(5, 5) - X_mag, X_phase = preprocess(X_spec) - self.assertTrue(np.all(X_mag >= 0)) - self.assertTrue(np.all(X_phase >= -np.pi) and np.all(X_phase <= np.pi)) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/unit/test_stft.py b/tests/unit/test_stft.py index 49ebe92..7ac92e5 100644 --- a/tests/unit/test_stft.py +++ b/tests/unit/test_stft.py @@ -2,7 +2,7 @@ import numpy as np import torch from unittest.mock import Mock, patch -from audio_separator.separator.stft import STFT +from audio_separator.separator.uvr_lib_v5.stft import STFT # Short-Time Fourier Transform (STFT) Process Overview: #