diff --git a/.gitignore b/.gitignore index 10d6a9f..23d1136 100644 --- a/.gitignore +++ b/.gitignore @@ -6,7 +6,8 @@ /tracks/ /lyrics/ /.cache/ -/models/*.onnx +*.onnx +*.pth *.wav *.flac *.mp3 diff --git a/README.md b/README.md index 10759fd..92ffd88 100644 --- a/README.md +++ b/README.md @@ -168,10 +168,10 @@ separator = Separator() separator.load_model() # Perform the separation on specific audio files without reloading the model -primary_stem_path, secondary_stem_path = separator.separate('audio1.wav') +primary_stem_output_path, secondary_stem_output_path = separator.separate('audio1.wav') -print(f'Primary stem saved at {primary_stem_path}') -print(f'Secondary stem saved at {secondary_stem_path}') +print(f'Primary stem saved at {primary_stem_output_path}') +print(f'Secondary stem saved at {secondary_stem_output_path}') ``` #### Batch processing, or processing with multiple models @@ -212,7 +212,7 @@ output_file_paths_6 = separator.separate('audio3.wav') - model_file_dir: (Optional) Directory to cache model files in. Default: /tmp/audio-separator-models/ - output_dir: (Optional) Directory where the separated files will be saved. If not specified, outputs to current dir. - output_format: (Optional) Format to encode output files, any common format (WAV, MP3, FLAC, M4A, etc.). Default: WAV -- denoise_enabled: (Optional) Flag to enable or disable denoising as part of the separation process. Default: True +- enable_denoise: (Optional) Flag to enable or disable denoising as part of the separation process. Default: True - normalization_enabled: (Optional) Flag to enable or disable normalization as part of the separation process. Default: False - output_single_stem: (Optional) Output only single stem, either instrumental or vocals. - invert_secondary_stem_using_spectogram=True, diff --git a/audio_separator/separator/architectures/__init__.py b/audio_separator/separator/architectures/__init__.py new file mode 100644 index 0000000..b76cc7d --- /dev/null +++ b/audio_separator/separator/architectures/__init__.py @@ -0,0 +1,3 @@ +from .mdx_separator import MDXSeparator +from .vr_separator import VRSeparator + diff --git a/audio_separator/separator/architectures/mdx_separator.py b/audio_separator/separator/architectures/mdx_separator.py new file mode 100644 index 0000000..a8acc41 --- /dev/null +++ b/audio_separator/separator/architectures/mdx_separator.py @@ -0,0 +1,426 @@ +"""Module for separating audio sources using MDX architecture models.""" + +import os +import torch +import librosa +import onnxruntime as ort +import numpy as np +import onnx2torch +from tqdm import tqdm +from audio_separator.separator.uvr_lib_v5 import spec_utils +from audio_separator.separator.uvr_lib_v5.stft import STFT +from audio_separator.separator.common_separator import CommonSeparator + + +class MDXSeparator(CommonSeparator): + """ + MDXSeparator is responsible for separating audio sources using MDX models. + It initializes with configuration parameters and prepares the model for separation tasks. 
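+    MDX-specific options (segment_size, overlap, batch_size and hop_length) are read from arch_config,
+    while settings shared across architectures are handled by CommonSeparator.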
+ """ + + def __init__(self, common_config, arch_config): + super().__init__(config=common_config) + + self.hop_length = arch_config.get("hop_length") + self.segment_size = arch_config.get("segment_size") + self.overlap = arch_config.get("overlap") + + # Initializing model parameters + self.compensate = self.model_data["compensate"] + self.dim_f = self.model_data["mdx_dim_f_set"] + self.dim_t = 2 ** self.model_data["mdx_dim_t_set"] + self.n_fft = self.model_data["mdx_n_fft_scale_set"] + + self.config_yaml = self.model_data.get("config_yaml", None) + + # Number of batches to be processed at a time. + # - Higher values mean more RAM usage but slightly faster processing times. + # - Lower values mean less RAM usage but slightly longer processing times. + # - Batch size value has no effect on output quality. + # BATCH_SIZE = ('1', ''2', '3', '4', '5', '6', '7', '8', '9', '10') + self.batch_size = arch_config.get("batch_size", 1) + + self.logger.debug(f"Model params: primary_stem={self.primary_stem_name}, secondary_stem={self.secondary_stem_name}") + self.logger.debug(f"Model params: batch_size={self.batch_size}, compensate={self.compensate}, segment_size={self.segment_size}, dim_f={self.dim_f}, dim_t={self.dim_t}") + self.logger.debug(f"Model params: n_fft={self.n_fft}, hop={self.hop_length}") + + # Loading the model for inference + self.logger.debug("Loading ONNX model for inference...") + if self.segment_size == self.dim_t: + ort_inference_session = ort.InferenceSession(self.model_path, providers=self.onnx_execution_provider) + self.model_run = lambda spek: ort_inference_session.run(None, {"input": spek.cpu().numpy()})[0] + self.logger.debug("Model loaded successfully using ONNXruntime inferencing session.") + else: + self.model_run = onnx2torch.convert(self.model_path) + self.model_run.to(self.torch_device).eval() + self.logger.warning("Model converted from onnx to pytorch due to segment size not matching dim_t, processing may be slower.") + + self.n_bins = 0 + self.trim = 0 + self.chunk_size = 0 + self.gen_size = 0 + self.stft = None + + self.primary_source = None + self.secondary_source = None + self.audio_file_path = None + self.audio_file_base = None + self.secondary_source_map = None + self.primary_source_map = None + + def separate(self, audio_file_path): + """ + Separates the audio file into primary and secondary sources based on the model's configuration. + It processes the mix, demixes it into sources, normalizes the sources, and saves the output files. + + Args: + audio_file_path (str): The path to the audio file to be processed. + + Returns: + list: A list of paths to the output files generated by the separation process. 
+ """ + self.primary_source = None + self.secondary_source = None + + self.audio_file_path = audio_file_path + self.audio_file_base = os.path.splitext(os.path.basename(audio_file_path))[0] + + # Prepare the mix for processing + self.logger.debug("Preparing mix...") + mix = self.prepare_mix(self.audio_file_path) + + self.logger.debug("Normalizing mix before demixing...") + mix = spec_utils.normalize(wave=mix, max_peak=self.normalization_threshold) + + # Start the demixing process + source = self.demix(mix) + self.logger.debug("Demixing completed.") + + # In UVR, the source is cached here if it's a vocal split model, but we're not supporting that yet + + # Initialize the list for output files + output_files = [] + self.logger.debug("Processing output files...") + + # Normalize and transpose the primary source if it's not already an array + if not isinstance(self.primary_source, np.ndarray): + self.logger.debug("Normalizing primary source...") + self.primary_source = spec_utils.normalize(wave=source, max_peak=self.normalization_threshold).T + + # Process the secondary source if not already an array + if not isinstance(self.secondary_source, np.ndarray): + self.logger.debug("Producing secondary source: demixing in match_mix mode") + raw_mix = self.demix(mix, is_match_mix=True) + + if self.invert_using_spec: + self.logger.debug("Inverting secondary stem using spectogram as invert_using_spec is set to True") + self.secondary_source = spec_utils.invert_stem(raw_mix, source) + else: + self.logger.debug("Inverting secondary stem by subtracting of transposed demixed stem from transposed original mix") + self.secondary_source = mix.T - source.T + + # Save and process the secondary stem if needed + if not self.output_single_stem or self.output_single_stem.lower() == self.secondary_stem_name.lower(): + self.logger.info(f"Saving {self.secondary_stem_name} stem...") + if not self.secondary_stem_output_path: + self.secondary_stem_output_path = os.path.join(f"{self.audio_file_base}_({self.secondary_stem_name})_{self.model_name}.{self.output_format.lower()}") + self.secondary_source_map = self.final_process(self.secondary_stem_output_path, self.secondary_source, self.secondary_stem_name) + output_files.append(self.secondary_stem_output_path) + + # Save and process the primary stem if needed + if not self.output_single_stem or self.output_single_stem.lower() == self.primary_stem_name.lower(): + self.logger.info(f"Saving {self.primary_stem_name} stem...") + if not self.primary_stem_output_path: + self.primary_stem_output_path = os.path.join(f"{self.audio_file_base}_({self.primary_stem_name})_{self.model_name}.{self.output_format.lower()}") + if not isinstance(self.primary_source, np.ndarray): + self.primary_source = source.T + self.primary_source_map = self.final_process(self.primary_stem_output_path, self.primary_source, self.primary_stem_name) + output_files.append(self.primary_stem_output_path) + + # Not yet implemented from UVR features: + # self.process_vocal_split_chain(secondary_sources) + # self.logger.debug("Vocal split chain processed.") + + return output_files + + def initialize_model_settings(self): + """ + This function sets up the necessary parameters for the model, like the number of frequency bins (n_bins), the trimming size (trim), + the size of each audio chunk (chunk_size), and the window function for spectral transformations (window). + It ensures that the model is configured with the correct settings for processing the audio data. 
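+        All of these values are derived from n_fft, hop_length and segment_size:
+        n_bins = n_fft // 2 + 1, trim = n_fft // 2, chunk_size = hop_length * (segment_size - 1), gen_size = chunk_size - 2 * trim.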
+ """ + self.logger.debug("Initializing model settings...") + + # n_bins is half the FFT size plus one (self.n_fft // 2 + 1). + self.n_bins = self.n_fft // 2 + 1 + + # trim is half the FFT size (self.n_fft // 2). + self.trim = self.n_fft // 2 + + # chunk_size is the hop_length size times the segment size minus one + self.chunk_size = self.hop_length * (self.segment_size - 1) + + # gen_size is the chunk size minus twice the trim size + self.gen_size = self.chunk_size - 2 * self.trim + + self.stft = STFT(self.logger, self.n_fft, self.hop_length, self.dim_f, self.torch_device) + + self.logger.debug(f"Model input params: n_fft={self.n_fft} hop_length={self.hop_length} dim_f={self.dim_f}") + self.logger.debug(f"Model settings: n_bins={self.n_bins}, trim={self.trim}, chunk_size={self.chunk_size}, gen_size={self.gen_size}") + + def initialize_mix(self, mix, is_ckpt=False): + """ + After prepare_mix segments the audio, initialize_mix further processes each segment. + It ensures each audio segment is in the correct format for the model, applies necessary padding, + and converts the segments into tensors for processing with the model. + This step is essential for preparing the audio data in a format that the neural network can process. + """ + # Log the initialization of the mix and whether checkpoint mode is used + self.logger.debug(f"Initializing mix with is_ckpt={is_ckpt}. Initial mix shape: {mix.shape}") + + # Ensure the mix is a 2-channel (stereo) audio signal + if mix.shape[0] != 2: + error_message = f"Expected a 2-channel audio signal, but got {mix.shape[0]} channels" + self.logger.error(error_message) + raise ValueError(error_message) + + # If in checkpoint mode, process the mix differently + if is_ckpt: + self.logger.debug("Processing in checkpoint mode...") + # Calculate padding based on the generation size and trim + pad = self.gen_size + self.trim - (mix.shape[-1] % self.gen_size) + self.logger.debug(f"Padding calculated: {pad}") + # Add padding at the beginning and the end of the mix + mixture = np.concatenate((np.zeros((2, self.trim), dtype="float32"), mix, np.zeros((2, pad), dtype="float32")), 1) + # Determine the number of chunks based on the mixture's length + num_chunks = mixture.shape[-1] // self.gen_size + self.logger.debug(f"Mixture shape after padding: {mixture.shape}, Number of chunks: {num_chunks}") + # Split the mixture into chunks + mix_waves = [mixture[:, i * self.gen_size : i * self.gen_size + self.chunk_size] for i in range(num_chunks)] + else: + # If not in checkpoint mode, process normally + self.logger.debug("Processing in non-checkpoint mode...") + mix_waves = [] + n_sample = mix.shape[1] + # Calculate necessary padding to make the total length divisible by the generation size + pad = self.gen_size - n_sample % self.gen_size + self.logger.debug(f"Number of samples: {n_sample}, Padding calculated: {pad}") + # Apply padding to the mix + mix_p = np.concatenate((np.zeros((2, self.trim)), mix, np.zeros((2, pad)), np.zeros((2, self.trim))), 1) + self.logger.debug(f"Shape of mix after padding: {mix_p.shape}") + + # Process the mix in chunks + i = 0 + while i < n_sample + pad: + waves = np.array(mix_p[:, i : i + self.chunk_size]) + mix_waves.append(waves) + self.logger.debug(f"Processed chunk {len(mix_waves)}: Start {i}, End {i + self.chunk_size}") + i += self.gen_size + + # Convert the list of wave chunks into a tensor for processing on the specified device + mix_waves_tensor = torch.tensor(mix_waves, dtype=torch.float32).to(self.torch_device) + 
self.logger.debug(f"Converted mix_waves to tensor. Tensor shape: {mix_waves_tensor.shape}") + + return mix_waves_tensor, pad + + def demix(self, mix, is_match_mix=False): + """ + Demixes the input mix into its constituent sources. If is_match_mix is True, the function adjusts the processing + to better match the mix, affecting chunk sizes and overlaps. The demixing process involves padding the mix, + processing it in chunks, applying windowing for overlaps, and accumulating the results to separate the sources. + """ + self.logger.debug(f"Starting demixing process with is_match_mix: {is_match_mix}...") + self.initialize_model_settings() + + # Preserves the original mix for later use. + # In UVR, this is used for the pitch fix and VR denoise processes, which aren't yet implemented here. + org_mix = mix + self.logger.debug(f"Original mix stored. Shape: {org_mix.shape}") + + # Initializes a list to store the separated waveforms. + tar_waves_ = [] + + # Handling different chunk sizes and overlaps based on the matching requirement. + if is_match_mix: + # Sets a smaller chunk size specifically for matching the mix. + chunk_size = self.hop_length * (self.segment_size - 1) + # Sets a small overlap for the chunks. + overlap = 0.02 + self.logger.debug(f"Chunk size for matching mix: {chunk_size}, Overlap: {overlap}") + else: + # Uses the regular chunk size defined in model settings. + chunk_size = self.chunk_size + # Uses the overlap specified in the model settings. + overlap = self.overlap + self.logger.debug(f"Standard chunk size: {chunk_size}, Overlap: {overlap}") + + # Calculates the generated size after subtracting the trim from both ends of the chunk. + gen_size = chunk_size - 2 * self.trim + self.logger.debug(f"Generated size calculated: {gen_size}") + + # Calculates padding to make the mix length a multiple of the generated size. + pad = gen_size + self.trim - ((mix.shape[-1]) % gen_size) + # Prepares the mixture with padding at the beginning and the end. + mixture = np.concatenate((np.zeros((2, self.trim), dtype="float32"), mix, np.zeros((2, pad), dtype="float32")), 1) + self.logger.debug(f"Mixture prepared with padding. Mixture shape: {mixture.shape}") + + # Calculates the step size for processing chunks based on the overlap. + step = int((1 - overlap) * chunk_size) + self.logger.debug(f"Step size for processing chunks: {step} as overlap is set to {overlap}.") + + # Initializes arrays to store the results and to account for overlap. + result = np.zeros((1, 2, mixture.shape[-1]), dtype=np.float32) + divider = np.zeros((1, 2, mixture.shape[-1]), dtype=np.float32) + + # Initializes counters for processing chunks. + total = 0 + total_chunks = (mixture.shape[-1] + step - 1) // step + self.logger.debug(f"Total chunks to process: {total_chunks}") + + # Processes each chunk of the mixture. + for i in tqdm(range(0, mixture.shape[-1], step)): + total += 1 + start = i + end = min(i + chunk_size, mixture.shape[-1]) + self.logger.debug(f"Processing chunk {total}/{total_chunks}: Start {start}, End {end}") + + # Handles windowing for overlapping chunks. + chunk_size_actual = end - start + window = None + if overlap != 0: + window = np.hanning(chunk_size_actual) + window = np.tile(window[None, None, :], (1, 2, 1)) + self.logger.debug("Window applied to the chunk.") + + # Zero-pad the chunk to prepare it for processing. 
+ mix_part_ = mixture[:, start:end] + if end != i + chunk_size: + pad_size = (i + chunk_size) - end + mix_part_ = np.concatenate((mix_part_, np.zeros((2, pad_size), dtype="float32")), axis=-1) + + # Converts the chunk to a tensor for processing. + mix_part = torch.tensor([mix_part_], dtype=torch.float32).to(self.torch_device) + # Splits the chunk into smaller batches if necessary. + mix_waves = mix_part.split(self.batch_size) + total_batches = len(mix_waves) + self.logger.debug(f"Mix part split into batches. Number of batches: {total_batches}") + + with torch.no_grad(): + # Processes each batch in the chunk. + batches_processed = 0 + for mix_wave in mix_waves: + batches_processed += 1 + self.logger.debug(f"Processing mix_wave batch {batches_processed}/{total_batches}") + + # Runs the model to separate the sources. + tar_waves = self.run_model(mix_wave, is_match_mix=is_match_mix) + + # Applies windowing if needed and accumulates the results. + if window is not None: + tar_waves[..., :chunk_size_actual] *= window + divider[..., start:end] += window + else: + divider[..., start:end] += 1 + + result[..., start:end] += tar_waves[..., : end - start] + + # Normalizes the results by the divider to account for overlap. + self.logger.debug("Normalizing result by dividing result by divider.") + tar_waves = result / divider + tar_waves_.append(tar_waves) + + # Reshapes the results to match the original dimensions. + tar_waves_ = np.vstack(tar_waves_)[:, :, self.trim : -self.trim] + tar_waves = np.concatenate(tar_waves_, axis=-1)[:, : mix.shape[-1]] + + # Extracts the source from the results. + source = tar_waves[:, 0:None] + self.logger.debug(f"Concatenated tar_waves. Shape: {tar_waves.shape}") + + # TODO: In UVR, pitch changing happens here. Consider implementing this as a feature. + + # Compensates the source if not matching the mix. + if not is_match_mix: + source *= self.compensate + self.logger.debug("Match mix mode; compensate multiplier applied.") + + # TODO: In UVR, VR denoise model gets applied here. Consider implementing this as a feature. + + self.logger.debug("Demixing process completed.") + return source + + def run_model(self, mix, is_match_mix=False): + """ + Processes the input mix through the model to separate the sources. + Applies STFT, handles spectrum modifications, and runs the model for source separation. + """ + # Applying the STFT to the mix. The mix is moved to the specified device (e.g., GPU) before processing. + # self.logger.debug(f"Running STFT on the mix. Mix shape before STFT: {mix.shape}") + spek = self.stft(mix.to(self.torch_device)) + self.logger.debug(f"STFT applied on mix. Spectrum shape: {spek.shape}") + + # Zeroing out the first 3 bins of the spectrum. This is often done to reduce low-frequency noise. + spek[:, :, :3, :] *= 0 + # self.logger.debug("First 3 bins of the spectrum zeroed out.") + + # Handling the case where the mix needs to be matched (is_match_mix = True) + if is_match_mix: + # self.logger.debug("Match mix mode is enabled. Converting spectrum to NumPy array.") + spec_pred = spek.cpu().numpy() + self.logger.debug("is_match_mix: spectrum prediction obtained directly from STFT output.") + else: + # If denoising is enabled, the model is run on both the negative and positive spectrums. 
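+            # Averaging the negated prediction for the inverted spectrum with the prediction for the original spectrum
+            # is intended to reduce noise in the output, at the cost of running the model twice.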
+ if self.enable_denoise: + # Assuming spek is a tensor and self.model_run can process it directly + spec_pred_neg = self.model_run(-spek) # Ensure this line correctly negates spek and runs the model + spec_pred_pos = self.model_run(spek) + # Ensure both spec_pred_neg and spec_pred_pos are tensors before applying operations + spec_pred = (spec_pred_neg * -0.5) + (spec_pred_pos * 0.5) # [invalid-unary-operand-type] + self.logger.debug("Model run on both negative and positive spectrums for denoising.") + else: + spec_pred = self.model_run(spek) + self.logger.debug("Model run on the spectrum without denoising.") + + # Applying the inverse STFT to convert the spectrum back to the time domain. + result = self.stft.inverse(torch.tensor(spec_pred).to(self.torch_device)).cpu().detach().numpy() + self.logger.debug(f"Inverse STFT applied. Returning result with shape: {result.shape}") + + return result + + def prepare_mix(self, mix): + """ + Prepares the mix for processing. This includes loading the audio from a file if necessary, + ensuring the mix is in the correct format, and converting mono to stereo if needed. + """ + # Store the original path or the mix itself for later checks + audio_path = mix + + # Check if the input is a file path (string) and needs to be loaded + if not isinstance(mix, np.ndarray): + self.logger.debug(f"Loading audio from file: {mix}") + mix, sr = librosa.load(mix, mono=False, sr=self.sample_rate) + self.logger.debug(f"Audio loaded. Sample rate: {sr}, Audio shape: {mix.shape}") + else: + # Transpose the mix if it's already an ndarray (expected shape: [channels, samples]) + self.logger.debug("Transposing the provided mix array.") + mix = mix.T + self.logger.debug(f"Transposed mix shape: {mix.shape}") + + # If the original input was a filepath, check if the loaded mix is empty + if isinstance(audio_path, str): + if not np.any(mix): + error_msg = f"Audio file {audio_path} is empty or not valid" + self.logger.error(error_msg) + raise ValueError(error_msg) + else: + self.logger.debug("Audio file is valid and contains data.") + + # Ensure the mix is in stereo format + if mix.ndim == 1: + self.logger.debug("Mix is mono. Converting to stereo.") + mix = np.asfortranarray([mix, mix]) + self.logger.debug("Converted to stereo mix.") + + # Final log indicating successful preparation of the mix + self.logger.debug("Mix preparation completed.") + return mix diff --git a/audio_separator/separator/architectures/vr_separator.py b/audio_separator/separator/architectures/vr_separator.py new file mode 100644 index 0000000..2bb0cf3 --- /dev/null +++ b/audio_separator/separator/architectures/vr_separator.py @@ -0,0 +1,337 @@ +"""Module for separating audio sources using VR architecture models.""" + +import os +import math + +import torch +import librosa +import numpy as np +from tqdm import tqdm + +# Check if we really need the rerun_mp3 function, remove if not +import audioread + +from audio_separator.separator.common_separator import CommonSeparator +from audio_separator.separator.uvr_lib_v5 import spec_utils +from audio_separator.separator.uvr_lib_v5.vr_network import nets +from audio_separator.separator.uvr_lib_v5.vr_network import nets_new +from audio_separator.separator.uvr_lib_v5.vr_network.model_param_init import ModelParameters + + +class VRSeparator(CommonSeparator): + """ + VRSeparator is responsible for separating audio sources using VR models. + It initializes with configuration parameters and prepares the model for separation tasks. 
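+    VR-specific options (batch_size, window_size, aggression, enable_tta, enable_post_process,
+    post_process_threshold and high_end_process) are read from arch_config, while settings shared
+    across architectures are handled by CommonSeparator.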
+ """ + + def __init__(self, common_config, arch_config: dict): + # Any configuration values which can be shared between architectures should be set already in CommonSeparator, + # e.g. user-specified functionality choices (self.output_single_stem) or common model parameters (self.primary_stem_name) + super().__init__(config=common_config) + + # Model data is basic overview metadata about the model, e.g. which stem is primary and whether it's a karaoke model + # It's loaded in from model_data_new.json in Separator.load_model and there are JSON examples in that method + # The instance variable self.model_data is passed through from Separator and set in CommonSeparator + self.logger.debug(f"Model data: {self.model_data}") + + # Most of the VR models use the same number of output channels, but the VR 51 models have specific values set in model_data JSON + self.model_capacity = 32, 128 + self.is_vr_51_model = False + + if "nout" in self.model_data.keys() and "nout_lstm" in self.model_data.keys(): + self.model_capacity = self.model_data["nout"], self.model_data["nout_lstm"] + self.is_vr_51_model = True + + # Model params are additional technical parameter values from JSON files in separator/uvr_lib_v5/vr_network/modelparams/*.json, + # with filenames referenced by the model_data["vr_model_param"] value + package_root_filepath = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + vr_params_json_dir = os.path.join(package_root_filepath, "uvr_lib_v5", "vr_network", "modelparams") + vr_params_json_filename = f"{self.model_data['vr_model_param']}.json" + vr_params_json_filepath = os.path.join(vr_params_json_dir, vr_params_json_filename) + self.model_params = ModelParameters(vr_params_json_filepath) + + self.logger.debug(f"Model params: {self.model_params.param}") + + # Arch Config is the VR architecture specific user configuration options, which should all be configurable by the user + # either by their Separator class instantiation or by passing in a CLI parameter. + # While there are similarities between architectures for some of these (e.g. batch_size), they are deliberately configured + # this way as they have architecture-specific default values. + + # This option performs Test-Time-Augmentation to improve the separation quality. + # Note: Having this selected will increase the time it takes to complete a conversion + self.enable_tta = arch_config.get("enable_tta", False) + + # This option can potentially identify leftover instrumental artifacts within the vocal outputs. \nThis option may improve the separation of some songs. + # Note: Selecting this option can adversely affect the conversion process, depending on the track. Because of this, it is only recommended as a last resort. + self.enable_post_process = arch_config.get("enable_post_process", False) + + # post_process_threshold values = ('0.1', '0.2', '0.3') + self.post_process_threshold = arch_config.get("post_process_threshold", 0.2) + + # Number of batches to be processed at a time. + # - Higher values mean more RAM usage but slightly faster processing times. + # - Lower values mean less RAM usage but slightly longer processing times. + # - Batch size value has no effect on output quality. 
+ + # Andrew note: for some reason, lower batch sizes seem to cause broken output for VR arch; need to investigate why + self.batch_size = arch_config.get("batch_size", 16) + + # 'Select window size to balance quality and speed:\n\n' + # '• 1024 - Quick but lesser quality.\n' + # '• 512 - Medium speed and quality.\n' + # '• 320 - Takes longer but may offer better quality.' + self.window_size = arch_config.get("window_size", 512) + + # The application will mirror the missing frequency range of the output. + self.high_end_process = arch_config.get("high_end_process", False) + self.input_high_end_h = None + self.input_high_end = None + + # Adjust the intensity of primary stem extraction: + # - Ranges from -100 - 100. + # - Bigger values mean deeper extractions. + # - Typically, it's set to 5 for vocals & instrumentals. + # - Values beyond 5 might muddy the sound for non-vocal models. + self.aggression = arch_config.get("aggression", 5) + + self.aggressiveness = {"value": self.aggression, "split_bin": self.model_params.param["band"][1]["crop_stop"], "aggr_correction": self.model_params.param.get("aggr_correction")} + + self.model_samplerate = self.model_params.param["sr"] + + self.logger.debug(f"VR arch params: enable_tta={self.enable_tta}, enable_post_process={self.enable_post_process}, post_process_threshold={self.post_process_threshold}") + self.logger.debug(f"VR arch params: batch_size={self.batch_size}, window_size={self.window_size}") + self.logger.debug(f"VR arch params: high_end_process={self.high_end_process}, aggression={self.aggression}") + self.logger.debug(f"VR arch params: is_vr_51_model={self.is_vr_51_model}, model_samplerate={self.model_samplerate}, model_capacity={self.model_capacity}") + + self.model_run = lambda *args, **kwargs: self.logger.error("Model run method is not initialised yet.") + + # This should go away once we refactor to remove soundfile.write and replace with pydub like we did for the MDX rewrite + self.wav_subtype = "PCM_16" + + self.logger.info("VR Separator initialisation complete") + + def separate(self, audio_file_path): + """ + Separates the audio file into primary and secondary sources based on the model's configuration. + It processes the mix, demixes it into sources, normalizes the sources, and saves the output files. + + Args: + audio_file_path (str): The path to the audio file to be processed. + + Returns: + list: A list of paths to the output files generated by the separation process. 
+ """ + self.primary_source = None + self.secondary_source = None + + self.audio_file_path = audio_file_path + self.audio_file_base = os.path.splitext(os.path.basename(audio_file_path))[0] + + self.logger.debug("Starting inference...") + + nn_arch_sizes = [31191, 33966, 56817, 123821, 123812, 129605, 218409, 537238, 537227] # default + vr_5_1_models = [56817, 218409] + model_size = math.ceil(os.stat(self.model_path).st_size / 1024) + nn_arch_size = min(nn_arch_sizes, key=lambda x: abs(x - model_size)) + self.logger.debug(f"Model size determined: {model_size}, NN architecture size: {nn_arch_size}") + + if nn_arch_size in vr_5_1_models or self.is_vr_51_model: + self.logger.debug("Using CascadedNet for VR 5.1 model...") + self.model_run = nets_new.CascadedNet(self.model_params.param["bins"] * 2, nn_arch_size, nout=self.model_capacity[0], nout_lstm=self.model_capacity[1]) + self.is_vr_51_model = True + else: + self.logger.debug("Determining model capacity...") + self.model_run = nets.determine_model_capacity(self.model_params.param["bins"] * 2, nn_arch_size) + + self.model_run.load_state_dict(torch.load(self.model_path, map_location=self.torch_device_cpu)) + self.model_run.to(self.torch_device) + self.logger.debug("Model loaded and moved to device.") + + y_spec, v_spec = self.inference_vr(self.loading_mix(), self.torch_device, self.aggressiveness) + self.logger.debug("Inference completed.") + + # Not yet implemented from UVR features: + # + # if not self.is_vocal_split_model: + # self.cache_source((y_spec, v_spec)) + + # if self.is_secondary_model_activated and self.secondary_model: + # self.logger.debug("Processing secondary model...") + # self.secondary_source_primary, self.secondary_source_secondary = process_secondary_model( + # self.secondary_model, self.process_data, main_process_method=self.process_method, main_model_primary=self.primary_stem + # ) + + # Initialize the list for output files + output_files = [] + self.logger.debug("Processing output files...") + + # Save and process the primary stem if needed + if not self.output_single_stem or self.output_single_stem.lower() == self.primary_stem_name.lower(): + self.logger.info(f"Saving {self.primary_stem_name} stem...") + if not self.primary_stem_output_path: + self.primary_stem_output_path = os.path.join(f"{self.audio_file_base}_({self.primary_stem_name})_{self.model_name}.{self.output_format.lower()}") + + if not isinstance(self.primary_source, np.ndarray): + self.primary_source = self.spec_to_wav(y_spec).T + self.logger.debug("Converting primary source spectrogram to waveform.") + if not self.model_samplerate == 44100: + self.primary_source = librosa.resample(self.primary_source.T, orig_sr=self.model_samplerate, target_sr=44100).T + self.logger.debug("Resampling primary source to 44100Hz.") + + self.primary_source_map = self.final_process(self.primary_stem_output_path, self.primary_source, self.primary_stem_name) + self.logger.debug("Primary stem processed.") + output_files.append(self.primary_stem_output_path) + + # Save and process the secondary stem if needed + if not self.output_single_stem or self.output_single_stem.lower() == self.secondary_stem_name.lower(): + self.logger.info(f"Saving {self.secondary_stem_name} stem...") + if not self.secondary_stem_output_path: + self.secondary_stem_output_path = os.path.join(f"{self.audio_file_base}_({self.secondary_stem_name})_{self.model_name}.{self.output_format.lower()}") + + self.logger.debug(f"Processing secondary stem: {self.secondary_stem_name}") + if not 
isinstance(self.secondary_source, np.ndarray): + self.secondary_source = self.spec_to_wav(v_spec).T + self.logger.debug("Converting secondary source spectrogram to waveform.") + if not self.model_samplerate == 44100: + self.secondary_source = librosa.resample(self.secondary_source.T, orig_sr=self.model_samplerate, target_sr=44100).T + self.logger.debug("Resampling secondary source to 44100Hz.") + + self.secondary_source_map = self.final_process(self.secondary_stem_output_path, self.secondary_source, self.secondary_stem_name) + self.logger.debug("Secondary stem processed.") + output_files.append(self.secondary_stem_output_path) + + # Not yet implemented from UVR features: + # self.process_vocal_split_chain(secondary_sources) + # self.logger.debug("Vocal split chain processed.") + + return output_files + + def loading_mix(self): + X_wave, X_spec_s = {}, {} + + bands_n = len(self.model_params.param["band"]) + + audio_file = spec_utils.write_array_to_mem(self.audio_file_path, subtype=self.wav_subtype) + is_mp3 = audio_file.endswith(".mp3") if isinstance(audio_file, str) else False + + self.logger.debug(f"loading_mix iteraring through {bands_n} bands") + for d in tqdm(range(bands_n, 0, -1)): + bp = self.model_params.param["band"][d] + + wav_resolution = bp["res_type"] + + if self.torch_device_mps is not None: + wav_resolution = "polyphase" + + if d == bands_n: # high-end band + X_wave[d], _ = librosa.load(audio_file, sr=bp["sr"], mono=False, dtype=np.float32, res_type=wav_resolution) + X_spec_s[d] = spec_utils.wave_to_spectrogram(X_wave[d], bp["hl"], bp["n_fft"], self.model_params, band=d, is_v51_model=self.is_vr_51_model) + + if not np.any(X_wave[d]) and is_mp3: + X_wave[d] = rerun_mp3(audio_file, bp["sr"]) + + if X_wave[d].ndim == 1: + X_wave[d] = np.asarray([X_wave[d], X_wave[d]]) + else: # lower bands + X_wave[d] = librosa.resample(X_wave[d + 1], orig_sr=self.model_params.param["band"][d + 1]["sr"], target_sr=bp["sr"], res_type=wav_resolution) + X_spec_s[d] = spec_utils.wave_to_spectrogram(X_wave[d], bp["hl"], bp["n_fft"], self.model_params, band=d, is_v51_model=self.is_vr_51_model) + + if d == bands_n and self.high_end_process: + self.input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (self.model_params.param["pre_filter_stop"] - self.model_params.param["pre_filter_start"]) + self.input_high_end = X_spec_s[d][:, bp["n_fft"] // 2 - self.input_high_end_h : bp["n_fft"] // 2, :] + + X_spec = spec_utils.combine_spectrograms(X_spec_s, self.model_params, is_v51_model=self.is_vr_51_model) + + del X_wave, X_spec_s, audio_file + + return X_spec + + def inference_vr(self, X_spec, device, aggressiveness): + def _execute(X_mag_pad, roi_size): + X_dataset = [] + patches = (X_mag_pad.shape[2] - 2 * self.model_run.offset) // roi_size + + self.logger.debug(f"inference_vr appending to X_dataset for each of {patches} patches") + for i in tqdm(range(patches)): + start = i * roi_size + X_mag_window = X_mag_pad[:, :, start : start + self.window_size] + X_dataset.append(X_mag_window) + + total_iterations = patches // self.batch_size if not self.enable_tta else (patches // self.batch_size) * 2 + self.logger.debug(f"inference_vr iterating through {total_iterations} batches, batch_size = {self.batch_size}") + + X_dataset = np.asarray(X_dataset) + self.model_run.eval() + with torch.no_grad(): + mask = [] + + for i in tqdm(range(0, patches, self.batch_size)): + + X_batch = X_dataset[i : i + self.batch_size] + X_batch = torch.from_numpy(X_batch).to(device) + pred = self.model_run.predict_mask(X_batch) + if not 
pred.size()[3] > 0: + raise ValueError(f"Window size error: h1_shape[3] must be greater than h2_shape[3]") + pred = pred.detach().cpu().numpy() + pred = np.concatenate(pred, axis=2) + mask.append(pred) + if len(mask) == 0: + raise ValueError(f"Window size error: h1_shape[3] must be greater than h2_shape[3]") + + mask = np.concatenate(mask, axis=2) + return mask + + def postprocess(mask, X_mag, X_phase): + is_non_accom_stem = False + for stem in CommonSeparator.NON_ACCOM_STEMS: + if stem == self.primary_stem_name: + is_non_accom_stem = True + + mask = spec_utils.adjust_aggr(mask, is_non_accom_stem, aggressiveness) + + if self.enable_post_process: + mask = spec_utils.merge_artifacts(mask, thres=self.post_process_threshold) + + y_spec = mask * X_mag * np.exp(1.0j * X_phase) + v_spec = (1 - mask) * X_mag * np.exp(1.0j * X_phase) + + return y_spec, v_spec + + X_mag, X_phase = spec_utils.preprocess(X_spec) + n_frame = X_mag.shape[2] + pad_l, pad_r, roi_size = spec_utils.make_padding(n_frame, self.window_size, self.model_run.offset) + X_mag_pad = np.pad(X_mag, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") + X_mag_pad /= X_mag_pad.max() + mask = _execute(X_mag_pad, roi_size) + + if self.enable_tta: + pad_l += roi_size // 2 + pad_r += roi_size // 2 + X_mag_pad = np.pad(X_mag, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") + X_mag_pad /= X_mag_pad.max() + mask_tta = _execute(X_mag_pad, roi_size) + mask_tta = mask_tta[:, :, roi_size // 2 :] + mask = (mask[:, :, :n_frame] + mask_tta[:, :, :n_frame]) * 0.5 + else: + mask = mask[:, :, :n_frame] + + y_spec, v_spec = postprocess(mask, X_mag, X_phase) + + return y_spec, v_spec + + def spec_to_wav(self, spec): + if self.high_end_process and isinstance(self.input_high_end, np.ndarray) and self.input_high_end_h: + input_high_end_ = spec_utils.mirroring("mirroring", spec, self.input_high_end, self.model_params) + wav = spec_utils.cmb_spectrogram_to_wave(spec, self.model_params, self.input_high_end_h, input_high_end_, is_v51_model=self.is_vr_51_model) + else: + wav = spec_utils.cmb_spectrogram_to_wave(spec, self.model_params, is_v51_model=self.is_vr_51_model) + + return wav + + +# Check if we really need the rerun_mp3 function, refactor or remove if not +def rerun_mp3(audio_file, sample_rate=44100): + with audioread.audio_open(audio_file) as f: + track_length = int(f.duration) + + return librosa.load(audio_file, duration=track_length, mono=False, sr=sample_rate)[0] diff --git a/audio_separator/separator/common_separator.py b/audio_separator/separator/common_separator.py new file mode 100644 index 0000000..297e6fa --- /dev/null +++ b/audio_separator/separator/common_separator.py @@ -0,0 +1,232 @@ +""" This file contains the CommonSeparator class, common to all architecture-specific Separator classes. """ + +from logging import Logger +import os +import numpy as np +from pydub import AudioSegment +from audio_separator.separator.uvr_lib_v5 import spec_utils + + +class CommonSeparator: + """ + This class contains the common methods and attributes common to all architecture-specific Separator classes. 
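+    It holds the configuration shared by all architectures (devices, model metadata, output paths and format,
+    normalization and stem options) and provides the final_process, write_audio and source caching helpers
+    used by the architecture-specific separators.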
+ """ + + ALL_STEMS = "All Stems" + VOCAL_STEM = "Vocals" + INST_STEM = "Instrumental" + OTHER_STEM = "Other" + BASS_STEM = "Bass" + DRUM_STEM = "Drums" + GUITAR_STEM = "Guitar" + PIANO_STEM = "Piano" + SYNTH_STEM = "Synthesizer" + STRINGS_STEM = "Strings" + WOODWINDS_STEM = "Woodwinds" + BRASS_STEM = "Brass" + WIND_INST_STEM = "Wind Inst" + NO_OTHER_STEM = "No Other" + NO_BASS_STEM = "No Bass" + NO_DRUM_STEM = "No Drums" + NO_GUITAR_STEM = "No Guitar" + NO_PIANO_STEM = "No Piano" + NO_SYNTH_STEM = "No Synthesizer" + NO_STRINGS_STEM = "No Strings" + NO_WOODWINDS_STEM = "No Woodwinds" + NO_WIND_INST_STEM = "No Wind Inst" + NO_BRASS_STEM = "No Brass" + PRIMARY_STEM = "Primary Stem" + SECONDARY_STEM = "Secondary Stem" + LEAD_VOCAL_STEM = "lead_only" + BV_VOCAL_STEM = "backing_only" + LEAD_VOCAL_STEM_I = "with_lead_vocals" + BV_VOCAL_STEM_I = "with_backing_vocals" + LEAD_VOCAL_STEM_LABEL = "Lead Vocals" + BV_VOCAL_STEM_LABEL = "Backing Vocals" + + NON_ACCOM_STEMS = (VOCAL_STEM, OTHER_STEM, BASS_STEM, DRUM_STEM, GUITAR_STEM, PIANO_STEM, SYNTH_STEM, STRINGS_STEM, WOODWINDS_STEM, BRASS_STEM, WIND_INST_STEM) + + def __init__(self, config): + + self.logger: Logger = config.get("logger") + + # Inferencing device / acceleration config + self.torch_device = config.get("torch_device") + self.torch_device_cpu = config.get("torch_device_cpu") + self.torch_device_mps = config.get("torch_device_mps") + self.onnx_execution_provider = config.get("onnx_execution_provider") + + # Model data + self.model_name = config.get("model_name") + self.model_path = config.get("model_path") + self.model_data = config.get("model_data") + + # Optional custom output paths for the primary and secondary stems + # If left as None, the arch-specific class decides the output filename, e.g. something like: + # f"{self.audio_file_base}_({self.primary_stem_name})_{self.model_name}.{self.output_format.lower()}" + self.primary_stem_output_path = config.get("primary_stem_output_path") + self.secondary_stem_output_path = config.get("secondary_stem_output_path") + + # Output directory and format + self.output_dir = config.get("output_dir") + self.output_format = config.get("output_format") + + # Functional options which are applicable to all architectures and the user may tweak to affect the output + self.normalization_threshold = config.get("normalization_threshold") + self.enable_denoise = config.get("enable_denoise") + self.output_single_stem = config.get("output_single_stem") + self.invert_using_spec = config.get("invert_using_spec") + self.sample_rate = config.get("sample_rate") + + # Model specific properties + self.primary_stem_name = self.model_data["primary_stem"] + self.secondary_stem_name = "Vocals" if self.primary_stem_name == "Instrumental" else "Instrumental" + self.is_karaoke = self.model_data.get("is_karaoke", False) + self.is_bv_model = self.model_data.get("is_bv_model", False) + self.bv_model_rebalance = self.model_data.get("is_bv_model_rebalanced", 0) + + # In UVR, these variables are set but either aren't useful or are better handled in audio-separator. + # Leaving these comments explaining to help myself or future developers understand why these aren't in audio-separator. + + # "chunks" is not actually used for anything in UVR... + # self.chunks = 0 + + # "adjust" is hard-coded to 1 in UVR, and only used as a multiplier in run_model, so it does nothing. + # self.adjust = 1 + + # "hop" is hard-coded to 1024 in UVR. 
We have a "hop_length" parameter instead + # self.hop = 1024 + + # "margin" maps to sample rate and is set from the GUI in UVR (default: 44100). We have a "sample_rate" parameter instead. + # self.margin = 44100 + + # "dim_c" is hard-coded to 4 in UVR, seems to be a parameter for the number of channels, and is only used for checkpoint models. + # We haven't implemented support for the checkpoint models here, so we're not using it. + # self.dim_c = 4 + + self.logger.debug(f"Common params: model_name={self.model_name}, model_path={self.model_path}") + self.logger.debug(f"Common params: primary_stem_output_path={self.primary_stem_output_path}, secondary_stem_output_path={self.secondary_stem_output_path}") + self.logger.debug(f"Common params: output_dir={self.output_dir}, output_format={self.output_format}") + self.logger.debug(f"Common params: normalization_threshold={self.normalization_threshold}") + self.logger.debug(f"Common params: enable_denoise={self.enable_denoise}, output_single_stem={self.output_single_stem}") + self.logger.debug(f"Common params: invert_using_spec={self.invert_using_spec}, sample_rate={self.sample_rate}") + + self.logger.debug(f"Common params: primary_stem_name={self.primary_stem_name}, secondary_stem_name={self.secondary_stem_name}") + self.logger.debug(f"Common params: is_karaoke={self.is_karaoke}, is_bv_model={self.is_bv_model}, bv_model_rebalance={self.bv_model_rebalance}") + + self.cached_sources_map = {} + + def separate(self, audio_file_path): + """ + Placeholder method for separating audio sources. Should be overridden by subclasses. + """ + raise NotImplementedError("This method should be overridden by subclasses.") + + def final_process(self, stem_path, source, stem_name): + """ + Finalizes the processing of a stem by writing the audio to a file and returning the processed source. + """ + self.logger.debug(f"Finalizing {stem_name} stem processing and writing audio...") + self.write_audio(stem_path, source) + + return {stem_name: source} + + def cached_sources_clear(self): + """ + Clears the cache dictionaries for VR, MDX, and Demucs models. + + This function is essential for ensuring that the cache does not hold outdated or irrelevant data + between different processing sessions or when a new batch of audio files is processed. + It helps in managing memory efficiently and prevents potential errors due to stale data. + """ + self.cached_sources_map = {} + + def cached_source_callback(self, model_architecture, model_name=None): + """ + Retrieves the model and sources from the cache based on the processing method and model name. + + Args: + model_architecture: The architecture type (VR, MDX, or Demucs) being used for processing. + model_name: The specific model name within the architecture type, if applicable. + + Returns: + A tuple containing the model and its sources if found in the cache; otherwise, None. + + This function is crucial for optimizing performance by avoiding redundant processing. + If the requested model and its sources are already in the cache, they can be reused directly, + saving time and computational resources. + """ + model, sources = None, None + + mapper = self.cached_sources_map[model_architecture] + + for key, value in mapper.items(): + if model_name in key: + model = key + sources = value + + return model, sources + + def cached_model_source_holder(self, model_architecture, sources, model_name=None): + """ + Update the dictionary for the given model_architecture with the new model name and its sources. 
+ Use the model_architecture as a key to access the corresponding cache source mapper dictionary. + """ + self.cached_sources_map[model_architecture] = {**self.cached_sources_map.get(model_architecture, {}), **{model_name: sources}} + + def write_audio(self, stem_path: str, stem_source): + """ + Writes the separated audio source to a file. + """ + self.logger.debug(f"Entering write_audio with stem_path: {stem_path}") + + stem_source = spec_utils.normalize(wave=stem_source, max_peak=self.normalization_threshold) + + # Check if the numpy array is empty or contains very low values + if np.max(np.abs(stem_source)) < 1e-6: + self.logger.warning("Warning: stem_source array is near-silent or empty.") + return + + # If output_dir is specified, create it and join it with stem_path + if self.output_dir: + os.makedirs(self.output_dir, exist_ok=True) + stem_path = os.path.join(self.output_dir, stem_path) + + self.logger.debug(f"Audio data shape before processing: {stem_source.shape}") + self.logger.debug(f"Data type before conversion: {stem_source.dtype}") + + # Ensure the audio data is in the correct format (e.g., int16) + if stem_source.dtype != np.int16: + stem_source = (stem_source * 32767).astype(np.int16) + self.logger.debug("Converted stem_source to int16.") + + # Correctly interleave stereo channels + stem_source_interleaved = np.empty((2 * stem_source.shape[0],), dtype=np.int16) + stem_source_interleaved[0::2] = stem_source[:, 0] # Left channel + stem_source_interleaved[1::2] = stem_source[:, 1] # Right channel + + self.logger.debug(f"Interleaved audio data shape: {stem_source_interleaved.shape}") + + # Create a pydub AudioSegment + try: + audio_segment = AudioSegment(stem_source_interleaved.tobytes(), frame_rate=self.sample_rate, sample_width=stem_source.dtype.itemsize, channels=2) + self.logger.debug("Created AudioSegment successfully.") + except (IOError, ValueError) as e: + self.logger.error(f"Specific error creating AudioSegment: {e}") + return + + # Determine file format based on the file extension + file_format = stem_path.lower().split(".")[-1] + + # For m4a files, specify mp4 as the container format as the extension doesn't match the format name + if file_format == "m4a": + file_format = "mp4" + elif file_format == "mka": + file_format = "matroska" + + # Export using the determined format + try: + audio_segment.export(stem_path, format=file_format) + self.logger.debug(f"Exported audio file successfully to {stem_path}") + except (IOError, ValueError) as e: + self.logger.error(f"Error exporting audio file: {e}") diff --git a/audio_separator/separator/separator.py b/audio_separator/separator/separator.py index 2b4263c..e8c009f 100644 --- a/audio_separator/separator/separator.py +++ b/audio_separator/separator/separator.py @@ -1,3 +1,6 @@ +""" This file contains the Separator class, to facilitate the separation of stems from audio. """ + +from importlib import metadata import os import gc import platform @@ -8,36 +11,67 @@ import warnings import requests import torch -import librosa -import numpy as np import onnxruntime as ort -from importlib import metadata -from onnx2torch import convert -from pydub import AudioSegment -from audio_separator.separator import spec_utils -from audio_separator.separator.stft import STFT -from tqdm import tqdm +from audio_separator.separator.architectures import MDXSeparator, VRSeparator + class Separator: + """ + The Separator class is designed to facilitate the separation of audio sources from a given audio file. 
+ It supports various separation architectures and models, including MDX and VR. The class provides + functionalities to configure separation parameters, load models, and perform audio source separation. + It also handles logging, normalization, and output formatting of the separated audio stems. + + The actual separation task is handled by one of the architecture-specific classes in the `architectures` module; + this class is responsible for initialising logging, configuring hardware acceleration, loading the model, + initiating the separation process and passing outputs back to the caller. + + Common Attributes: + log_level (int): The logging level. + log_formatter (logging.Formatter): The logging formatter. + model_file_dir (str): The directory where model files are stored. + output_dir (str): The directory where output files will be saved. + primary_stem_output_path (str): The path for saving the primary stem. + secondary_stem_output_path (str): The path for saving the secondary stem. + output_format (str): The format of the output audio file. + normalization_threshold (float): The threshold for audio normalization. + enable_denoise (bool): Flag to enable or disable denoising. + output_single_stem (str): Option to output a single stem. + invert_using_spec (bool): Flag to invert using spectrogram. + sample_rate (int): The sample rate of the audio. + + MDX Architecture Specific Attributes: + hop_length (int): The hop length for STFT. + segment_size (int): The segment size for processing. + overlap (float): The overlap between segments. + batch_size (int): The batch size for processing. + + VR Architecture Specific Attributes & Defaults: + batch_size: 16 + window_size: 512 + aggression: 5 + enable_tta: False + enable_post_process: False + post_process_threshold: 0.2 + high_end_process: False + """ + def __init__( self, log_level=logging.DEBUG, log_formatter=None, model_file_dir="/tmp/audio-separator-models/", output_dir=None, - primary_stem_path=None, - secondary_stem_path=None, + primary_stem_output_path=None, + secondary_stem_output_path=None, output_format="WAV", - output_subtype=None, normalization_threshold=0.9, - denoise_enabled=False, + enable_denoise=False, output_single_stem=None, invert_using_spec=False, sample_rate=44100, - hop_length=1024, - segment_size=256, - overlap=0.25, - batch_size=1, + mdx_params={"hop_length": 1024, "segment_size": 256, "overlap": 0.25, "batch_size": 1}, + vr_params={"batch_size": 16, "window_size": 512, "aggression": 5, "enable_tta": False, "enable_post_process": False, "post_process_threshold": 0.2, "high_end_process": False}, ): self.logger = logging.getLogger(__name__) self.logger.setLevel(log_level) @@ -64,143 +98,148 @@ def __init__( self.model_file_dir = model_file_dir self.output_dir = output_dir - self.primary_stem_path = primary_stem_path - self.secondary_stem_path = secondary_stem_path + + # Allow the user to specify the output paths for the primary and secondary stems + # If left as None, the arch-specific class decides the output filename, typically e.g. 
something like: + # f"{self.audio_file_base}_({self.primary_stem_name})_{self.model_name}.{self.output_format.lower()}" + self.primary_stem_output_path = primary_stem_output_path + self.secondary_stem_output_path = secondary_stem_output_path # Create the model directory if it does not exist os.makedirs(self.model_file_dir, exist_ok=True) - self.output_subtype = output_subtype self.output_format = output_format if self.output_format is None: self.output_format = "WAV" - if self.output_subtype is None and output_format == "WAV": - self.output_subtype = "PCM_16" - self.normalization_threshold = normalization_threshold - self.logger.debug( - f"Normalization threshold set to {normalization_threshold}, waveform will lowered to this max amplitude to avoid clipping." - ) + self.logger.debug(f"Normalization threshold set to {normalization_threshold}, waveform will lowered to this max amplitude to avoid clipping.") - self.denoise_enabled = denoise_enabled - if self.denoise_enabled: + self.enable_denoise = enable_denoise + if self.enable_denoise: self.logger.debug(f"Denoising enabled, model will be run twice to reduce noise in output audio.") else: - self.logger.debug( - f"Denoising disabled, model will only be run once. This is twice as fast, but may result in noisier output audio." - ) + self.logger.debug(f"Denoising disabled, model will only be run once. This is twice as fast, but may result in noisier output audio.") self.output_single_stem = output_single_stem if output_single_stem is not None: if output_single_stem.lower() not in {"instrumental", "vocals"}: - raise Exception("output_single_stem must be either 'instrumental' or 'vocals'") + raise ValueError("output_single_stem must be either 'instrumental' or 'vocals'") self.logger.debug(f"Single stem output requested, only one output file ({output_single_stem}) will be written") self.invert_using_spec = invert_using_spec if self.invert_using_spec: - self.logger.debug( - f"Secondary step will be inverted using spectogram rather than waveform. This may improve quality, but is slightly slower." - ) + self.logger.debug(f"Secondary step will be inverted using spectogram rather than waveform. This may improve quality, but is slightly slower.") self.sample_rate = sample_rate - self.hop_length = hop_length - self.segment_size = segment_size - self.overlap = overlap - self.batch_size = batch_size - self.logger.debug( - f"Separation settings set: sample_rate={self.sample_rate}, hop_length={self.hop_length}, segment_size={self.segment_size}, overlap={self.overlap}, batch_size={self.batch_size}" - ) - self.setup_inferencing_device() + # These are parameters which users may want to configure so we expose them to the top-level Separator class, + # even though they are specific to a single model architecture + self.arch_specific_params = {"MDX": mdx_params, "VR": vr_params} + + self.torch_device = None + self.torch_device_cpu = None + self.torch_device_mps = None + + self.onnx_execution_provider = None + self.model_instance = None + self.audio_file_path = None + self.audio_file_base = None + self.primary_source = None + self.secondary_source = None + + self.setup_accelerated_inferencing_device() - def setup_inferencing_device(self): - self.logger.info(f"Checking hardware specifics to configure acceleration") + def setup_accelerated_inferencing_device(self): + """ + This method sets up the PyTorch and/or ONNX Runtime inferencing device, using GPU hardware acceleration if available. 
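+        It logs system and ONNX Runtime package information before selecting the Torch device and ONNX execution provider.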
+ """ + self.log_system_info() + self.log_onnxruntime_packages() + self.setup_torch_device() + def log_system_info(self): + """ + This method logs the system information, including the operating system, CPU archutecture and Python version + """ os_name = platform.system() os_version = platform.version() self.logger.info(f"Operating System: {os_name} {os_version}") system_info = platform.uname() - self.logger.info( - f"System: {system_info.system} Node: {system_info.node} Release: {system_info.release} Machine: {system_info.machine} Proc: {system_info.processor}" - ) + self.logger.info(f"System: {system_info.system} Node: {system_info.node} Release: {system_info.release} Machine: {system_info.machine} Proc: {system_info.processor}") python_version = platform.python_version() self.logger.info(f"Python Version: {python_version}") + def log_onnxruntime_packages(self): + """ + This method logs the ONNX Runtime package versions, including the GPU and Silicon packages if available. + """ onnxruntime_gpu_package = self.get_package_distribution("onnxruntime-gpu") + onnxruntime_silicon_package = self.get_package_distribution("onnxruntime-silicon") + onnxruntime_cpu_package = self.get_package_distribution("onnxruntime") + if onnxruntime_gpu_package is not None: self.logger.info(f"ONNX Runtime GPU package installed with version: {onnxruntime_gpu_package.version}") - - onnxruntime_silicon_package = self.get_package_distribution("onnxruntime-silicon") if onnxruntime_silicon_package is not None: self.logger.info(f"ONNX Runtime Silicon package installed with version: {onnxruntime_silicon_package.version}") - - onnxruntime_cpu_package = self.get_package_distribution("onnxruntime") if onnxruntime_cpu_package is not None: self.logger.info(f"ONNX Runtime CPU package installed with version: {onnxruntime_cpu_package.version}") - torch_package = self.get_package_distribution("torch") - if torch_package is not None: - self.logger.info(f"Torch package installed with version: {torch_package.version}") - - torchvision_package = self.get_package_distribution("torchvision") - if torchvision_package is not None: - self.logger.info(f"Torchvision package installed with version: {torchvision_package.version}") - - torchaudio_package = self.get_package_distribution("torchaudio") - if torchaudio_package is not None: - self.logger.info(f"Torchaudio package installed with version: {torchaudio_package.version}") - - ort_device = ort.get_device() + def setup_torch_device(self): + """ + This method sets up the PyTorch and/or ONNX Runtime inferencing device, using GPU hardware acceleration if available. 
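+        CUDA is preferred when available, then Apple Silicon MPS; otherwise both Torch and ONNX Runtime fall back to CPU.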
+ """ + hardware_acceleration_enabled = False ort_providers = ort.get_available_providers() - self.cpu = torch.device("cpu") - hardware_acceleration_enabled = False + self.torch_device_cpu = torch.device("cpu") - # Prepare for hardware-accelerated inference by validating both Torch and ONNX Runtime support either CUDA or CoreML if torch.cuda.is_available(): - self.logger.info("CUDA is available in Torch, setting Torch device to CUDA") - self.device = torch.device("cuda") - - if onnxruntime_gpu_package is not None and ort_device == "GPU" and "CUDAExecutionProvider" in ort_providers: - self.logger.info("ONNXruntime has CUDAExecutionProvider available, enabling acceleration") - self.onnx_execution_provider = ["CUDAExecutionProvider"] - hardware_acceleration_enabled = True - else: - self.logger.warning("CUDAExecutionProvider not available in ONNXruntime, so acceleration will NOT be enabled") - self.logger.warning("If you expect CUDA to work with your GPU, try pip install --force-reinstall onnxruntime-gpu") - else: - self.logger.debug("CUDA not available in Torch installation. If you expect GPU/CUDA support to work, please see README") + self.configure_cuda(ort_providers) + hardware_acceleration_enabled = True + elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + self.configure_mps(ort_providers) + hardware_acceleration_enabled = True - if onnxruntime_silicon_package is not None and hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): - self.logger.info("Apple Silicon MPS/CoreML is available in Torch, setting Torch device to MPS") + if not hardware_acceleration_enabled: + self.logger.info("No hardware acceleration could be configured, running in CPU mode") + self.torch_device = self.torch_device_cpu + self.onnx_execution_provider = ["CPUExecutionProvider"] - # TODO: Change this to use MPS once FFTs are supported, see https://github.com/pytorch/pytorch/issues/78044 - # self.device = torch.device("mps") + def configure_cuda(self, ort_providers): + """ + This method configures the CUDA device for PyTorch and ONNX Runtime, if available. + """ + self.logger.info("CUDA is available in Torch, setting Torch device to CUDA") + self.torch_device = torch.device("cuda") + if "CUDAExecutionProvider" in ort_providers: + self.logger.info("ONNXruntime has CUDAExecutionProvider available, enabling acceleration") + self.onnx_execution_provider = ["CUDAExecutionProvider"] + else: + self.logger.warning("CUDAExecutionProvider not available in ONNXruntime, so acceleration will NOT be enabled") - self.logger.warning("Torch MPS backend does not yet support FFT operations, Torch will still use CPU!") - self.logger.warning("To track progress towards Apple Silicon acceleration, see https://github.com/pytorch/pytorch/issues/78044") - self.device = torch.device("cpu") + def configure_mps(self, ort_providers): + """ + This method configures the Apple Silicon MPS/CoreML device for PyTorch and ONNX Runtime, if available. 
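setup_torch_device, configure_cuda above and configure_mps just below pair a torch.device with a matching ONNX Runtime execution provider, falling back to CPU when neither CUDA nor MPS is usable. A condensed sketch of that decision, assuming torch and onnxruntime are importable; unlike the diff's version, this sketch simply falls back to the CPU provider where the real code leaves the provider unset and logs a warning:

```python
import torch
import onnxruntime as ort

def pick_devices():
    """Return (torch_device, onnx_providers), preferring CUDA, then Apple MPS, then CPU."""
    providers = ort.get_available_providers()
    if torch.cuda.is_available():
        onnx = ["CUDAExecutionProvider"] if "CUDAExecutionProvider" in providers else ["CPUExecutionProvider"]
        return torch.device("cuda"), onnx
    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        onnx = ["CoreMLExecutionProvider"] if "CoreMLExecutionProvider" in providers else ["CPUExecutionProvider"]
        return torch.device("mps"), onnx
    return torch.device("cpu"), ["CPUExecutionProvider"]

device, providers = pick_devices()
print(device, providers)
```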
+ """ + self.logger.info("Apple Silicon MPS/CoreML is available in Torch, setting Torch device to MPS") + self.torch_device_mps = torch.device("mps") - if "CoreMLExecutionProvider" in ort_providers: - self.logger.info("ONNXruntime has CoreMLExecutionProvider available, enabling acceleration") - self.onnx_execution_provider = ["CoreMLExecutionProvider"] - hardware_acceleration_enabled = True - else: - self.logger.warning("CoreMLExecutionProvider not available in ONNXruntime, so acceleration will NOT be enabled") - self.logger.warning("If you expect MPS/CoreML to work with your Mac, try pip install --force-reinstall onnxruntime-silicon") - else: - self.logger.debug("Apple Silicon MPS/CoreML not available in Torch installation. If you expect this to work, please see README") + self.torch_device = self.torch_device_mps - if not hardware_acceleration_enabled: - self.logger.info("No hardware acceleration could be configured, running in CPU mode") - self.device = torch.device("cpu") - self.onnx_execution_provider = ["CPUExecutionProvider"] + if "CoreMLExecutionProvider" in ort_providers: + self.logger.info("ONNXruntime has CoreMLExecutionProvider available, enabling acceleration") + self.onnx_execution_provider = ["CoreMLExecutionProvider"] + else: + self.logger.warning("CoreMLExecutionProvider not available in ONNXruntime, so acceleration will NOT be enabled") def get_package_distribution(self, package_name): + """ + This method returns the package distribution for a given package name if installed, or None otherwise. + """ try: return metadata.distribution(package_name) except metadata.PackageNotFoundError: @@ -208,15 +247,30 @@ def get_package_distribution(self, package_name): return None def get_model_hash(self, model_path): + """ + This method returns the MD5 hash of a given model file. + """ + + self.logger.debug(f"Attempting to calculate hash of model file {model_path}") try: + # Open the model file in binary read mode with open(model_path, "rb") as f: + # Move the file pointer 10MB before the end of the file f.seek(-10000 * 1024, 2) + # Read the file from the current pointer to the end and calculate its MD5 hash return hashlib.md5(f.read()).hexdigest() - except: + except IOError as e: + # If an IOError occurs (e.g., if the file is smaller than 10MB), log the error + self.logger.error(f"IOError seeking -10MB or reading model file for hash calculation: {e}") + # Attempt to open the file again, read its entire content, and calculate the MD5 hash return hashlib.md5(open(model_path, "rb").read()).hexdigest() def download_file(self, url, output_path): - response = requests.get(url, stream=True) + """ + This method downloads a file from a given URL to a given output path. + """ + self.logger.debug(f"Downloading file from {url} to {output_path} with timeout 300s") + response = requests.get(url, stream=True, timeout=300) if response.status_code == 200: with open(output_path, "wb") as f: @@ -225,191 +279,269 @@ def download_file(self, url, output_path): else: self.logger.error(f"Failed to download file from {url}") - def final_process(self, stem_path, source, stem_name, sample_rate): - self.logger.debug(f"Finalizing {stem_name} stem processing and writing audio...") - self.write_audio(stem_path, source, sample_rate, stem_name=stem_name) - - return {stem_name: source} - def clear_gpu_cache(self): + """ + This method clears the GPU cache to free up memory.
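get_model_hash above identifies a model by MD5-hashing only its last 10000 * 1024 bytes (roughly 10 MB), falling back to hashing the whole file when it is smaller than that; the resulting hash is the key used in the UVR model data files. An equivalent standalone sketch (the example path is hypothetical):

```python
import hashlib
import os

def model_file_hash(path: str, tail_bytes: int = 10000 * 1024) -> str:
    """MD5 of the last `tail_bytes` of a file; hash the whole file if it is smaller."""
    with open(path, "rb") as f:
        try:
            f.seek(-tail_bytes, os.SEEK_END)  # jump ~10MB back from the end of the file
        except OSError:                       # file is shorter than tail_bytes
            f.seek(0)
        return hashlib.md5(f.read()).hexdigest()

# Usage (hypothetical filename): model_file_hash("UVR-MDX-NET-Inst_HQ_3.onnx")
```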
+ """ self.logger.debug("Running garbage collection...") gc.collect() - if self.device == torch.device("mps"): + if self.torch_device == torch.device("mps"): self.logger.debug("Clearing MPS cache...") torch.mps.empty_cache() - if self.device == torch.device("cuda"): + if self.torch_device == torch.device("cuda"): self.logger.debug("Clearing CUDA cache...") torch.cuda.empty_cache() - def load_model(self, model_name="UVR-MDX-NET-Inst_HQ_3"): - self.logger.info(f"Loading model {model_name}...") - - self.load_model_start_time = time.perf_counter() - - self.model_name = model_name - self.model_url = f"https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/{self.model_name}.onnx" - self.model_data_url = "https://raw.githubusercontent.com/TRvlvr/application_data/main/mdx_model_data/model_data.json" + def list_supported_model_files(self): + """ + This method lists the supported model files for audio-separator, by fetching the same file UVR uses to list these. + """ + download_checks_path = os.path.join(self.model_file_dir, "download_checks.json") + + if not os.path.isfile(download_checks_path): + self.download_file("https://raw.githubusercontent.com/TRvlvr/application_data/main/filelists/download_checks.json", download_checks_path) + + model_downloads_list = json.load(open(download_checks_path, encoding="utf-8")) + self.logger.debug(f"Model download list loaded: {model_downloads_list}") + + # model_downloads_list JSON structure / example snippet: + # { + # "vr_download_list": { + # "VR Arch Single Model v5: 1_HP-UVR": "1_HP-UVR.pth", + # "VR Arch Single Model v5: UVR-DeNoise by FoxJoy": "UVR-DeNoise.pth", + # }, + # "mdx_download_list": { + # "MDX-Net Model: UVR-MDX-NET Inst HQ 3": "UVR-MDX-NET-Inst_HQ_3.onnx", + # "MDX-Net Model: UVR-MDX-NET Karaoke 2": "UVR_MDXNET_KARA_2.onnx", + # "MDX-Net Model: Kim Vocal 2": "Kim_Vocal_2.onnx", + # "MDX-Net Model: kuielab_b_drums": "kuielab_b_drums.onnx" + # }, + # "demucs_download_list": { + # "Demucs v4: htdemucs_ft": { + # "f7e0c4bc-ba3fe64a.th": "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/f7e0c4bc-ba3fe64a.th", + # "d12395a8-e57c48e6.th": "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/d12395a8-e57c48e6.th", + # "92cfc3b6-ef3bcb9c.th": "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/92cfc3b6-ef3bcb9c.th", + # "04573f0d-f3cf25b2.th": "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/04573f0d-f3cf25b2.th", + # "htdemucs_ft.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/htdemucs_ft.yaml" + # }, + # "Demucs v4: htdemucs": { + # "955717e8-8726e21a.th": "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/955717e8-8726e21a.th", + # "htdemucs.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/htdemucs.yaml" + # }, + # "Demucs v1: tasnet": { + # "tasnet.th": "https://dl.fbaipublicfiles.com/demucs/v2.0/tasnet.th" + # }, + # }, + # "mdx23_download_list": { + # "MDX23C Model: MDX23C_D1581": { + # "MDX23C_D1581.ckpt": "model_2_stem_061321.yaml" + # } + # }, + # "mdx23c_download_list": { + # "MDX23C Model: MDX23C-InstVoc HQ": { + # "MDX23C-8KFFT-InstVoc_HQ.ckpt": "model_2_stem_full_band_8k.yaml" + # } + # } + # } + + # Return object with list of model names, which are the keys in vr_download_list, mdx_download_list, demucs_download_list, mdx23_download_list, mdx23c_download_list, grouped by type: VR, MDX, Demucs, MDX23, MDX23C + model_files_grouped = { + "VR": model_downloads_list["vr_download_list"], + "MDX": 
model_downloads_list["mdx_download_list"], + # "Demucs": list(model_downloads_list["demucs_download_list"].keys()), + # "MDX23": list(model_downloads_list["mdx23_download_list"].keys()), + # "MDX23C": list(model_downloads_list["mdx23c_download_list"].keys()) + } + return model_files_grouped + + def load_model(self, model_filename="2_HP-UVR.pth"): + """ + This method loads the separation model into memory, downloading it first if necessary. + """ + self.logger.info(f"Loading model {model_filename}...") + + load_model_start_time = time.perf_counter() + + # Model data and configuration sources from UVR + model_repo_url_prefix = "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models" + model_data_url_prefix = "https://raw.githubusercontent.com/TRvlvr/application_data/main" + vr_model_data_url = f"{model_data_url_prefix}/vr_model_data/model_data_new.json" + mdx_model_data_url = f"{model_data_url_prefix}/mdx_model_data/model_data_new.json" # Setting up the model path - model_path = os.path.join(self.model_file_dir, f"{self.model_name}.onnx") + model_name = model_filename.split(".")[0] + model_path = os.path.join(self.model_file_dir, f"{model_filename}") self.logger.debug(f"Model path set to {model_path}") # Check if model file exists, if not, download it if not os.path.isfile(model_path): self.logger.debug(f"Model not found at path {model_path}, downloading...") - self.download_file(self.model_url, model_path) + self.download_file(f"{model_repo_url_prefix}/{model_filename}", model_path) - # Reading model settings from the downloaded model - self.logger.debug("Reading model settings...") + # Calculating hash for the downloaded model + self.logger.debug("Calculating MD5 hash for model file to identify model parameters from UVR data...") model_hash = self.get_model_hash(model_path) self.logger.debug(f"Model {model_path} has hash {model_hash}") # Setting up the path for model data and checking its existence - model_data_path = os.path.join(self.model_file_dir, "model_data.json") - self.logger.debug(f"Model data path set to {model_data_path}") - if not os.path.isfile(model_data_path): - self.logger.debug(f"Model data not found at path {model_data_path}, downloading...") - self.download_file(self.model_data_url, model_data_path) + vr_model_data_path = os.path.join(self.model_file_dir, "vr_model_data.json") + self.logger.debug(f"VR model data path set to {vr_model_data_path}") + if not os.path.isfile(vr_model_data_path): + self.logger.debug(f"VR model data not found at path {vr_model_data_path}, downloading...") + self.download_file(vr_model_data_url, vr_model_data_path) + + mdx_model_data_path = os.path.join(self.model_file_dir, "mdx_model_data.json") + self.logger.debug(f"MDX model data path set to {mdx_model_data_path}") + if not os.path.isfile(mdx_model_data_path): + self.logger.debug(f"MDX model data not found at path {mdx_model_data_path}, downloading...") + self.download_file(mdx_model_data_url, mdx_model_data_path) # Loading model data - self.logger.debug("Loading model data...") - model_data_object = json.load(open(model_data_path)) - model_data = model_data_object[model_hash] + self.logger.debug("Loading MDX and VR model parameters from UVR model data files...") + vr_model_data_object = json.load(open(vr_model_data_path, encoding="utf-8")) + mdx_model_data_object = json.load(open(mdx_model_data_path, encoding="utf-8")) + + # vr_model_data_object JSON structure / example snippet: + # { + # "0d0e6d143046b0eecc41a22e60224582": { + # "vr_model_param": "3band_44100_mid", + # 
"primary_stem": "Instrumental" + # }, + # "6b5916069a49be3fe29d4397ecfd73fa": { + # "vr_model_param": "3band_44100_msb2", + # "primary_stem": "Instrumental", + # "is_karaoke": true + # }, + # "0ec76fd9e65f81d8b4fbd13af4826ed8": { + # "vr_model_param": "4band_v3", + # "primary_stem": "No Woodwinds" + # }, + # "0fb9249ffe4ffc38d7b16243f394c0ff": { + # "vr_model_param": "4band_v3", + # "primary_stem": "No Reverb" + # }, + # "6857b2972e1754913aad0c9a1678c753": { + # "vr_model_param": "4band_v3", + # "primary_stem": "No Echo", + # "nout": 48, + # "nout_lstm": 128 + # }, + # "944950a9c5963a5eb70b445d67b7068a": { + # "vr_model_param": "4band_v3_sn", + # "primary_stem": "Vocals", + # "nout": 64, + # "nout_lstm": 128, + # "is_karaoke": false, + # "is_bv_model": true, + # "is_bv_model_rebalanced": 0.9 + # } + # } + + # mdx_model_data_object JSON structure / example snippet: + # { + # "0ddfc0eb5792638ad5dc27850236c246": { + # "compensate": 1.035, + # "mdx_dim_f_set": 2048, + # "mdx_dim_t_set": 8, + # "mdx_n_fft_scale_set": 6144, + # "primary_stem": "Vocals" + # }, + # "26d308f91f3423a67dc69a6d12a8793d": { + # "compensate": 1.035, + # "mdx_dim_f_set": 2048, + # "mdx_dim_t_set": 9, + # "mdx_n_fft_scale_set": 8192, + # "primary_stem": "Other" + # }, + # "2cdd429caac38f0194b133884160f2c6": { + # "compensate": 1.045, + # "mdx_dim_f_set": 3072, + # "mdx_dim_t_set": 8, + # "mdx_n_fft_scale_set": 7680, + # "primary_stem": "Instrumental" + # }, + # "2f5501189a2f6db6349916fabe8c90de": { + # "compensate": 1.035, + # "mdx_dim_f_set": 2048, + # "mdx_dim_t_set": 8, + # "mdx_n_fft_scale_set": 6144, + # "primary_stem": "Vocals", + # "is_karaoke": true + # }, + # "2154254ee89b2945b97a7efed6e88820": { + # "config_yaml": "model_2_stem_061321.yaml" + # }, + # "116f6f9dabb907b53d847ed9f7a9475f": { + # "config_yaml": "model_2_stem_full_band_8k.yaml" + # } + # } + + if model_hash in mdx_model_data_object: + model_data = mdx_model_data_object[model_hash] + model_type = "MDX" + elif model_hash in vr_model_data_object: + model_data = vr_model_data_object[model_hash] + model_type = "VR" + else: + raise ValueError(f"Unsupported Model File: parameters for MD5 hash {model_hash} could not be found in the UVR model data file.") + self.logger.debug(f"Model data loaded: {model_data}") - # Initializing model parameters - self.compensate, self.dim_f, self.dim_t, self.n_fft, self.model_primary_stem = ( - model_data["compensate"], - model_data["mdx_dim_f_set"], - 2 ** model_data["mdx_dim_t_set"], - model_data["mdx_n_fft_scale_set"], - model_data["primary_stem"], - ) - self.model_secondary_stem = "Vocals" if self.model_primary_stem == "Instrumental" else "Instrumental" - - # In UVR, these variables are set but either aren't useful or are better handled in audio-separator. - # Leaving these comments explaining to help myself or future developers understand why these aren't in audio-separator. - - # "chunks" is not actually used for anything in UVR... - # self.chunks = 0 - - # "adjust" is hard-coded to 1 in UVR, and only used as a multiplier in run_model, so it does nothing. - # self.adjust = 1 - - # "hop" is hard-coded to 1024 in UVR. We have a "hop_length" parameter instead - # self.hop = 1024 - - # "margin" maps to sample rate and is set from the GUI in UVR (default: 44100). We have a "sample_rate" parameter instead. - # self.margin = 44100 - - # "dim_c" is hard-coded to 4 in UVR, seems to be a parameter for the number of channels, and is only used for checkpoint models. 
- # We haven't implemented support for the checkpoint models here, so we're not using it. - # self.dim_c = 4 - - self.logger.debug(f"Model params: primary_stem={self.model_primary_stem}, secondary_stem={self.model_secondary_stem}") - self.logger.debug( - f"Model params: batch_size={self.batch_size}, compensate={self.compensate}, segment_size={self.segment_size}, dim_f={self.dim_f}, dim_t={self.dim_t}" - ) - self.logger.debug(f"Model params: n_fft={self.n_fft}, hop={self.hop_length}") - - # Loading the model for inference - self.logger.debug("Loading ONNX model for inference...") - if self.segment_size == self.dim_t: - ort_ = ort.InferenceSession(model_path, providers=self.onnx_execution_provider) - self.model_run = lambda spek: ort_.run(None, {"input": spek.cpu().numpy()})[0] - self.logger.debug("Model loaded successfully using ONNXruntime inferencing session.") + common_params = { + "logger": self.logger, + "torch_device": self.torch_device, + "torch_device_cpu": self.torch_device_cpu, + "torch_device_mps": self.torch_device_mps, + "onnx_execution_provider": self.onnx_execution_provider, + "model_name": model_name, + "model_path": model_path, + "model_data": model_data, + "primary_stem_output_path": self.primary_stem_output_path, + "secondary_stem_output_path": self.secondary_stem_output_path, + "output_format": self.output_format, + "output_dir": self.output_dir, + "normalization_threshold": self.normalization_threshold, + "enable_denoise": self.enable_denoise, + "output_single_stem": self.output_single_stem, + "invert_using_spec": self.invert_using_spec, + "sample_rate": self.sample_rate, + } + + if model_type == "MDX": + self.model_instance = MDXSeparator(common_config=common_params, arch_config=self.arch_specific_params["MDX"]) + elif model_type == "VR": + self.model_instance = VRSeparator(common_config=common_params, arch_config=self.arch_specific_params["VR"]) else: - self.model_run = convert(model_path) - self.model_run.to(self.device).eval() - self.logger.warning("Model converted from onnx to pytorch due to segment size not matching dim_t, processing may be slower.") + raise ValueError(f"Unsupported model type: {model_type}") - # Log the completion of the separation process + # Log the completion of the model load process self.logger.debug("Loading model completed.") - self.logger.info( - f'Load model duration: {time.strftime("%H:%M:%S", time.gmtime(int(time.perf_counter() - self.load_model_start_time)))}' - ) + self.logger.info(f'Load model duration: {time.strftime("%H:%M:%S", time.gmtime(int(time.perf_counter() - load_model_start_time)))}') def separate(self, audio_file_path): + """ + Separates the audio file into different stems (e.g., vocals, instruments) using the loaded model. + + This method takes the path to an audio file, processes it through the loaded separation model, and returns + the paths to the output files containing the separated audio stems. It handles the entire flow from loading + the audio, running the separation, clearing up resources, and logging the process. + + Parameters: + - audio_file_path (str): The path to the audio file to be separated. + + Returns: + - output_files (list of str): A list containing the paths to the separated audio stem files. 
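load_model above resolves the architecture by looking the file hash up first in the MDX model data and then in the VR model data, then dispatches to MDXSeparator or VRSeparator. A standalone sketch of that lookup, assuming the two JSON files have already been downloaded under the names used in the diff; the example hash and values are taken from the snippets shown above:

```python
import json

def resolve_model_params(model_hash: str, mdx_json_path: str, vr_json_path: str):
    """Return (model_type, params) for a model hash, mirroring the MDX-then-VR lookup."""
    with open(mdx_json_path, encoding="utf-8") as f:
        mdx_data = json.load(f)
    with open(vr_json_path, encoding="utf-8") as f:
        vr_data = json.load(f)

    if model_hash in mdx_data:
        return "MDX", mdx_data[model_hash]
    if model_hash in vr_data:
        return "VR", vr_data[model_hash]
    raise ValueError(f"Unsupported model: no parameters found for hash {model_hash}")

# e.g. resolve_model_params("2cdd429caac38f0194b133884160f2c6", "mdx_model_data.json", "vr_model_data.json")
# -> ("MDX", {"compensate": 1.045, "mdx_dim_f_set": 3072, ...})
```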
+ """ # Starting the separation process self.logger.info(f"Starting separation process for audio_file_path: {audio_file_path}") - self.separate_start_time = time.perf_counter() - - self.primary_source = None - self.secondary_source = None + separate_start_time = time.perf_counter() - self.audio_file_path = audio_file_path - self.audio_file_base = os.path.splitext(os.path.basename(audio_file_path))[0] - - # Prepare the mix for processing - self.logger.debug("Preparing mix...") - mix = self.prepare_mix(self.audio_file_path) - - self.logger.debug("Normalizing mix before demixing...") - mix = spec_utils.normalize(self.logger, wave=mix, max_peak=self.normalization_threshold) - - # Start the demixing process - source = self.demix(mix) - - # In UVR, the source is cached here if it's a vocal split model, but we're not supporting that yet - - # Initialize the list for output files - output_files = [] - self.logger.debug("Processing output files...") - - # Normalize and transpose the primary source if it's not already an array - if not isinstance(self.primary_source, np.ndarray): - self.logger.debug("Normalizing primary source...") - self.primary_source = spec_utils.normalize(self.logger, wave=source, max_peak=self.normalization_threshold).T - - # Process the secondary source if not already an array - if not isinstance(self.secondary_source, np.ndarray): - self.logger.debug("Producing secondary source: demixing in match_mix mode") - raw_mix = self.demix(mix, is_match_mix=True) - - if self.invert_using_spec: - self.logger.debug("Inverting secondary stem using spectogram as invert_using_spec is set to True") - self.secondary_source = spec_utils.invert_stem(raw_mix, source) - else: - self.logger.debug("Inverting secondary stem by subtracting of transposed demixed stem from transposed original mix") - self.secondary_source = mix.T - source.T - - # Save and process the secondary stem if needed - if not self.output_single_stem or self.output_single_stem.lower() == self.model_secondary_stem.lower(): - self.logger.info(f"Saving {self.model_secondary_stem} stem...") - if not self.secondary_stem_path: - self.secondary_stem_path = os.path.join( - f"{self.audio_file_base}_({self.model_secondary_stem})_{self.model_name}.{self.output_format.lower()}" - ) - self.secondary_source_map = self.final_process( - self.secondary_stem_path, self.secondary_source, self.model_secondary_stem, self.sample_rate - ) - output_files.append(self.secondary_stem_path) - - # Save and process the primary stem if needed - if not self.output_single_stem or self.output_single_stem.lower() == self.model_primary_stem.lower(): - self.logger.info(f"Saving {self.model_primary_stem} stem...") - if not self.primary_stem_path: - self.primary_stem_path = os.path.join( - f"{self.audio_file_base}_({self.model_primary_stem})_{self.model_name}.{self.output_format.lower()}" - ) - if not isinstance(self.primary_source, np.ndarray): - self.primary_source = source.T - self.primary_source_map = self.final_process( - self.primary_stem_path, self.primary_source, self.model_primary_stem, self.sample_rate - ) - output_files.append(self.primary_stem_path) + # Run separation method for the loaded model + output_files = self.model_instance.separate(audio_file_path) # Clear GPU cache to free up memory self.clear_gpu_cache() - # TODO: In UVR, this is where the vocal split chain gets processed - see process_vocal_split_chain() - - # Log the completion of the separation process - self.logger.debug("Separation process completed.") - self.logger.info( - f'Separation 
duration: {time.strftime("%H:%M:%S", time.gmtime(int(time.perf_counter() - self.separate_start_time)))}' - ) - # Unset the audio file to prevent accidental re-separation of the same file self.logger.debug("Clearing audio file...") self.audio_file_path = None @@ -419,324 +551,11 @@ def separate(self, audio_file_path): self.logger.debug("Clearing sources and stems...") self.primary_source = None self.secondary_source = None - self.primary_stem_path = None - self.secondary_stem_path = None + self.primary_stem_output_path = None + self.secondary_stem_output_path = None - return output_files - - def write_audio(self, stem_path: str, stem_source, sample_rate, stem_name=None): - self.logger.debug(f"Entering write_audio with stem_name: {stem_name} and stem_path: {stem_path}") - - stem_source = spec_utils.normalize(self.logger, wave=stem_source, max_peak=self.normalization_threshold) - - # Check if the numpy array is empty or contains very low values - if np.max(np.abs(stem_source)) < 1e-6: - self.logger.warning("Warning: stem_source array is near-silent or empty.") - return - - # If output_dir is specified, create it and join it with stem_path - if self.output_dir: - os.makedirs(self.output_dir, exist_ok=True) - stem_path = os.path.join(self.output_dir, stem_path) - - self.logger.debug(f"Audio data shape before processing: {stem_source.shape}") - self.logger.debug(f"Data type before conversion: {stem_source.dtype}") - - # Ensure the audio data is in the correct format (e.g., int16) - if stem_source.dtype != np.int16: - stem_source = (stem_source * 32767).astype(np.int16) - self.logger.debug("Converted stem_source to int16.") - - # Correctly interleave stereo channels - stem_source_interleaved = np.empty((2 * stem_source.shape[0],), dtype=np.int16) - stem_source_interleaved[0::2] = stem_source[:, 0] # Left channel - stem_source_interleaved[1::2] = stem_source[:, 1] # Right channel - - self.logger.debug(f"Interleaved audio data shape: {stem_source_interleaved.shape}") + # Log the completion of the separation process + self.logger.debug("Separation process completed.") + self.logger.info(f'Separation duration: {time.strftime("%H:%M:%S", time.gmtime(int(time.perf_counter() - separate_start_time)))}') - # Create a pydub AudioSegment - try: - audio_segment = AudioSegment( - stem_source_interleaved.tobytes(), frame_rate=self.sample_rate, sample_width=stem_source.dtype.itemsize, channels=2 - ) - self.logger.debug("Created AudioSegment successfully.") - except Exception as e: - self.logger.error(f"Error creating AudioSegment: {e}") - return - - # Determine file format based on the file extension - file_format = stem_path.lower().split(".")[-1] - - # For m4a files, specify mp4 as the container format as the extension doesn't match the format name - if file_format == "m4a": - file_format = "mp4" - elif file_format == "mka": - file_format = "matroska" - - # Export using the determined format - try: - audio_segment.export(stem_path, format=file_format) - self.logger.debug(f"Exported audio file successfully to {stem_path}") - except Exception as e: - self.logger.error(f"Error exporting audio file: {e}") - - # This function sets up the necessary parameters for the model, like the number of frequency bins (n_bins), the trimming size (trim), - # the size of each audio chunk (chunk_size), and the window function for spectral transformations (window). - # It ensures that the model is configured with the correct settings for processing the audio data. 
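The comment above spells out how the initialize_model_settings logic (shown next) derives its chunking geometry from n_fft, hop_length and segment_size. Plugging in a typical MDX parameter set from the model data earlier in this diff (n_fft=6144, hop_length=1024, and segment_size equal to dim_t = 2**8 = 256, the ONNX fast path), the values work out as below; a quick check using the same formulas:

```python
n_fft, hop_length, segment_size = 6144, 1024, 256  # values taken from the MDX examples above

n_bins = n_fft // 2 + 1                       # 3073 frequency bins
trim = n_fft // 2                             # 3072 samples trimmed from each end
chunk_size = hop_length * (segment_size - 1)  # 261120 samples per chunk
gen_size = chunk_size - 2 * trim              # 254976 usable samples per chunk

print(n_bins, trim, chunk_size, gen_size)
```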
- def initialize_model_settings(self): - self.logger.debug("Initializing model settings...") - - # n_bins is half the FFT size plus one (self.n_fft // 2 + 1). - self.n_bins = self.n_fft // 2 + 1 - - # trim is half the FFT size (self.n_fft // 2). - self.trim = self.n_fft // 2 - - # chunk_size is the hop_length size times the segment size minus one - self.chunk_size = self.hop_length * (self.segment_size - 1) - - # gen_size is the chunk size minus twice the trim size - self.gen_size = self.chunk_size - 2 * self.trim - - self.stft = STFT(self.logger, self.n_fft, self.hop_length, self.dim_f, self.device) - - self.logger.debug(f"Model input params: n_fft={self.n_fft} hop_length={self.hop_length} dim_f={self.dim_f}") - self.logger.debug(f"Model settings: n_bins={self.n_bins}, trim={self.trim}, chunk_size={self.chunk_size}, gen_size={self.gen_size}") - - # After prepare_mix segments the audio, initialize_mix further processes each segment. - # It ensures each audio segment is in the correct format for the model, applies necessary padding, - # and converts the segments into tensors for processing with the model. - # This step is essential for preparing the audio data in a format that the neural network can process. - def initialize_mix(self, mix, is_ckpt=False): - # Log the initialization of the mix and whether checkpoint mode is used - self.logger.debug(f"Initializing mix with is_ckpt={is_ckpt}. Initial mix shape: {mix.shape}") - - # Ensure the mix is a 2-channel (stereo) audio signal - if mix.shape[0] != 2: - error_message = f"Expected a 2-channel audio signal, but got {mix.shape[0]} channels" - self.logger.error(error_message) - raise ValueError(error_message) - - # If in checkpoint mode, process the mix differently - if is_ckpt: - self.logger.debug("Processing in checkpoint mode...") - # Calculate padding based on the generation size and trim - pad = self.gen_size + self.trim - (mix.shape[-1] % self.gen_size) - self.logger.debug(f"Padding calculated: {pad}") - # Add padding at the beginning and the end of the mix - mixture = np.concatenate((np.zeros((2, self.trim), dtype="float32"), mix, np.zeros((2, pad), dtype="float32")), 1) - # Determine the number of chunks based on the mixture's length - num_chunks = mixture.shape[-1] // self.gen_size - self.logger.debug(f"Mixture shape after padding: {mixture.shape}, Number of chunks: {num_chunks}") - # Split the mixture into chunks - mix_waves = [mixture[:, i * self.gen_size : i * self.gen_size + self.chunk_size] for i in range(num_chunks)] - else: - # If not in checkpoint mode, process normally - self.logger.debug("Processing in non-checkpoint mode...") - mix_waves = [] - n_sample = mix.shape[1] - # Calculate necessary padding to make the total length divisible by the generation size - pad = self.gen_size - n_sample % self.gen_size - self.logger.debug(f"Number of samples: {n_sample}, Padding calculated: {pad}") - # Apply padding to the mix - mix_p = np.concatenate((np.zeros((2, self.trim)), mix, np.zeros((2, pad)), np.zeros((2, self.trim))), 1) - self.logger.debug(f"Shape of mix after padding: {mix_p.shape}") - - # Process the mix in chunks - i = 0 - while i < n_sample + pad: - waves = np.array(mix_p[:, i : i + self.chunk_size]) - mix_waves.append(waves) - self.logger.debug(f"Processed chunk {len(mix_waves)}: Start {i}, End {i + self.chunk_size}") - i += self.gen_size - - # Convert the list of wave chunks into a tensor for processing on the specified device - mix_waves_tensor = torch.tensor(mix_waves, dtype=torch.float32).to(self.device) - 
self.logger.debug(f"Converted mix_waves to tensor. Tensor shape: {mix_waves_tensor.shape}") - - return mix_waves_tensor, pad - - def demix(self, mix, is_match_mix=False): - self.logger.debug(f"Starting demixing process with is_match_mix: {is_match_mix}...") - self.initialize_model_settings() - - # Preserves the original mix for later use. - # In UVR, this is used for the pitch fix and VR denoise processes, which aren't yet implemented here. - org_mix = mix - self.logger.debug(f"Original mix stored. Shape: {org_mix.shape}") - - # Initializes a list to store the separated waveforms. - tar_waves_ = [] - - # Handling different chunk sizes and overlaps based on the matching requirement. - if is_match_mix: - # Sets a smaller chunk size specifically for matching the mix. - chunk_size = self.hop_length * (self.segment_size - 1) - # Sets a small overlap for the chunks. - overlap = 0.02 - self.logger.debug(f"Chunk size for matching mix: {chunk_size}, Overlap: {overlap}") - else: - # Uses the regular chunk size defined in model settings. - chunk_size = self.chunk_size - # Uses the overlap specified in the model settings. - overlap = self.overlap - self.logger.debug(f"Standard chunk size: {chunk_size}, Overlap: {overlap}") - - # Calculates the generated size after subtracting the trim from both ends of the chunk. - gen_size = chunk_size - 2 * self.trim - self.logger.debug(f"Generated size calculated: {gen_size}") - - # Calculates padding to make the mix length a multiple of the generated size. - pad = gen_size + self.trim - ((mix.shape[-1]) % gen_size) - # Prepares the mixture with padding at the beginning and the end. - mixture = np.concatenate((np.zeros((2, self.trim), dtype="float32"), mix, np.zeros((2, pad), dtype="float32")), 1) - self.logger.debug(f"Mixture prepared with padding. Mixture shape: {mixture.shape}") - - # Calculates the step size for processing chunks based on the overlap. - step = int((1 - overlap) * chunk_size) - self.logger.debug(f"Step size for processing chunks: {step} as overlap is set to {overlap}.") - - # Initializes arrays to store the results and to account for overlap. - result = np.zeros((1, 2, mixture.shape[-1]), dtype=np.float32) - divider = np.zeros((1, 2, mixture.shape[-1]), dtype=np.float32) - - # Initializes counters for processing chunks. - total = 0 - total_chunks = (mixture.shape[-1] + step - 1) // step - self.logger.debug(f"Total chunks to process: {total_chunks}") - - # Processes each chunk of the mixture. - for i in tqdm(range(0, mixture.shape[-1], step),desc="Processing chunk"): - total += 1 - start = i - end = min(i + chunk_size, mixture.shape[-1]) - self.logger.debug(f"Processing chunk {total}/{total_chunks}: Start {start}, End {end}") - - # Handles windowing for overlapping chunks. - chunk_size_actual = end - start - window = None - if overlap != 0: - window = np.hanning(chunk_size_actual) - window = np.tile(window[None, None, :], (1, 2, 1)) - self.logger.debug("Window applied to the chunk.") - - # Zero-pad the chunk to prepare it for processing. - mix_part_ = mixture[:, start:end] - if end != i + chunk_size: - pad_size = (i + chunk_size) - end - mix_part_ = np.concatenate((mix_part_, np.zeros((2, pad_size), dtype="float32")), axis=-1) - - # Converts the chunk to a tensor for processing. - mix_part = torch.tensor([mix_part_], dtype=torch.float32).to(self.device) - # Splits the chunk into smaller batches if necessary. - mix_waves = mix_part.split(self.batch_size) - total_batches = len(mix_waves) - self.logger.debug(f"Mix part split into batches. 
Number of batches: {total_batches}") - - with torch.no_grad(): - # Processes each batch in the chunk. - batches_processed = 0 - for mix_wave in mix_waves: - batches_processed += 1 - self.logger.debug(f"Processing mix_wave batch {batches_processed}/{total_batches}") - - # Runs the model to separate the sources. - tar_waves = self.run_model(mix_wave, is_match_mix=is_match_mix) - - # Applies windowing if needed and accumulates the results. - if window is not None: - tar_waves[..., :chunk_size_actual] *= window - divider[..., start:end] += window - else: - divider[..., start:end] += 1 - - result[..., start:end] += tar_waves[..., : end - start] - - # Normalizes the results by the divider to account for overlap. - self.logger.debug("Normalizing result by dividing result by divider.") - tar_waves = result / divider - tar_waves_.append(tar_waves) - - # Reshapes the results to match the original dimensions. - tar_waves_ = np.vstack(tar_waves_)[:, :, self.trim : -self.trim] - tar_waves = np.concatenate(tar_waves_, axis=-1)[:, : mix.shape[-1]] - - # Extracts the source from the results. - source = tar_waves[:, 0:None] - self.logger.debug(f"Concatenated tar_waves. Shape: {tar_waves.shape}") - - # TODO: In UVR, pitch changing happens here. Consider implementing this as a feature. - - # Compensates the source if not matching the mix. - if not is_match_mix: - source * self.compensate - self.logger.debug("Match mix mode; compensate multiplier applied.") - - # TODO: In UVR, VR denoise model gets applied here. Consider implementing this as a feature. - - self.logger.debug("Demixing process completed.") - return source - - def run_model(self, mix, is_match_mix=False): - # Applying the STFT to the mix. The mix is moved to the specified device (e.g., GPU) before processing. - # self.logger.debug(f"Running STFT on the mix. Mix shape before STFT: {mix.shape}") - spek = self.stft(mix.to(self.device)) - self.logger.debug(f"STFT applied on mix. Spectrum shape: {spek.shape}") - - # Zeroing out the first 3 bins of the spectrum. This is often done to reduce low-frequency noise. - spek[:, :, :3, :] *= 0 - # self.logger.debug("First 3 bins of the spectrum zeroed out.") - - # Handling the case where the mix needs to be matched (is_match_mix = True) - if is_match_mix: - # self.logger.debug("Match mix mode is enabled. Converting spectrum to NumPy array.") - spec_pred = spek.cpu().numpy() - self.logger.debug("is_match_mix: spectrum prediction obtained directly from STFT output.") - else: - # If denoising is enabled, the model is run on both the negative and positive spectrums. - if self.denoise_enabled: - spec_pred = -self.model_run(-spek) * 0.5 + self.model_run(spek) * 0.5 - self.logger.debug("Model run on both negative and positive spectrums for denoising.") - else: - spec_pred = self.model_run(spek) - self.logger.debug("Model run on the spectrum without denoising.") - - # Applying the inverse STFT to convert the spectrum back to the time domain. - result = self.stft.inverse(torch.tensor(spec_pred).to(self.device)).cpu().detach().numpy() - self.logger.debug(f"Inverse STFT applied. Returning result with shape: {result.shape}") - - return result - - def prepare_mix(self, mix): - # Store the original path or the mix itself for later checks - audio_path = mix - - # Check if the input is a file path (string) and needs to be loaded - if not isinstance(mix, np.ndarray): - self.logger.debug(f"Loading audio from file: {mix}") - mix, sr = librosa.load(mix, mono=False, sr=self.sample_rate) - self.logger.debug(f"Audio loaded. 
Sample rate: {sr}, Audio shape: {mix.shape}") - else: - # Transpose the mix if it's already an ndarray (expected shape: [channels, samples]) - self.logger.debug("Transposing the provided mix array.") - mix = mix.T - self.logger.debug(f"Transposed mix shape: {mix.shape}") - - # If the original input was a filepath, check if the loaded mix is empty - if isinstance(audio_path, str): - if not np.any(mix): - error_msg = f"Audio file {audio_path} is empty or not valid" - self.logger.error(error_msg) - raise ValueError(error_msg) - else: - self.logger.debug("Audio file is valid and contains data.") - - # Ensure the mix is in stereo format - if mix.ndim == 1: - self.logger.debug("Mix is mono. Converting to stereo.") - mix = np.asfortranarray([mix, mix]) - self.logger.debug("Converted to stereo mix.") - - # Final log indicating successful preparation of the mix - self.logger.debug("Mix preparation completed.") - return mix + return output_files diff --git a/audio_separator/separator/spec_utils.py b/audio_separator/separator/spec_utils.py deleted file mode 100644 index bf1c954..0000000 --- a/audio_separator/separator/spec_utils.py +++ /dev/null @@ -1,687 +0,0 @@ -import librosa -import numpy as np -import soundfile as sf -import math -import random -import math -import platform -import logging - -OPERATING_SYSTEM = platform.system() -SYSTEM_ARCH = platform.platform() -SYSTEM_PROC = platform.processor() -ARM = "arm" - -if OPERATING_SYSTEM == "Darwin": - wav_resolution = "polyphase" if SYSTEM_PROC == ARM or ARM in SYSTEM_ARCH else "sinc_fastest" -else: - wav_resolution = "sinc_fastest" - -MAX_SPEC = "Max Spec" -MIN_SPEC = "Min Spec" -AVERAGE = "Average" - - -def crop_center(h1, h2): - """ - This function crops the center of the first input tensor to match the size of the second input tensor. - It is used to ensure that the two tensors have the same size in the time dimension. - """ - h1_shape = h1.size() - h2_shape = h2.size() - - # If the time dimensions are already equal, return the first tensor as is - if h1_shape[3] == h2_shape[3]: - return h1 - # If the time dimension of the first tensor is smaller, raise an error - elif h1_shape[3] < h2_shape[3]: - raise ValueError("h1_shape[3] must be greater than h2_shape[3]") - - # Calculate the start and end indices for cropping - s_time = (h1_shape[3] - h2_shape[3]) // 2 - e_time = s_time + h2_shape[3] - # Crop the first tensor - h1 = h1[:, :, :, s_time:e_time] - - return h1 - - -def preprocess(X_spec): - """ - This function preprocesses a spectrogram by separating it into magnitude and phase components. - This is a common preprocessing step in audio processing tasks. - """ - X_mag = np.abs(X_spec) - X_phase = np.angle(X_spec) - - return X_mag, X_phase - - -def make_padding(width, cropsize, offset): - """ - This function calculates the padding needed to make the width of an image divisible by the crop size. - It is used in the process of splitting an image into smaller patches. - """ - left = offset - roi_size = cropsize - offset * 2 - if roi_size == 0: - roi_size = cropsize - right = roi_size - (width % roi_size) + left - - return left, right, roi_size - - -def wave_to_spectrogram(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False): - """ - This function converts a stereo audio waveform into a spectrogram. - It supports several options for processing the stereo channels, such as mid-side processing and reversing. 
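In the deleted spec_utils, the mid_side option of wave_to_spectrogram (implemented just below) averages the two channels into a mid signal and takes their difference as the side signal before the STFT; spectrogram_to_wave later undoes it. A minimal NumPy sketch of that round trip:

```python
import numpy as np

def to_mid_side(wave):
    """wave: (2, n_samples) stereo array -> (mid, side), as in the mid_side branch below."""
    mid = (wave[0] + wave[1]) / 2
    side = wave[0] - wave[1]
    return np.asfortranarray(mid), np.asfortranarray(side)

def from_mid_side(mid, side):
    """Inverse transform: recover the left/right pair from mid/side."""
    left = mid + side / 2
    right = mid - side / 2
    return np.asfortranarray([left, right])

stereo = np.random.default_rng(1).standard_normal((2, 1000))
m, s = to_mid_side(stereo)
assert np.allclose(from_mid_side(m, s), stereo)  # lossless round trip
```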
- """ - # Process the stereo channels based on the provided options - if reverse: - wave_left = np.flip(np.asfortranarray(wave[0])) - wave_right = np.flip(np.asfortranarray(wave[1])) - elif mid_side: - wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2) - wave_right = np.asfortranarray(np.subtract(wave[0], wave[1])) - elif mid_side_b2: - wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5)) - wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5)) - else: - wave_left = np.asfortranarray(wave[0]) - wave_right = np.asfortranarray(wave[1]) - - # Compute the spectrogram for each channel - spec_left = librosa.stft(wave_left, n_fft, hop_length=hop_length) - spec_right = librosa.stft(wave_right, n_fft, hop_length=hop_length) - - # Combine the spectrograms into a single array - spec = np.asfortranarray([spec_left, spec_right]) - - return spec - - -def wave_to_spectrogram_mt(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False): - """ - This function is similar to wave_to_spectrogram, but it uses multithreading to compute the spectrograms for the two channels in parallel. - This can provide a speedup on systems with multiple cores. - """ - import threading - - if reverse: - wave_left = np.flip(np.asfortranarray(wave[0])) - wave_right = np.flip(np.asfortranarray(wave[1])) - elif mid_side: - wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2) - wave_right = np.asfortranarray(np.subtract(wave[0], wave[1])) - elif mid_side_b2: - wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5)) - wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5)) - else: - wave_left = np.asfortranarray(wave[0]) - wave_right = np.asfortranarray(wave[1]) - - def run_thread(**kwargs): - global spec_left - spec_left = librosa.stft(**kwargs) - - # Start two threads to compute the spectrograms in parallel - thread = threading.Thread(target=run_thread, kwargs={"y": wave_left, "n_fft": n_fft, "hop_length": hop_length}) - thread.start() - spec_right = librosa.stft(wave_right, n_fft, hop_length=hop_length) - thread.join() - - # Combine the spectrograms into a single array - spec = np.asfortranarray([spec_left, spec_right]) - - return spec - - -def normalize(logger: logging.Logger, wave, max_peak=1.0): - """Normalize audio waveform to a specified peak value. - - Args: - logger (logging.Logger): Logger for debugging information. - wave (array-like): Audio waveform. - max_peak (float): Maximum peak value for normalization. - - Returns: - array-like: Normalized or original waveform. - """ - maxv = np.abs(wave).max() - if maxv > max_peak: - logger.debug(f"Maximum peak amplitude above clipping threshold, normalizing from {maxv} to max peak {max_peak}.") - wave *= max_peak / maxv - else: - logger.debug(f"Maximum peak amplitude not above clipping threshold, no need to normalize: {maxv}") - - return wave - - -def normalize_two_stem(logger: logging.Logger, wave, mix, is_normalize=False): - """Save output music files""" - - maxv = np.abs(wave).max() - max_mix = np.abs(mix).max() - - if maxv > 1.0: - logger.debug(f"Normalization Set {is_normalize}: Primary source above threshold for clipping. Max:{maxv}") - logger.debug(f"Normalization Set {is_normalize}: Mixture above threshold for clipping. Max:{max_mix}") - if is_normalize: - logger.debug(f"The result was normalized.") - wave /= maxv - mix /= maxv - else: - logger.debug(f"The result was not normalized.") - else: - logger.debug(f"Normalization Set {is_normalize}: Input not above threshold for clipping. 
Max:{maxv}") - - logger.debug(f"Normalization Set {is_normalize}: Primary source - Max:{np.abs(wave).max()}") - logger.debug(f"Normalization Set {is_normalize}: Mixture - Max:{np.abs(mix).max()}") - - return wave, mix - - -def combine_spectrograms(specs, mp): - l = min([specs[i].shape[2] for i in specs]) - spec_c = np.zeros(shape=(2, mp.param["bins"] + 1, l), dtype=np.complex64) - offset = 0 - bands_n = len(mp.param["band"]) - - for d in range(1, bands_n + 1): - h = mp.param["band"][d]["crop_stop"] - mp.param["band"][d]["crop_start"] - spec_c[:, offset : offset + h, :l] = specs[d][:, mp.param["band"][d]["crop_start"] : mp.param["band"][d]["crop_stop"], :l] - offset += h - - if offset > mp.param["bins"]: - raise ValueError("Too much bins") - - # lowpass fiter - if mp.param["pre_filter_start"] > 0: # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']: - if bands_n == 1: - spec_c = fft_lp_filter(spec_c, mp.param["pre_filter_start"], mp.param["pre_filter_stop"]) - else: - gp = 1 - for b in range(mp.param["pre_filter_start"] + 1, mp.param["pre_filter_stop"]): - g = math.pow(10, -(b - mp.param["pre_filter_start"]) * (3.5 - gp) / 20.0) - gp = g - spec_c[:, b, :] *= g - - return np.asfortranarray(spec_c) - - -def spectrogram_to_image(spec, mode="magnitude"): - if mode == "magnitude": - if np.iscomplexobj(spec): - y = np.abs(spec) - else: - y = spec - y = np.log10(y**2 + 1e-8) - elif mode == "phase": - if np.iscomplexobj(spec): - y = np.angle(spec) - else: - y = spec - - y -= y.min() - y *= 255 / y.max() - img = np.uint8(y) - - if y.ndim == 3: - img = img.transpose(1, 2, 0) - img = np.concatenate([np.max(img, axis=2, keepdims=True), img], axis=2) - - return img - - -def reduce_vocal_aggressively(X, y, softmask): - v = X - y - y_mag_tmp = np.abs(y) - v_mag_tmp = np.abs(v) - - v_mask = v_mag_tmp > y_mag_tmp - y_mag = np.clip(y_mag_tmp - v_mag_tmp * v_mask * softmask, 0, np.inf) - - return y_mag * np.exp(1.0j * np.angle(y)) - - -def align_wave_head_and_tail(a, b): - l = min([a[0].size, b[0].size]) - - return a[:l, :l], b[:l, :l] - - -def spectrogram_to_wave(spec, hop_length, mid_side, mid_side_b2, reverse, clamp=False): - spec_left = np.asfortranarray(spec[0]) - spec_right = np.asfortranarray(spec[1]) - - wave_left = librosa.istft(spec_left, hop_length=hop_length) - wave_right = librosa.istft(spec_right, hop_length=hop_length) - - if reverse: - return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)]) - elif mid_side: - return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]) - elif mid_side_b2: - return np.asfortranarray([np.add(wave_right / 1.25, 0.4 * wave_left), np.subtract(wave_left / 1.25, 0.4 * wave_right)]) - else: - return np.asfortranarray([wave_left, wave_right]) - - -def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2): - import threading - - spec_left = np.asfortranarray(spec[0]) - spec_right = np.asfortranarray(spec[1]) - - def run_thread(**kwargs): - global wave_left - wave_left = librosa.istft(**kwargs) - - thread = threading.Thread(target=run_thread, kwargs={"stft_matrix": spec_left, "hop_length": hop_length}) - thread.start() - wave_right = librosa.istft(spec_right, hop_length=hop_length) - thread.join() - - if reverse: - return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)]) - elif mid_side: - return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]) - elif mid_side_b2: - return np.asfortranarray([np.add(wave_right / 1.25, 0.4 * 
wave_left), np.subtract(wave_left / 1.25, 0.4 * wave_right)]) - else: - return np.asfortranarray([wave_left, wave_right]) - - -def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None): - bands_n = len(mp.param["band"]) - offset = 0 - - for d in range(1, bands_n + 1): - bp = mp.param["band"][d] - spec_s = np.ndarray(shape=(2, bp["n_fft"] // 2 + 1, spec_m.shape[2]), dtype=complex) - h = bp["crop_stop"] - bp["crop_start"] - spec_s[:, bp["crop_start"] : bp["crop_stop"], :] = spec_m[:, offset : offset + h, :] - - offset += h - if d == bands_n: # higher - if extra_bins_h: # if --high_end_process bypass - max_bin = bp["n_fft"] // 2 - spec_s[:, max_bin - extra_bins_h : max_bin, :] = extra_bins[:, :extra_bins_h, :] - if bp["hpf_start"] > 0: - spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1) - if bands_n == 1: - wave = spectrogram_to_wave(spec_s, bp["hl"], mp.param["mid_side"], mp.param["mid_side_b2"], mp.param["reverse"]) - else: - wave = np.add( - wave, spectrogram_to_wave(spec_s, bp["hl"], mp.param["mid_side"], mp.param["mid_side_b2"], mp.param["reverse"]) - ) - else: - sr = mp.param["band"][d + 1]["sr"] - if d == 1: # lower - spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"]) - wave = librosa.resample( - spectrogram_to_wave(spec_s, bp["hl"], mp.param["mid_side"], mp.param["mid_side_b2"], mp.param["reverse"]), - bp["sr"], - sr, - res_type=wav_resolution, - ) - else: # mid - spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1) - spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"]) - wave2 = np.add( - wave, spectrogram_to_wave(spec_s, bp["hl"], mp.param["mid_side"], mp.param["mid_side_b2"], mp.param["reverse"]) - ) - wave = librosa.resample(wave2, bp["sr"], sr, res_type=wav_resolution) - - return wave - - -def fft_lp_filter(spec, bin_start, bin_stop): - g = 1.0 - for b in range(bin_start, bin_stop): - g -= 1 / (bin_stop - bin_start) - spec[:, b, :] = g * spec[:, b, :] - - spec[:, bin_stop:, :] *= 0 - - return spec - - -def fft_hp_filter(spec, bin_start, bin_stop): - g = 1.0 - for b in range(bin_start, bin_stop, -1): - g -= 1 / (bin_start - bin_stop) - spec[:, b, :] = g * spec[:, b, :] - - spec[:, 0 : bin_stop + 1, :] *= 0 - - return spec - - -def mirroring(a, spec_m, input_high_end, mp): - if "mirroring" == a: - mirror = np.flip( - np.abs(spec_m[:, mp.param["pre_filter_start"] - 10 - input_high_end.shape[1] : mp.param["pre_filter_start"] - 10, :]), 1 - ) - mirror = mirror * np.exp(1.0j * np.angle(input_high_end)) - - return np.where(np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror) - - if "mirroring2" == a: - mirror = np.flip( - np.abs(spec_m[:, mp.param["pre_filter_start"] - 10 - input_high_end.shape[1] : mp.param["pre_filter_start"] - 10, :]), 1 - ) - mi = np.multiply(mirror, input_high_end * 1.7) - - return np.where(np.abs(input_high_end) <= np.abs(mi), input_high_end, mi) - - -def adjust_aggr(mask, is_non_accom_stem, aggressiveness): - aggr = aggressiveness["value"] - - if aggr != 0: - if is_non_accom_stem: - aggr = 1 - aggr - - aggr = [aggr, aggr] - - if aggressiveness["aggr_correction"] is not None: - aggr[0] += aggressiveness["aggr_correction"]["left"] - aggr[1] += aggressiveness["aggr_correction"]["right"] - - for ch in range(2): - mask[ch, : aggressiveness["split_bin"]] = np.power(mask[ch, : aggressiveness["split_bin"]], 1 + aggr[ch] / 3) - mask[ch, aggressiveness["split_bin"] :] = np.power(mask[ch, aggressiveness["split_bin"] :], 1 + aggr[ch]) - - # if is_non_accom_stem: - # mask = (1.0 - mask) - 
- return mask - - -def stft(wave, nfft, hl): - wave_left = np.asfortranarray(wave[0]) - wave_right = np.asfortranarray(wave[1]) - spec_left = librosa.stft(wave_left, nfft, hop_length=hl) - spec_right = librosa.stft(wave_right, nfft, hop_length=hl) - spec = np.asfortranarray([spec_left, spec_right]) - - return spec - - -def istft(spec, hl): - spec_left = np.asfortranarray(spec[0]) - spec_right = np.asfortranarray(spec[1]) - wave_left = librosa.istft(spec_left, hop_length=hl) - wave_right = librosa.istft(spec_right, hop_length=hl) - wave = np.asfortranarray([wave_left, wave_right]) - - return wave - - -def spec_effects(wave, algorithm="Default", value=None): - spec = [stft(wave[0], 2048, 1024), stft(wave[1], 2048, 1024)] - if algorithm == "Min_Mag": - v_spec_m = np.where(np.abs(spec[1]) <= np.abs(spec[0]), spec[1], spec[0]) - wave = istft(v_spec_m, 1024) - elif algorithm == "Max_Mag": - v_spec_m = np.where(np.abs(spec[1]) >= np.abs(spec[0]), spec[1], spec[0]) - wave = istft(v_spec_m, 1024) - elif algorithm == "Default": - wave = (wave[1] * value) + (wave[0] * (1 - value)) - elif algorithm == "Invert_p": - X_mag = np.abs(spec[0]) - y_mag = np.abs(spec[1]) - max_mag = np.where(X_mag >= y_mag, X_mag, y_mag) - v_spec = spec[1] - max_mag * np.exp(1.0j * np.angle(spec[0])) - wave = istft(v_spec, 1024) - - return wave - - -def spectrogram_to_wave_no_mp(spec, n_fft=2048, hop_length=1024): - wave = librosa.istft(spec, n_fft=n_fft, hop_length=hop_length) - - if wave.ndim == 1: - wave = np.asfortranarray([wave, wave]) - - return wave - - -def wave_to_spectrogram_no_mp(wave): - spec = librosa.stft(wave, n_fft=2048, hop_length=1024) - - if spec.ndim == 1: - spec = np.asfortranarray([spec, spec]) - - return spec - - -def invert_audio(specs, invert_p=True): - ln = min([specs[0].shape[2], specs[1].shape[2]]) - specs[0] = specs[0][:, :, :ln] - specs[1] = specs[1][:, :, :ln] - - if invert_p: - X_mag = np.abs(specs[0]) - y_mag = np.abs(specs[1]) - max_mag = np.where(X_mag >= y_mag, X_mag, y_mag) - v_spec = specs[1] - max_mag * np.exp(1.0j * np.angle(specs[0])) - else: - specs[1] = reduce_vocal_aggressively(specs[0], specs[1], 0.2) - v_spec = specs[0] - specs[1] - - return v_spec - - -def invert_stem(mixture, stem): - mixture = wave_to_spectrogram_no_mp(mixture) - stem = wave_to_spectrogram_no_mp(stem) - output = spectrogram_to_wave_no_mp(invert_audio([mixture, stem])) - - return -output.T - - -def ensembling(a, specs): - for i in range(1, len(specs)): - if i == 1: - spec = specs[0] - - ln = min([spec.shape[2], specs[i].shape[2]]) - spec = spec[:, :, :ln] - specs[i] = specs[i][:, :, :ln] - - if MIN_SPEC == a: - spec = np.where(np.abs(specs[i]) <= np.abs(spec), specs[i], spec) - if MAX_SPEC == a: - spec = np.where(np.abs(specs[i]) >= np.abs(spec), specs[i], spec) - if AVERAGE == a: - spec = np.where(np.abs(specs[i]) == np.abs(spec), specs[i], spec) - - return spec - - -def ensemble_inputs(audio_input, algorithm, is_normalization, wav_type_set, save_path): - wavs_ = [] - - if algorithm == AVERAGE: - output = average_audio(audio_input) - samplerate = 44100 - else: - specs = [] - - for i in range(len(audio_input)): - wave, samplerate = librosa.load(audio_input[i], mono=False, sr=44100) - wavs_.append(wave) - spec = wave_to_spectrogram_no_mp(wave) - specs.append(spec) - - wave_shapes = [w.shape[1] for w in wavs_] - target_shape = wavs_[wave_shapes.index(max(wave_shapes))] - - output = spectrogram_to_wave_no_mp(ensembling(algorithm, specs)) - output = to_shape(output, target_shape.shape) - - sf.write(save_path, 
normalize(output.T, is_normalization), samplerate, subtype=wav_type_set) - - -def to_shape(x, target_shape): - padding_list = [] - for x_dim, target_dim in zip(x.shape, target_shape): - pad_value = target_dim - x_dim - pad_tuple = (0, pad_value) - padding_list.append(pad_tuple) - - return np.pad(x, tuple(padding_list), mode="constant") - - -def to_shape_minimize(x: np.ndarray, target_shape): - padding_list = [] - for x_dim, target_dim in zip(x.shape, target_shape): - pad_value = target_dim - x_dim - pad_tuple = (0, pad_value) - padding_list.append(pad_tuple) - - return np.pad(x, tuple(padding_list), mode="constant") - - -def average_audio(audio): - waves = [] - wave_shapes = [] - final_waves = [] - - for i in range(len(audio)): - wave = librosa.load(audio[i], sr=44100, mono=False) - waves.append(wave[0]) - wave_shapes.append(wave[0].shape[1]) - - wave_shapes_index = wave_shapes.index(max(wave_shapes)) - target_shape = waves[wave_shapes_index] - waves.pop(wave_shapes_index) - final_waves.append(target_shape) - - for n_array in waves: - wav_target = to_shape(n_array, target_shape.shape) - final_waves.append(wav_target) - - waves = sum(final_waves) - waves = waves / len(audio) - - return waves - - -def average_dual_sources(wav_1, wav_2, value): - if wav_1.shape > wav_2.shape: - wav_2 = to_shape(wav_2, wav_1.shape) - if wav_1.shape < wav_2.shape: - wav_1 = to_shape(wav_1, wav_2.shape) - - wave = (wav_1 * value) + (wav_2 * (1 - value)) - - return wave - - -def reshape_sources(wav_1: np.ndarray, wav_2: np.ndarray): - if wav_1.shape > wav_2.shape: - wav_2 = to_shape(wav_2, wav_1.shape) - if wav_1.shape < wav_2.shape: - ln = min([wav_1.shape[1], wav_2.shape[1]]) - wav_2 = wav_2[:, :ln] - - ln = min([wav_1.shape[1], wav_2.shape[1]]) - wav_1 = wav_1[:, :ln] - wav_2 = wav_2[:, :ln] - - return wav_2 - - -def align_audio( - file1, file2, file2_aligned, file_subtracted, wav_type_set, is_normalization, command_Text, progress_bar_main_var, save_format -): - def get_diff(a, b): - corr = np.correlate(a, b, "full") - diff = corr.argmax() - (b.shape[0] - 1) - return diff - - progress_bar_main_var.set(10) - - # read tracks - wav1, sr1 = librosa.load(file1, sr=44100, mono=False) - wav2, sr2 = librosa.load(file2, sr=44100, mono=False) - wav1 = wav1.transpose() - wav2 = wav2.transpose() - - command_Text(f"Audio file shapes: {wav1.shape} / {wav2.shape}\n") - - wav2_org = wav2.copy() - progress_bar_main_var.set(20) - - command_Text("Processing files... \n") - - # pick random position and get diff - - counts = {} # counting up for each diff value - progress = 20 - - check_range = 64 - - base = 64 / check_range - - for i in range(check_range): - index = int(random.uniform(44100 * 2, min(wav1.shape[0], wav2.shape[0]) - 44100 * 2)) - shift = int(random.uniform(-22050, +22050)) - samp1 = wav1[index : index + 44100, 0] # currently use left channel - samp2 = wav2[index + shift : index + shift + 44100, 0] - progress += 1 * base - progress_bar_main_var.set(progress) - diff = get_diff(samp1, samp2) - diff -= shift - - if abs(diff) < 22050: - if not diff in counts: - counts[diff] = 0 - counts[diff] += 1 - - # use max counted diff value - max_count = 0 - est_diff = 0 - for diff in counts.keys(): - if counts[diff] > max_count: - max_count = counts[diff] - est_diff = diff - - command_Text(f"Estimated difference is {est_diff} (count: {max_count})\n") - - progress_bar_main_var.set(90) - - audio_files = [] - - def save_aligned_audio(wav2_aligned): - command_Text(f"Aligned File 2 with File 1.\n") - command_Text(f"Saving files... 
") - sf.write(file2_aligned, normalize(wav2_aligned, is_normalization), sr2, subtype=wav_type_set) - save_format(file2_aligned) - min_len = min(wav1.shape[0], wav2_aligned.shape[0]) - wav_sub = wav1[:min_len] - wav2_aligned[:min_len] - audio_files.append(file2_aligned) - return min_len, wav_sub - - # make aligned track 2 - if est_diff > 0: - wav2_aligned = np.append(np.zeros((est_diff, 2)), wav2_org, axis=0) - min_len, wav_sub = save_aligned_audio(wav2_aligned) - elif est_diff < 0: - wav2_aligned = wav2_org[-est_diff:] - min_len, wav_sub = save_aligned_audio(wav2_aligned) - else: - command_Text(f"Audio files already aligned.\n") - command_Text(f"Saving inverted track... ") - min_len = min(wav1.shape[0], wav2.shape[0]) - wav_sub = wav1[:min_len] - wav2[:min_len] - - wav_sub = np.clip(wav_sub, -1, +1) - - sf.write(file_subtracted, normalize(wav_sub, is_normalization), sr1, subtype=wav_type_set) - save_format(file_subtracted) - - progress_bar_main_var.set(95) diff --git a/audio_separator/separator/uvr_lib_v5/mdxnet.py b/audio_separator/separator/uvr_lib_v5/mdxnet.py new file mode 100644 index 0000000..3293c89 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/mdxnet.py @@ -0,0 +1,136 @@ +import torch +import torch.nn as nn +from .modules import TFC_TDF +from pytorch_lightning import LightningModule + +dim_s = 4 + +class AbstractMDXNet(LightningModule): + def __init__(self, target_name, lr, optimizer, dim_c, dim_f, dim_t, n_fft, hop_length, overlap): + super().__init__() + self.target_name = target_name + self.lr = lr + self.optimizer = optimizer + self.dim_c = dim_c + self.dim_f = dim_f + self.dim_t = dim_t + self.n_fft = n_fft + self.n_bins = n_fft // 2 + 1 + self.hop_length = hop_length + self.window = nn.Parameter(torch.hann_window(window_length=self.n_fft, periodic=True), requires_grad=False) + self.freq_pad = nn.Parameter(torch.zeros([1, dim_c, self.n_bins - self.dim_f, self.dim_t]), requires_grad=False) + + def get_optimizer(self): + if self.optimizer == 'rmsprop': + return torch.optim.RMSprop(self.parameters(), self.lr) + + if self.optimizer == 'adamw': + return torch.optim.AdamW(self.parameters(), self.lr) + +class ConvTDFNet(AbstractMDXNet): + def __init__(self, target_name, lr, optimizer, dim_c, dim_f, dim_t, n_fft, hop_length, + num_blocks, l, g, k, bn, bias, overlap): + + super(ConvTDFNet, self).__init__( + target_name, lr, optimizer, dim_c, dim_f, dim_t, n_fft, hop_length, overlap) + #self.save_hyperparameters() + + self.num_blocks = num_blocks + self.l = l + self.g = g + self.k = k + self.bn = bn + self.bias = bias + + if optimizer == 'rmsprop': + norm = nn.BatchNorm2d + + if optimizer == 'adamw': + norm = lambda input:nn.GroupNorm(2, input) + + self.n = num_blocks // 2 + scale = (2, 2) + + self.first_conv = nn.Sequential( + nn.Conv2d(in_channels=self.dim_c, out_channels=g, kernel_size=(1, 1)), + norm(g), + nn.ReLU(), + ) + + f = self.dim_f + c = g + self.encoding_blocks = nn.ModuleList() + self.ds = nn.ModuleList() + for i in range(self.n): + self.encoding_blocks.append(TFC_TDF(c, l, f, k, bn, bias=bias, norm=norm)) + self.ds.append( + nn.Sequential( + nn.Conv2d(in_channels=c, out_channels=c + g, kernel_size=scale, stride=scale), + norm(c + g), + nn.ReLU() + ) + ) + f = f // 2 + c += g + + self.bottleneck_block = TFC_TDF(c, l, f, k, bn, bias=bias, norm=norm) + + self.decoding_blocks = nn.ModuleList() + self.us = nn.ModuleList() + for i in range(self.n): + self.us.append( + nn.Sequential( + nn.ConvTranspose2d(in_channels=c, out_channels=c - g, kernel_size=scale, 
stride=scale), + norm(c - g), + nn.ReLU() + ) + ) + f = f * 2 + c -= g + + self.decoding_blocks.append(TFC_TDF(c, l, f, k, bn, bias=bias, norm=norm)) + + self.final_conv = nn.Sequential( + nn.Conv2d(in_channels=c, out_channels=self.dim_c, kernel_size=(1, 1)), + ) + + def forward(self, x): + + x = self.first_conv(x) + + x = x.transpose(-1, -2) + + ds_outputs = [] + for i in range(self.n): + x = self.encoding_blocks[i](x) + ds_outputs.append(x) + x = self.ds[i](x) + + x = self.bottleneck_block(x) + + for i in range(self.n): + x = self.us[i](x) + x *= ds_outputs[-i - 1] + x = self.decoding_blocks[i](x) + + x = x.transpose(-1, -2) + + x = self.final_conv(x) + + return x + +class Mixer(nn.Module): + def __init__(self, device, mixer_path): + + super(Mixer, self).__init__() + + self.linear = nn.Linear((dim_s+1)*2, dim_s*2, bias=False) + + self.load_state_dict( + torch.load(mixer_path, map_location=device) + ) + + def forward(self, x): + x = x.reshape(1,(dim_s+1)*2,-1).transpose(-1,-2) + x = self.linear(x) + return x.transpose(-1,-2).reshape(dim_s,2,-1) \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/mixer.ckpt b/audio_separator/separator/uvr_lib_v5/mixer.ckpt new file mode 100644 index 0000000..986cc4d Binary files /dev/null and b/audio_separator/separator/uvr_lib_v5/mixer.ckpt differ diff --git a/audio_separator/separator/uvr_lib_v5/modules.py b/audio_separator/separator/uvr_lib_v5/modules.py new file mode 100644 index 0000000..4e77d2f --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/modules.py @@ -0,0 +1,74 @@ +import torch +import torch.nn as nn + + +class TFC(nn.Module): + def __init__(self, c, l, k, norm): + super(TFC, self).__init__() + + self.H = nn.ModuleList() + for i in range(l): + self.H.append( + nn.Sequential( + nn.Conv2d(in_channels=c, out_channels=c, kernel_size=k, stride=1, padding=k // 2), + norm(c), + nn.ReLU(), + ) + ) + + def forward(self, x): + for h in self.H: + x = h(x) + return x + + +class DenseTFC(nn.Module): + def __init__(self, c, l, k, norm): + super(DenseTFC, self).__init__() + + self.conv = nn.ModuleList() + for i in range(l): + self.conv.append( + nn.Sequential( + nn.Conv2d(in_channels=c, out_channels=c, kernel_size=k, stride=1, padding=k // 2), + norm(c), + nn.ReLU(), + ) + ) + + def forward(self, x): + for layer in self.conv[:-1]: + x = torch.cat([layer(x), x], 1) + return self.conv[-1](x) + + +class TFC_TDF(nn.Module): + def __init__(self, c, l, f, k, bn, dense=False, bias=True, norm=nn.BatchNorm2d): + + super(TFC_TDF, self).__init__() + + self.use_tdf = bn is not None + + self.tfc = DenseTFC(c, l, k, norm) if dense else TFC(c, l, k, norm) + + if self.use_tdf: + if bn == 0: + self.tdf = nn.Sequential( + nn.Linear(f, f, bias=bias), + norm(c), + nn.ReLU() + ) + else: + self.tdf = nn.Sequential( + nn.Linear(f, f // bn, bias=bias), + norm(c), + nn.ReLU(), + nn.Linear(f // bn, f, bias=bias), + norm(c), + nn.ReLU() + ) + + def forward(self, x): + x = self.tfc(x) + return x + self.tdf(x) if self.use_tdf else x + diff --git a/audio_separator/separator/uvr_lib_v5/playsound.py b/audio_separator/separator/uvr_lib_v5/playsound.py new file mode 100644 index 0000000..abd708e --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/playsound.py @@ -0,0 +1,242 @@ +import logging +logger = logging.getLogger(__name__) + +class PlaysoundException(Exception): + pass + +def _canonicalizePath(path): + """ + Support passing in a pathlib.Path-like object by converting to str. 
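# Minimal usage sketch for the TFC_TDF residual block defined in
# audio_separator/separator/uvr_lib_v5/modules.py above; the channel/frequency sizes
# below are illustrative assumptions, not values taken from any shipped model.
import torch
import torch.nn as nn
from audio_separator.separator.uvr_lib_v5.modules import TFC_TDF

block = TFC_TDF(c=32, l=3, f=64, k=3, bn=8, norm=nn.BatchNorm2d)
x = torch.randn(4, 32, 16, 64)   # (batch, channels, time, freq) - the Linear layers act on the freq axis
y = block(x)                     # time-frequency convolutions plus a bottlenecked dense branch, added residually
print(y.shape)                   # torch.Size([4, 32, 16, 64]) - shape is preserved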
+ """ + import sys + if sys.version_info[0] >= 3: + return str(path) + else: + # On earlier Python versions, str is a byte string, so attempting to + # convert a unicode string to str will fail. Leave it alone in this case. + return path + +def _playsoundWin(sound, block = True): + ''' + Utilizes windll.winmm. Tested and known to work with MP3 and WAVE on + Windows 7 with Python 2.7. Probably works with more file formats. + Probably works on Windows XP thru Windows 10. Probably works with all + versions of Python. + + Inspired by (but not copied from) Michael Gundlach 's mp3play: + https://github.com/michaelgundlach/mp3play + + I never would have tried using windll.winmm without seeing his code. + ''' + sound = '"' + _canonicalizePath(sound) + '"' + + from ctypes import create_unicode_buffer, windll, wintypes + from time import sleep + windll.winmm.mciSendStringW.argtypes = [wintypes.LPCWSTR, wintypes.LPWSTR, wintypes.UINT, wintypes.HANDLE] + windll.winmm.mciGetErrorStringW.argtypes = [wintypes.DWORD, wintypes.LPWSTR, wintypes.UINT] + + def winCommand(*command): + bufLen = 600 + buf = create_unicode_buffer(bufLen) + command = ' '.join(command) + errorCode = int(windll.winmm.mciSendStringW(command, buf, bufLen - 1, 0)) # use widestring version of the function + if errorCode: + errorBuffer = create_unicode_buffer(bufLen) + windll.winmm.mciGetErrorStringW(errorCode, errorBuffer, bufLen - 1) # use widestring version of the function + exceptionMessage = ('\n Error ' + str(errorCode) + ' for command:' + '\n ' + command + + '\n ' + errorBuffer.value) + logger.error(exceptionMessage) + raise PlaysoundException(exceptionMessage) + return buf.value + + try: + logger.debug('Starting') + winCommand(u'open {}'.format(sound)) + winCommand(u'play {}{}'.format(sound, ' wait' if block else '')) + logger.debug('Returning') + finally: + try: + winCommand(u'close {}'.format(sound)) + except PlaysoundException: + logger.warning(u'Failed to close the file: {}'.format(sound)) + # If it fails, there's nothing more that can be done... + pass + +def _handlePathOSX(sound): + sound = _canonicalizePath(sound) + + if '://' not in sound: + if not sound.startswith('/'): + from os import getcwd + sound = getcwd() + '/' + sound + sound = 'file://' + sound + + try: + # Don't double-encode it. + sound.encode('ascii') + return sound.replace(' ', '%20') + except UnicodeEncodeError: + try: + from urllib.parse import quote # Try the Python 3 import first... + except ImportError: + from urllib import quote # Try using the Python 2 import before giving up entirely... + + parts = sound.split('://', 1) + return parts[0] + '://' + quote(parts[1].encode('utf-8')).replace(' ', '%20') + + +def _playsoundOSX(sound, block = True): + ''' + Utilizes AppKit.NSSound. Tested and known to work with MP3 and WAVE on + OS X 10.11 with Python 2.7. Probably works with anything QuickTime supports. + Probably works on OS X 10.5 and newer. Probably works with all versions of + Python. + + Inspired by (but not copied from) Aaron's Stack Overflow answer here: + http://stackoverflow.com/a/34568298/901641 + + I never would have tried using AppKit.NSSound without seeing his code. 
+ ''' + try: + from AppKit import NSSound + except ImportError: + logger.warning("playsound could not find a copy of AppKit - falling back to using macOS's system copy.") + sys.path.append('/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/PyObjC') + from AppKit import NSSound + + from Foundation import NSURL + from time import sleep + + sound = _handlePathOSX(sound) + url = NSURL.URLWithString_(sound) + if not url: + raise PlaysoundException('Cannot find a sound with filename: ' + sound) + + for i in range(5): + nssound = NSSound.alloc().initWithContentsOfURL_byReference_(url, True) + if nssound: + break + else: + logger.debug('Failed to load sound, although url was good... ' + sound) + else: + raise PlaysoundException('Could not load sound with filename, although URL was good... ' + sound) + nssound.play() + + if block: + sleep(nssound.duration()) + +def _playsoundNix(sound, block = True): + """Play a sound using GStreamer. + + Inspired by this: + https://gstreamer.freedesktop.org/documentation/tutorials/playback/playbin-usage.html + """ + sound = _canonicalizePath(sound) + + # pathname2url escapes non-URL-safe characters + from os.path import abspath, exists + try: + from urllib.request import pathname2url + except ImportError: + # python 2 + from urllib import pathname2url + + import gi + gi.require_version('Gst', '1.0') + from gi.repository import Gst + + Gst.init(None) + + playbin = Gst.ElementFactory.make('playbin', 'playbin') + if sound.startswith(('http://', 'https://')): + playbin.props.uri = sound + else: + path = abspath(sound) + if not exists(path): + raise PlaysoundException(u'File not found: {}'.format(path)) + playbin.props.uri = 'file://' + pathname2url(path) + + + set_result = playbin.set_state(Gst.State.PLAYING) + if set_result != Gst.StateChangeReturn.ASYNC: + raise PlaysoundException( + "playbin.set_state returned " + repr(set_result)) + + # FIXME: use some other bus method than poll() with block=False + # https://lazka.github.io/pgi-docs/#Gst-1.0/classes/Bus.html + logger.debug('Starting play') + if block: + bus = playbin.get_bus() + try: + bus.poll(Gst.MessageType.EOS, Gst.CLOCK_TIME_NONE) + finally: + playbin.set_state(Gst.State.NULL) + + logger.debug('Finishing play') + +def _playsoundAnotherPython(otherPython, sound, block = True, macOS = False): + ''' + Mostly written so that when this is run on python3 on macOS, it can invoke + python2 on macOS... but maybe this idea could be useful on linux, too. + ''' + from inspect import getsourcefile + from os.path import abspath, exists + from subprocess import check_call + from threading import Thread + + sound = _canonicalizePath(sound) + + class PropogatingThread(Thread): + def run(self): + self.exc = None + try: + self.ret = self._target(*self._args, **self._kwargs) + except BaseException as e: + self.exc = e + + def join(self, timeout = None): + super().join(timeout) + if self.exc: + raise self.exc + return self.ret + + # Check if the file exists... 
+ if not exists(abspath(sound)): + raise PlaysoundException('Cannot find a sound with filename: ' + sound) + + playsoundPath = abspath(getsourcefile(lambda: 0)) + t = PropogatingThread(target = lambda: check_call([otherPython, playsoundPath, _handlePathOSX(sound) if macOS else sound])) + t.start() + if block: + t.join() + +from platform import system +system = system() + +if system == 'Windows': + playsound_func = _playsoundWin +elif system == 'Darwin': + playsound_func = _playsoundOSX + import sys + if sys.version_info[0] > 2: + try: + from AppKit import NSSound + except ImportError: + logger.warning("playsound is relying on a python 2 subprocess. Please use `pip3 install PyObjC` if you want playsound to run more efficiently.") + playsound_func = lambda sound, block = True: _playsoundAnotherPython('/System/Library/Frameworks/Python.framework/Versions/2.7/bin/python', sound, block, macOS = True) +else: + playsound_func = _playsoundNix + if __name__ != '__main__': # Ensure we don't infinitely recurse trying to get another python instance. + try: + import gi + gi.require_version('Gst', '1.0') + from gi.repository import Gst + except: + logger.warning("playsound is relying on another python subprocess. Please use `pip install pygobject` if you want playsound to run more efficiently.") + playsound_func = lambda sound, block = True: _playsoundAnotherPython('/usr/bin/python3', sound, block, macOS = False) + +del system + +def play(audio_filepath): + playsound_func(audio_filepath) diff --git a/audio_separator/separator/uvr_lib_v5/pyrb.py b/audio_separator/separator/uvr_lib_v5/pyrb.py new file mode 100644 index 0000000..883a525 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/pyrb.py @@ -0,0 +1,92 @@ +import os +import subprocess +import tempfile +import six +import numpy as np +import soundfile as sf +import sys + +if getattr(sys, 'frozen', False): + BASE_PATH_RUB = sys._MEIPASS +else: + BASE_PATH_RUB = os.path.dirname(os.path.abspath(__file__)) + +__all__ = ['time_stretch', 'pitch_shift'] + +__RUBBERBAND_UTIL = os.path.join(BASE_PATH_RUB, 'rubberband') + +if six.PY2: + DEVNULL = open(os.devnull, 'w') +else: + DEVNULL = subprocess.DEVNULL + +def __rubberband(y, sr, **kwargs): + + assert sr > 0 + + # Get the input and output tempfile + fd, infile = tempfile.mkstemp(suffix='.wav') + os.close(fd) + fd, outfile = tempfile.mkstemp(suffix='.wav') + os.close(fd) + + # dump the audio + sf.write(infile, y, sr) + + try: + # Execute rubberband + arguments = [__RUBBERBAND_UTIL, '-q'] + + for key, value in six.iteritems(kwargs): + arguments.append(str(key)) + arguments.append(str(value)) + + arguments.extend([infile, outfile]) + + subprocess.check_call(arguments, stdout=DEVNULL, stderr=DEVNULL) + + # Load the processed audio. + y_out, _ = sf.read(outfile, always_2d=True) + + # make sure that output dimensions matches input + if y.ndim == 1: + y_out = np.squeeze(y_out) + + except OSError as exc: + six.raise_from(RuntimeError('Failed to execute rubberband. 
' + 'Please verify that rubberband-cli ' + 'is installed.'), + exc) + + finally: + # Remove temp files + os.unlink(infile) + os.unlink(outfile) + + return y_out + +def time_stretch(y, sr, rate, rbargs=None): + if rate <= 0: + raise ValueError('rate must be strictly positive') + + if rate == 1.0: + return y + + if rbargs is None: + rbargs = dict() + + rbargs.setdefault('--tempo', rate) + + return __rubberband(y, sr, **rbargs) + +def pitch_shift(y, sr, n_steps, rbargs=None): + + if n_steps == 0: + return y + + if rbargs is None: + rbargs = dict() + + rbargs.setdefault('--pitch', n_steps) + + return __rubberband(y, sr, **rbargs) diff --git a/audio_separator/separator/uvr_lib_v5/results.py b/audio_separator/separator/uvr_lib_v5/results.py new file mode 100644 index 0000000..476f2d1 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/results.py @@ -0,0 +1,48 @@ +# -*- coding: utf-8 -*- + +""" +Matchering - Audio Matching and Mastering Python Library +Copyright (C) 2016-2022 Sergree + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . +""" + +import os +import soundfile as sf + + +class Result: + def __init__( + self, file: str, subtype: str, use_limiter: bool = True, normalize: bool = True + ): + _, file_ext = os.path.splitext(file) + file_ext = file_ext[1:].upper() + if not sf.check_format(file_ext): + raise TypeError(f"{file_ext} format is not supported") + if not sf.check_format(file_ext, subtype): + raise TypeError(f"{file_ext} format does not have {subtype} subtype") + self.file = file + self.subtype = subtype + self.use_limiter = use_limiter + self.normalize = normalize + + +def pcm16(file: str) -> Result: + return Result(file, "PCM_16") + +def pcm24(file: str) -> Result: + return Result(file, "FLOAT") + +def save_audiofile(file: str, wav_set="PCM_16") -> Result: + return Result(file, wav_set) diff --git a/audio_separator/separator/uvr_lib_v5/spec_utils.py b/audio_separator/separator/uvr_lib_v5/spec_utils.py new file mode 100644 index 0000000..03f099d --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/spec_utils.py @@ -0,0 +1,1309 @@ +import audioread +import librosa +import numpy as np +import soundfile as sf +import math +import platform +import traceback +from scipy.signal import correlate, hilbert +import io + +OPERATING_SYSTEM = platform.system() +SYSTEM_ARCH = platform.platform() +SYSTEM_PROC = platform.processor() +ARM = "arm" + +AUTO_PHASE = "Automatic" +POSITIVE_PHASE = "Positive Phase" +NEGATIVE_PHASE = "Negative Phase" +NONE_P = ("None",) +LOW_P = ("Shifts: Low",) +MED_P = ("Shifts: Medium",) +HIGH_P = ("Shifts: High",) +VHIGH_P = "Shifts: Very High" +MAXIMUM_P = "Shifts: Maximum" + +progress_value = 0 +last_update_time = 0 +is_macos = False + +if OPERATING_SYSTEM == "Windows": + from pyrubberband import pyrb +else: + from audio_separator.separator.uvr_lib_v5 import pyrb + +if OPERATING_SYSTEM == "Darwin": + wav_resolution = "polyphase" if SYSTEM_PROC == ARM or ARM in SYSTEM_ARCH else "sinc_fastest" + wav_resolution_float_resampling = 
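# Minimal sketch of the pyrb wrapper defined in audio_separator/separator/uvr_lib_v5/pyrb.py
# above. It shells out to a rubberband executable expected alongside pyrb.py (or bundled by
# PyInstaller), so this only runs where that binary is present; "input.wav" is a hypothetical file.
import soundfile as sf
from audio_separator.separator.uvr_lib_v5 import pyrb

y, sr = sf.read("input.wav")                 # (frames, channels) array, as the wrapper expects
stretched = pyrb.time_stretch(y, sr, 1.25)   # ~25% faster (shorter), pitch preserved
shifted = pyrb.pitch_shift(y, sr, 2)         # up two semitones, duration preserved
sf.write("stretched.wav", stretched, sr)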
"kaiser_best" if SYSTEM_PROC == ARM or ARM in SYSTEM_ARCH else wav_resolution + is_macos = True +else: + wav_resolution = "sinc_fastest" + wav_resolution_float_resampling = wav_resolution + +MAX_SPEC = "Max Spec" +MIN_SPEC = "Min Spec" +LIN_ENSE = "Linear Ensemble" + +MAX_WAV = MAX_SPEC +MIN_WAV = MIN_SPEC + +AVERAGE = "Average" + + +def crop_center(h1, h2): + """ + This function crops the center of the first input tensor to match the size of the second input tensor. + It is used to ensure that the two tensors have the same size in the time dimension. + """ + h1_shape = h1.size() + h2_shape = h2.size() + + # If the time dimensions are already equal, return the first tensor as is + if h1_shape[3] == h2_shape[3]: + return h1 + # If the time dimension of the first tensor is smaller, raise an error + elif h1_shape[3] < h2_shape[3]: + raise ValueError("h1_shape[3] must be greater than h2_shape[3]") + + # Calculate the start and end indices for cropping + s_time = (h1_shape[3] - h2_shape[3]) // 2 + e_time = s_time + h2_shape[3] + # Crop the first tensor + h1 = h1[:, :, :, s_time:e_time] + + return h1 + + +def preprocess(X_spec): + """ + This function preprocesses a spectrogram by separating it into magnitude and phase components. + This is a common preprocessing step in audio processing tasks. + """ + X_mag = np.abs(X_spec) + X_phase = np.angle(X_spec) + + return X_mag, X_phase + + +def make_padding(width, cropsize, offset): + """ + This function calculates the padding needed to make the width of an image divisible by the crop size. + It is used in the process of splitting an image into smaller patches. + """ + left = offset + roi_size = cropsize - offset * 2 + if roi_size == 0: + roi_size = cropsize + right = roi_size - (width % roi_size) + left + + return left, right, roi_size + + +def normalize(wave, max_peak=1.0): + """Normalize audio waveform to a specified peak value. + + Args: + wave (array-like): Audio waveform. + max_peak (float): Maximum peak value for normalization. + + Returns: + array-like: Normalized or original waveform. + """ + maxv = np.abs(wave).max() + if maxv > max_peak: + wave *= max_peak / maxv + + return wave + + +def auto_transpose(audio_array: np.ndarray): + """ + Ensure that the audio array is in the (channels, samples) format. + + Parameters: + audio_array (ndarray): Input audio array. + + Returns: + ndarray: Transposed audio array if necessary. 
+ """ + + # If the second dimension is 2 (indicating stereo channels), transpose the array + if audio_array.shape[1] == 2: + return audio_array.T + return audio_array + + +def write_array_to_mem(audio_data, subtype): + if isinstance(audio_data, np.ndarray): + audio_buffer = io.BytesIO() + sf.write(audio_buffer, audio_data, 44100, subtype=subtype, format="WAV") + audio_buffer.seek(0) + return audio_buffer + else: + return audio_data + + +def spectrogram_to_image(spec, mode="magnitude"): + if mode == "magnitude": + if np.iscomplexobj(spec): + y = np.abs(spec) + else: + y = spec + y = np.log10(y**2 + 1e-8) + elif mode == "phase": + if np.iscomplexobj(spec): + y = np.angle(spec) + else: + y = spec + + y -= y.min() + y *= 255 / y.max() + img = np.uint8(y) + + if y.ndim == 3: + img = img.transpose(1, 2, 0) + img = np.concatenate([np.max(img, axis=2, keepdims=True), img], axis=2) + + return img + + +def reduce_vocal_aggressively(X, y, softmask): + v = X - y + y_mag_tmp = np.abs(y) + v_mag_tmp = np.abs(v) + + v_mask = v_mag_tmp > y_mag_tmp + y_mag = np.clip(y_mag_tmp - v_mag_tmp * v_mask * softmask, 0, np.inf) + + return y_mag * np.exp(1.0j * np.angle(y)) + + +def merge_artifacts(y_mask, thres=0.01, min_range=64, fade_size=32): + mask = y_mask + + try: + if min_range < fade_size * 2: + raise ValueError("min_range must be >= fade_size * 2") + + idx = np.where(y_mask.min(axis=(0, 1)) > thres)[0] + start_idx = np.insert(idx[np.where(np.diff(idx) != 1)[0] + 1], 0, idx[0]) + end_idx = np.append(idx[np.where(np.diff(idx) != 1)[0]], idx[-1]) + artifact_idx = np.where(end_idx - start_idx > min_range)[0] + weight = np.zeros_like(y_mask) + if len(artifact_idx) > 0: + start_idx = start_idx[artifact_idx] + end_idx = end_idx[artifact_idx] + old_e = None + for s, e in zip(start_idx, end_idx): + if old_e is not None and s - old_e < fade_size: + s = old_e - fade_size * 2 + + if s != 0: + weight[:, :, s : s + fade_size] = np.linspace(0, 1, fade_size) + else: + s -= fade_size + + if e != y_mask.shape[2]: + weight[:, :, e - fade_size : e] = np.linspace(1, 0, fade_size) + else: + e += fade_size + + weight[:, :, s + fade_size : e - fade_size] = 1 + old_e = e + + v_mask = 1 - y_mask + y_mask += weight * v_mask + + mask = y_mask + except Exception as e: + error_name = f"{type(e).__name__}" + traceback_text = "".join(traceback.format_tb(e.__traceback__)) + message = f'{error_name}: "{e}"\n{traceback_text}"' + print("Post Process Failed: ", message) + + return mask + + +def align_wave_head_and_tail(a, b): + l = min([a[0].size, b[0].size]) + + return a[:l, :l], b[:l, :l] + + +def convert_channels(spec, mp, band): + cc = mp.param["band"][band].get("convert_channels") + + if "mid_side_c" == cc: + spec_left = np.add(spec[0], spec[1] * 0.25) + spec_right = np.subtract(spec[1], spec[0] * 0.25) + elif "mid_side" == cc: + spec_left = np.add(spec[0], spec[1]) / 2 + spec_right = np.subtract(spec[0], spec[1]) + elif "stereo_n" == cc: + spec_left = np.add(spec[0], spec[1] * 0.25) / 0.9375 + spec_right = np.add(spec[1], spec[0] * 0.25) / 0.9375 + else: + return spec + + return np.asfortranarray([spec_left, spec_right]) + + +def combine_spectrograms(specs, mp, is_v51_model=False): + l = min([specs[i].shape[2] for i in specs]) + spec_c = np.zeros(shape=(2, mp.param["bins"] + 1, l), dtype=np.complex64) + offset = 0 + bands_n = len(mp.param["band"]) + + for d in range(1, bands_n + 1): + h = mp.param["band"][d]["crop_stop"] - mp.param["band"][d]["crop_start"] + spec_c[:, offset : offset + h, :l] = specs[d][:, 
mp.param["band"][d]["crop_start"] : mp.param["band"][d]["crop_stop"], :l] + offset += h + + if offset > mp.param["bins"]: + raise ValueError("Too much bins") + + # lowpass fiter + + if mp.param["pre_filter_start"] > 0: + if is_v51_model: + spec_c *= get_lp_filter_mask(spec_c.shape[1], mp.param["pre_filter_start"], mp.param["pre_filter_stop"]) + else: + if bands_n == 1: + spec_c = fft_lp_filter(spec_c, mp.param["pre_filter_start"], mp.param["pre_filter_stop"]) + else: + gp = 1 + for b in range(mp.param["pre_filter_start"] + 1, mp.param["pre_filter_stop"]): + g = math.pow(10, -(b - mp.param["pre_filter_start"]) * (3.5 - gp) / 20.0) + gp = g + spec_c[:, b, :] *= g + + return np.asfortranarray(spec_c) + + +def wave_to_spectrogram(wave, hop_length, n_fft, mp, band, is_v51_model=False): + + if wave.ndim == 1: + wave = np.asfortranarray([wave, wave]) + + if not is_v51_model: + if mp.param["reverse"]: + wave_left = np.flip(np.asfortranarray(wave[0])) + wave_right = np.flip(np.asfortranarray(wave[1])) + elif mp.param["mid_side"]: + wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2) + wave_right = np.asfortranarray(np.subtract(wave[0], wave[1])) + elif mp.param["mid_side_b2"]: + wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5)) + wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5)) + else: + wave_left = np.asfortranarray(wave[0]) + wave_right = np.asfortranarray(wave[1]) + else: + wave_left = np.asfortranarray(wave[0]) + wave_right = np.asfortranarray(wave[1]) + + spec_left = librosa.stft(wave_left, n_fft=n_fft, hop_length=hop_length) + spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length) + + spec = np.asfortranarray([spec_left, spec_right]) + + if is_v51_model: + spec = convert_channels(spec, mp, band) + + return spec + + +def spectrogram_to_wave(spec, hop_length=1024, mp={}, band=0, is_v51_model=True): + spec_left = np.asfortranarray(spec[0]) + spec_right = np.asfortranarray(spec[1]) + + wave_left = librosa.istft(spec_left, hop_length=hop_length) + wave_right = librosa.istft(spec_right, hop_length=hop_length) + + if is_v51_model: + cc = mp.param["band"][band].get("convert_channels") + if "mid_side_c" == cc: + return np.asfortranarray([np.subtract(wave_left / 1.0625, wave_right / 4.25), np.add(wave_right / 1.0625, wave_left / 4.25)]) + elif "mid_side" == cc: + return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]) + elif "stereo_n" == cc: + return np.asfortranarray([np.subtract(wave_left, wave_right * 0.25), np.subtract(wave_right, wave_left * 0.25)]) + else: + if mp.param["reverse"]: + return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)]) + elif mp.param["mid_side"]: + return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]) + elif mp.param["mid_side_b2"]: + return np.asfortranarray([np.add(wave_right / 1.25, 0.4 * wave_left), np.subtract(wave_left / 1.25, 0.4 * wave_right)]) + + return np.asfortranarray([wave_left, wave_right]) + + +def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None, is_v51_model=False): + bands_n = len(mp.param["band"]) + offset = 0 + + for d in range(1, bands_n + 1): + bp = mp.param["band"][d] + spec_s = np.ndarray(shape=(2, bp["n_fft"] // 2 + 1, spec_m.shape[2]), dtype=complex) + h = bp["crop_stop"] - bp["crop_start"] + spec_s[:, bp["crop_start"] : bp["crop_stop"], :] = spec_m[:, offset : offset + h, :] + + offset += h + if d == bands_n: # higher + if extra_bins_h: # if --high_end_process 
bypass + max_bin = bp["n_fft"] // 2 + spec_s[:, max_bin - extra_bins_h : max_bin, :] = extra_bins[:, :extra_bins_h, :] + if bp["hpf_start"] > 0: + if is_v51_model: + spec_s *= get_hp_filter_mask(spec_s.shape[1], bp["hpf_start"], bp["hpf_stop"] - 1) + else: + spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1) + if bands_n == 1: + wave = spectrogram_to_wave(spec_s, bp["hl"], mp, d, is_v51_model) + else: + wave = np.add(wave, spectrogram_to_wave(spec_s, bp["hl"], mp, d, is_v51_model)) + else: + sr = mp.param["band"][d + 1]["sr"] + if d == 1: # lower + if is_v51_model: + spec_s *= get_lp_filter_mask(spec_s.shape[1], bp["lpf_start"], bp["lpf_stop"]) + else: + spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"]) + wave = librosa.resample(spectrogram_to_wave(spec_s, bp["hl"], mp, d, is_v51_model), orig_sr=bp["sr"], target_sr=sr, res_type=wav_resolution) + else: # mid + if is_v51_model: + spec_s *= get_hp_filter_mask(spec_s.shape[1], bp["hpf_start"], bp["hpf_stop"] - 1) + spec_s *= get_lp_filter_mask(spec_s.shape[1], bp["lpf_start"], bp["lpf_stop"]) + else: + spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1) + spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"]) + + wave2 = np.add(wave, spectrogram_to_wave(spec_s, bp["hl"], mp, d, is_v51_model)) + wave = librosa.resample(wave2, orig_sr=bp["sr"], target_sr=sr, res_type=wav_resolution) + + return wave + + +def get_lp_filter_mask(n_bins, bin_start, bin_stop): + mask = np.concatenate([np.ones((bin_start - 1, 1)), np.linspace(1, 0, bin_stop - bin_start + 1)[:, None], np.zeros((n_bins - bin_stop, 1))], axis=0) + + return mask + + +def get_hp_filter_mask(n_bins, bin_start, bin_stop): + mask = np.concatenate([np.zeros((bin_stop + 1, 1)), np.linspace(0, 1, 1 + bin_start - bin_stop)[:, None], np.ones((n_bins - bin_start - 2, 1))], axis=0) + + return mask + + +def fft_lp_filter(spec, bin_start, bin_stop): + g = 1.0 + for b in range(bin_start, bin_stop): + g -= 1 / (bin_stop - bin_start) + spec[:, b, :] = g * spec[:, b, :] + + spec[:, bin_stop:, :] *= 0 + + return spec + + +def fft_hp_filter(spec, bin_start, bin_stop): + g = 1.0 + for b in range(bin_start, bin_stop, -1): + g -= 1 / (bin_start - bin_stop) + spec[:, b, :] = g * spec[:, b, :] + + spec[:, 0 : bin_stop + 1, :] *= 0 + + return spec + + +def spectrogram_to_wave_old(spec, hop_length=1024): + if spec.ndim == 2: + wave = librosa.istft(spec, hop_length=hop_length) + elif spec.ndim == 3: + spec_left = np.asfortranarray(spec[0]) + spec_right = np.asfortranarray(spec[1]) + + wave_left = librosa.istft(spec_left, hop_length=hop_length) + wave_right = librosa.istft(spec_right, hop_length=hop_length) + wave = np.asfortranarray([wave_left, wave_right]) + + return wave + + +def wave_to_spectrogram_old(wave, hop_length, n_fft): + wave_left = np.asfortranarray(wave[0]) + wave_right = np.asfortranarray(wave[1]) + + spec_left = librosa.stft(wave_left, n_fft=n_fft, hop_length=hop_length) + spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length) + + spec = np.asfortranarray([spec_left, spec_right]) + + return spec + + +def mirroring(a, spec_m, input_high_end, mp): + if "mirroring" == a: + mirror = np.flip(np.abs(spec_m[:, mp.param["pre_filter_start"] - 10 - input_high_end.shape[1] : mp.param["pre_filter_start"] - 10, :]), 1) + mirror = mirror * np.exp(1.0j * np.angle(input_high_end)) + + return np.where(np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror) + + if "mirroring2" == a: + mirror = np.flip(np.abs(spec_m[:, 
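# Minimal sketch of spec_utils.fft_lp_filter() defined above: gain ramps down linearly between
# bin_start and bin_stop, and everything from bin_stop upward is zeroed. Note that it modifies
# the spectrogram in place and also returns it.
import numpy as np
from audio_separator.separator.uvr_lib_v5 import spec_utils

spec = np.ones((2, 10, 4), dtype=np.complex64)            # (channels, bins, frames)
out = spec_utils.fft_lp_filter(spec, bin_start=4, bin_stop=8)
print(np.abs(out[0, :, 0]).round(2))                      # [1. 1. 1. 1. 0.75 0.5 0.25 0. 0. 0.]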
mp.param["pre_filter_start"] - 10 - input_high_end.shape[1] : mp.param["pre_filter_start"] - 10, :]), 1) + mi = np.multiply(mirror, input_high_end * 1.7) + + return np.where(np.abs(input_high_end) <= np.abs(mi), input_high_end, mi) + + +def adjust_aggr(mask, is_non_accom_stem, aggressiveness): + aggr = aggressiveness["value"] * 2 + + if aggr != 0: + if is_non_accom_stem: + aggr = 1 - aggr + + aggr = [aggr, aggr] + + if aggressiveness["aggr_correction"] is not None: + aggr[0] += aggressiveness["aggr_correction"]["left"] + aggr[1] += aggressiveness["aggr_correction"]["right"] + + for ch in range(2): + mask[ch, : aggressiveness["split_bin"]] = np.power(mask[ch, : aggressiveness["split_bin"]], 1 + aggr[ch] / 3) + mask[ch, aggressiveness["split_bin"] :] = np.power(mask[ch, aggressiveness["split_bin"] :], 1 + aggr[ch]) + + return mask + + +def stft(wave, nfft, hl): + wave_left = np.asfortranarray(wave[0]) + wave_right = np.asfortranarray(wave[1]) + spec_left = librosa.stft(wave_left, n_fft=nfft, hop_length=hl) + spec_right = librosa.stft(wave_right, n_fft=nfft, hop_length=hl) + spec = np.asfortranarray([spec_left, spec_right]) + + return spec + + +def istft(spec, hl): + spec_left = np.asfortranarray(spec[0]) + spec_right = np.asfortranarray(spec[1]) + wave_left = librosa.istft(spec_left, hop_length=hl) + wave_right = librosa.istft(spec_right, hop_length=hl) + wave = np.asfortranarray([wave_left, wave_right]) + + return wave + + +def spec_effects(wave, algorithm="Default", value=None): + spec = [stft(wave[0], 2048, 1024), stft(wave[1], 2048, 1024)] + if algorithm == "Min_Mag": + v_spec_m = np.where(np.abs(spec[1]) <= np.abs(spec[0]), spec[1], spec[0]) + wave = istft(v_spec_m, 1024) + elif algorithm == "Max_Mag": + v_spec_m = np.where(np.abs(spec[1]) >= np.abs(spec[0]), spec[1], spec[0]) + wave = istft(v_spec_m, 1024) + elif algorithm == "Default": + wave = (wave[1] * value) + (wave[0] * (1 - value)) + elif algorithm == "Invert_p": + X_mag = np.abs(spec[0]) + y_mag = np.abs(spec[1]) + max_mag = np.where(X_mag >= y_mag, X_mag, y_mag) + v_spec = spec[1] - max_mag * np.exp(1.0j * np.angle(spec[0])) + wave = istft(v_spec, 1024) + + return wave + + +def spectrogram_to_wave_no_mp(spec, n_fft=2048, hop_length=1024): + wave = librosa.istft(spec, n_fft=n_fft, hop_length=hop_length) + + if wave.ndim == 1: + wave = np.asfortranarray([wave, wave]) + + return wave + + +def wave_to_spectrogram_no_mp(wave): + + spec = librosa.stft(wave, n_fft=2048, hop_length=1024) + + if spec.ndim == 1: + spec = np.asfortranarray([spec, spec]) + + return spec + + +def invert_audio(specs, invert_p=True): + + ln = min([specs[0].shape[2], specs[1].shape[2]]) + specs[0] = specs[0][:, :, :ln] + specs[1] = specs[1][:, :, :ln] + + if invert_p: + X_mag = np.abs(specs[0]) + y_mag = np.abs(specs[1]) + max_mag = np.where(X_mag >= y_mag, X_mag, y_mag) + v_spec = specs[1] - max_mag * np.exp(1.0j * np.angle(specs[0])) + else: + specs[1] = reduce_vocal_aggressively(specs[0], specs[1], 0.2) + v_spec = specs[0] - specs[1] + + return v_spec + + +def invert_stem(mixture, stem): + mixture = wave_to_spectrogram_no_mp(mixture) + stem = wave_to_spectrogram_no_mp(stem) + output = spectrogram_to_wave_no_mp(invert_audio([mixture, stem])) + + return -output.T + + +def ensembling(a, inputs, is_wavs=False): + + for i in range(1, len(inputs)): + if i == 1: + input = inputs[0] + + if is_wavs: + ln = min([input.shape[1], inputs[i].shape[1]]) + input = input[:, :ln] + inputs[i] = inputs[i][:, :ln] + else: + ln = min([input.shape[2], inputs[i].shape[2]]) + 
input = input[:, :, :ln] + inputs[i] = inputs[i][:, :, :ln] + + if MIN_SPEC == a: + input = np.where(np.abs(inputs[i]) <= np.abs(input), inputs[i], input) + if MAX_SPEC == a: + input = np.where(np.abs(inputs[i]) >= np.abs(input), inputs[i], input) + + # linear_ensemble + # input = ensemble_wav(inputs, split_size=1) + + return input + + +def ensemble_for_align(waves): + + specs = [] + + for wav in waves: + spec = wave_to_spectrogram_no_mp(wav.T) + specs.append(spec) + + wav_aligned = spectrogram_to_wave_no_mp(ensembling(MIN_SPEC, specs)).T + wav_aligned = match_array_shapes(wav_aligned, waves[1], is_swap=True) + + return wav_aligned + + +def ensemble_inputs(audio_input, algorithm, is_normalization, wav_type_set, save_path, is_wave=False, is_array=False): + + wavs_ = [] + + if algorithm == AVERAGE: + output = average_audio(audio_input) + samplerate = 44100 + else: + specs = [] + + for i in range(len(audio_input)): + wave, samplerate = librosa.load(audio_input[i], mono=False, sr=44100) + wavs_.append(wave) + spec = wave if is_wave else wave_to_spectrogram_no_mp(wave) + specs.append(spec) + + wave_shapes = [w.shape[1] for w in wavs_] + target_shape = wavs_[wave_shapes.index(max(wave_shapes))] + + if is_wave: + output = ensembling(algorithm, specs, is_wavs=True) + else: + output = spectrogram_to_wave_no_mp(ensembling(algorithm, specs)) + + output = to_shape(output, target_shape.shape) + + sf.write(save_path, normalize(output.T, is_normalization), samplerate, subtype=wav_type_set) + + +def to_shape(x, target_shape): + padding_list = [] + for x_dim, target_dim in zip(x.shape, target_shape): + pad_value = target_dim - x_dim + pad_tuple = (0, pad_value) + padding_list.append(pad_tuple) + + return np.pad(x, tuple(padding_list), mode="constant") + + +def to_shape_minimize(x: np.ndarray, target_shape): + + padding_list = [] + for x_dim, target_dim in zip(x.shape, target_shape): + pad_value = target_dim - x_dim + pad_tuple = (0, pad_value) + padding_list.append(pad_tuple) + + return np.pad(x, tuple(padding_list), mode="constant") + + +def detect_leading_silence(audio, sr, silence_threshold=0.007, frame_length=1024): + """ + Detect silence at the beginning of an audio signal. + + :param audio: np.array, audio signal + :param sr: int, sample rate + :param silence_threshold: float, magnitude threshold below which is considered silence + :param frame_length: int, the number of samples to consider for each check + + :return: float, duration of the leading silence in milliseconds + """ + + if len(audio.shape) == 2: + # If stereo, pick the channel with more energy to determine the silence + channel = np.argmax(np.sum(np.abs(audio), axis=1)) + audio = audio[channel] + + for i in range(0, len(audio), frame_length): + if np.max(np.abs(audio[i : i + frame_length])) > silence_threshold: + return (i / sr) * 1000 + + return (len(audio) / sr) * 1000 + + +def adjust_leading_silence(target_audio, reference_audio, silence_threshold=0.01, frame_length=1024): + """ + Adjust the leading silence of the target_audio to match the leading silence of the reference_audio. 
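# Minimal sketch of spec_utils.to_shape() defined above: zero-pads the end of each axis so a
# shorter array matches a target shape (used when averaging or combining stems of different lengths).
import numpy as np
from audio_separator.separator.uvr_lib_v5 import spec_utils

short = np.ones((2, 100))
padded = spec_utils.to_shape(short, (2, 160))
print(padded.shape)          # (2, 160)
print(padded[0, 100:105])    # [0. 0. 0. 0. 0.] - the extra samples are zeros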
+ + :param target_audio: np.array, audio signal that will have its silence adjusted + :param reference_audio: np.array, audio signal used as a reference + :param sr: int, sample rate + :param silence_threshold: float, magnitude threshold below which is considered silence + :param frame_length: int, the number of samples to consider for each check + + :return: np.array, target_audio adjusted to have the same leading silence as reference_audio + """ + + def find_silence_end(audio): + if len(audio.shape) == 2: + # If stereo, pick the channel with more energy to determine the silence + channel = np.argmax(np.sum(np.abs(audio), axis=1)) + audio_mono = audio[channel] + else: + audio_mono = audio + + for i in range(0, len(audio_mono), frame_length): + if np.max(np.abs(audio_mono[i : i + frame_length])) > silence_threshold: + return i + return len(audio_mono) + + ref_silence_end = find_silence_end(reference_audio) + target_silence_end = find_silence_end(target_audio) + silence_difference = ref_silence_end - target_silence_end + + try: + ref_silence_end_p = (ref_silence_end / 44100) * 1000 + target_silence_end_p = (target_silence_end / 44100) * 1000 + silence_difference_p = ref_silence_end_p - target_silence_end_p + print("silence_difference: ", silence_difference_p) + except Exception as e: + pass + + if silence_difference > 0: # Add silence to target_audio + if len(target_audio.shape) == 2: # stereo + silence_to_add = np.zeros((target_audio.shape[0], silence_difference)) + else: # mono + silence_to_add = np.zeros(silence_difference) + return np.hstack((silence_to_add, target_audio)) + elif silence_difference < 0: # Remove silence from target_audio + if len(target_audio.shape) == 2: # stereo + return target_audio[:, -silence_difference:] + else: # mono + return target_audio[-silence_difference:] + else: # No adjustment needed + return target_audio + + +def match_array_shapes(array_1: np.ndarray, array_2: np.ndarray, is_swap=False): + + if is_swap: + array_1, array_2 = array_1.T, array_2.T + + # print("before", array_1.shape, array_2.shape) + if array_1.shape[1] > array_2.shape[1]: + array_1 = array_1[:, : array_2.shape[1]] + elif array_1.shape[1] < array_2.shape[1]: + padding = array_2.shape[1] - array_1.shape[1] + array_1 = np.pad(array_1, ((0, 0), (0, padding)), "constant", constant_values=0) + + # print("after", array_1.shape, array_2.shape) + + if is_swap: + array_1, array_2 = array_1.T, array_2.T + + return array_1 + + +def match_mono_array_shapes(array_1: np.ndarray, array_2: np.ndarray): + + if len(array_1) > len(array_2): + array_1 = array_1[: len(array_2)] + elif len(array_1) < len(array_2): + padding = len(array_2) - len(array_1) + array_1 = np.pad(array_1, (0, padding), "constant", constant_values=0) + + return array_1 + + +def change_pitch_semitones(y, sr, semitone_shift): + factor = 2 ** (semitone_shift / 12) # Convert semitone shift to factor for resampling + y_pitch_tuned = [] + for y_channel in y: + y_pitch_tuned.append(librosa.resample(y_channel, orig_sr=sr, target_sr=sr * factor, res_type=wav_resolution_float_resampling)) + y_pitch_tuned = np.array(y_pitch_tuned) + new_sr = sr * factor + return y_pitch_tuned, new_sr + + +def augment_audio(export_path, audio_file, rate, is_normalization, wav_type_set, save_format=None, is_pitch=False, is_time_correction=True): + + wav, sr = librosa.load(audio_file, sr=44100, mono=False) + + if wav.ndim == 1: + wav = np.asfortranarray([wav, wav]) + + if not is_time_correction: + wav_mix = change_pitch_semitones(wav, 44100, semitone_shift=-rate)[0] 
+ else: + if is_pitch: + wav_1 = pyrb.pitch_shift(wav[0], sr, rate, rbargs=None) + wav_2 = pyrb.pitch_shift(wav[1], sr, rate, rbargs=None) + else: + wav_1 = pyrb.time_stretch(wav[0], sr, rate, rbargs=None) + wav_2 = pyrb.time_stretch(wav[1], sr, rate, rbargs=None) + + if wav_1.shape > wav_2.shape: + wav_2 = to_shape(wav_2, wav_1.shape) + if wav_1.shape < wav_2.shape: + wav_1 = to_shape(wav_1, wav_2.shape) + + wav_mix = np.asfortranarray([wav_1, wav_2]) + + sf.write(export_path, normalize(wav_mix.T, is_normalization), sr, subtype=wav_type_set) + save_format(export_path) + + +def average_audio(audio): + + waves = [] + wave_shapes = [] + final_waves = [] + + for i in range(len(audio)): + wave = librosa.load(audio[i], sr=44100, mono=False) + waves.append(wave[0]) + wave_shapes.append(wave[0].shape[1]) + + wave_shapes_index = wave_shapes.index(max(wave_shapes)) + target_shape = waves[wave_shapes_index] + waves.pop(wave_shapes_index) + final_waves.append(target_shape) + + for n_array in waves: + wav_target = to_shape(n_array, target_shape.shape) + final_waves.append(wav_target) + + waves = sum(final_waves) + waves = waves / len(audio) + + return waves + + +def average_dual_sources(wav_1, wav_2, value): + + if wav_1.shape > wav_2.shape: + wav_2 = to_shape(wav_2, wav_1.shape) + if wav_1.shape < wav_2.shape: + wav_1 = to_shape(wav_1, wav_2.shape) + + wave = (wav_1 * value) + (wav_2 * (1 - value)) + + return wave + + +def reshape_sources(wav_1: np.ndarray, wav_2: np.ndarray): + + if wav_1.shape > wav_2.shape: + wav_2 = to_shape(wav_2, wav_1.shape) + if wav_1.shape < wav_2.shape: + ln = min([wav_1.shape[1], wav_2.shape[1]]) + wav_2 = wav_2[:, :ln] + + ln = min([wav_1.shape[1], wav_2.shape[1]]) + wav_1 = wav_1[:, :ln] + wav_2 = wav_2[:, :ln] + + return wav_2 + + +def reshape_sources_ref(wav_1_shape, wav_2: np.ndarray): + + if wav_1_shape > wav_2.shape: + wav_2 = to_shape(wav_2, wav_1_shape) + + return wav_2 + + +def combine_arrarys(audio_sources, is_swap=False): + source = np.zeros_like(max(audio_sources, key=np.size)) + + for v in audio_sources: + v = match_array_shapes(v, source, is_swap=is_swap) + source += v + + return source + + +def combine_audio(paths: list, audio_file_base=None, wav_type_set="FLOAT", save_format=None): + + source = combine_arrarys([load_audio(i) for i in paths]) + save_path = f"{audio_file_base}_combined.wav" + sf.write(save_path, source.T, 44100, subtype=wav_type_set) + save_format(save_path) + + +def reduce_mix_bv(inst_source, voc_source, reduction_rate=0.9): + # Reduce the volume + inst_source = inst_source * (1 - reduction_rate) + + mix_reduced = combine_arrarys([inst_source, voc_source], is_swap=True) + + return mix_reduced + + +def organize_inputs(inputs): + input_list = {"target": None, "reference": None, "reverb": None, "inst": None} + + for i in inputs: + if i.endswith("_(Vocals).wav"): + input_list["reference"] = i + elif "_RVC_" in i: + input_list["target"] = i + elif i.endswith("reverbed_stem.wav"): + input_list["reverb"] = i + elif i.endswith("_(Instrumental).wav"): + input_list["inst"] = i + + return input_list + + +def check_if_phase_inverted(wav1, wav2, is_mono=False): + # Load the audio files + if not is_mono: + wav1 = np.mean(wav1, axis=0) + wav2 = np.mean(wav2, axis=0) + + # Compute the correlation + correlation = np.corrcoef(wav1[:1000], wav2[:1000]) + + return correlation[0, 1] < 0 + + +def align_audio( + file1, + file2, + file2_aligned, + file_subtracted, + wav_type_set, + is_save_aligned, + command_Text, + save_format, + align_window: list, + 
align_intro_val: list, + db_analysis: tuple, + set_progress_bar, + phase_option, + phase_shifts, + is_match_silence, + is_spec_match, +): + + global progress_value + progress_value = 0 + is_mono = False + + def get_diff(a, b): + corr = np.correlate(a, b, "full") + diff = corr.argmax() - (b.shape[0] - 1) + + return diff + + def progress_bar(length): + global progress_value + progress_value += 1 + + if (0.90 / length * progress_value) >= 0.9: + length = progress_value + 1 + + set_progress_bar(0.1, (0.9 / length * progress_value)) + + # read tracks + + if file1.endswith(".mp3") and is_macos: + length1 = rerun_mp3(file1) + wav1, sr1 = librosa.load(file1, duration=length1, sr=44100, mono=False) + else: + wav1, sr1 = librosa.load(file1, sr=44100, mono=False) + + if file2.endswith(".mp3") and is_macos: + length2 = rerun_mp3(file2) + wav2, sr2 = librosa.load(file2, duration=length2, sr=44100, mono=False) + else: + wav2, sr2 = librosa.load(file2, sr=44100, mono=False) + + if wav1.ndim == 1 and wav2.ndim == 1: + is_mono = True + elif wav1.ndim == 1: + wav1 = np.asfortranarray([wav1, wav1]) + elif wav2.ndim == 1: + wav2 = np.asfortranarray([wav2, wav2]) + + # Check if phase is inverted + if phase_option == AUTO_PHASE: + if check_if_phase_inverted(wav1, wav2, is_mono=is_mono): + wav2 = -wav2 + elif phase_option == POSITIVE_PHASE: + wav2 = +wav2 + elif phase_option == NEGATIVE_PHASE: + wav2 = -wav2 + + if is_match_silence: + wav2 = adjust_leading_silence(wav2, wav1) + + wav1_length = int(librosa.get_duration(y=wav1, sr=44100)) + wav2_length = int(librosa.get_duration(y=wav2, sr=44100)) + + if not is_mono: + wav1 = wav1.transpose() + wav2 = wav2.transpose() + + wav2_org = wav2.copy() + + command_Text("Processing files... \n") + seconds_length = min(wav1_length, wav2_length) + + wav2_aligned_sources = [] + + for sec_len in align_intro_val: + # pick a position at 1 second in and get diff + sec_seg = 1 if sec_len == 1 else int(seconds_length // sec_len) + index = sr1 * sec_seg # 1 second in, assuming sr1 = sr2 = 44100 + + if is_mono: + samp1, samp2 = wav1[index : index + sr1], wav2[index : index + sr1] + diff = get_diff(samp1, samp2) + # print(f"Estimated difference: {diff}\n") + else: + index = sr1 * sec_seg # 1 second in, assuming sr1 = sr2 = 44100 + samp1, samp2 = wav1[index : index + sr1, 0], wav2[index : index + sr1, 0] + samp1_r, samp2_r = wav1[index : index + sr1, 1], wav2[index : index + sr1, 1] + diff, diff_r = get_diff(samp1, samp2), get_diff(samp1_r, samp2_r) + # print(f"Estimated difference Left Channel: {diff}\nEstimated difference Right Channel: {diff_r}\n") + + # make aligned track 2 + if diff > 0: + zeros_to_append = np.zeros(diff) if is_mono else np.zeros((diff, 2)) + wav2_aligned = np.append(zeros_to_append, wav2_org, axis=0) + elif diff < 0: + wav2_aligned = wav2_org[-diff:] + else: + wav2_aligned = wav2_org + # command_Text(f"Audio files already aligned.\n") + + if not any(np.array_equal(wav2_aligned, source) for source in wav2_aligned_sources): + wav2_aligned_sources.append(wav2_aligned) + + # print("Unique Sources: ", len(wav2_aligned_sources)) + + unique_sources = len(wav2_aligned_sources) + + sub_mapper_big_mapper = {} + + for s in wav2_aligned_sources: + wav2_aligned = match_mono_array_shapes(s, wav1) if is_mono else match_array_shapes(s, wav1, is_swap=True) + + if align_window: + wav_sub = time_correction( + wav1, wav2_aligned, seconds_length, align_window=align_window, db_analysis=db_analysis, progress_bar=progress_bar, unique_sources=unique_sources, phase_shifts=phase_shifts + ) 
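# Minimal sketch of the nested get_diff() helper used by align_audio() above (re-declared here
# for illustration, since it is defined inside the function): the peak of the full
# cross-correlation gives the lag of the second signal relative to the first.
import numpy as np

def get_diff(a, b):
    corr = np.correlate(a, b, "full")
    return corr.argmax() - (b.shape[0] - 1)

ref = np.zeros(1000)
ref[100] = 1.0               # impulse at sample 100
late = np.zeros(1000)
late[130] = 1.0              # same impulse, 30 samples later
print(get_diff(ref, late))   # -30: align_audio would trim 30 samples from the start of track 2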
+ wav_sub_size = np.abs(wav_sub).mean() + sub_mapper_big_mapper = {**sub_mapper_big_mapper, **{wav_sub_size: wav_sub}} + else: + wav2_aligned = wav2_aligned * np.power(10, db_analysis[0] / 20) + db_range = db_analysis[1] + + for db_adjustment in db_range: + # Adjust the dB of track2 + s_adjusted = wav2_aligned * (10 ** (db_adjustment / 20)) + wav_sub = wav1 - s_adjusted + wav_sub_size = np.abs(wav_sub).mean() + sub_mapper_big_mapper = {**sub_mapper_big_mapper, **{wav_sub_size: wav_sub}} + + # print(sub_mapper_big_mapper.keys(), min(sub_mapper_big_mapper.keys())) + + sub_mapper_value_list = list(sub_mapper_big_mapper.values()) + + if is_spec_match and len(sub_mapper_value_list) >= 2: + # print("using spec ensemble with align") + wav_sub = ensemble_for_align(list(sub_mapper_big_mapper.values())) + else: + # print("using linear ensemble with align") + wav_sub = ensemble_wav(list(sub_mapper_big_mapper.values())) + + # print(f"Mix Mean: {np.abs(wav1).mean()}\nInst Mean: {np.abs(wav2).mean()}") + # print('Final: ', np.abs(wav_sub).mean()) + wav_sub = np.clip(wav_sub, -1, +1) + + command_Text(f"Saving inverted track... ") + + if is_save_aligned or is_spec_match: + wav1 = match_mono_array_shapes(wav1, wav_sub) if is_mono else match_array_shapes(wav1, wav_sub, is_swap=True) + wav2_aligned = wav1 - wav_sub + + if is_spec_match: + if wav1.ndim == 1 and wav2.ndim == 1: + wav2_aligned = np.asfortranarray([wav2_aligned, wav2_aligned]).T + wav1 = np.asfortranarray([wav1, wav1]).T + + wav2_aligned = ensemble_for_align([wav2_aligned, wav1]) + wav_sub = wav1 - wav2_aligned + + if is_save_aligned: + sf.write(file2_aligned, wav2_aligned, sr1, subtype=wav_type_set) + save_format(file2_aligned) + + sf.write(file_subtracted, wav_sub, sr1, subtype=wav_type_set) + save_format(file_subtracted) + + +def phase_shift_hilbert(signal, degree): + analytic_signal = hilbert(signal) + return np.cos(np.radians(degree)) * analytic_signal.real - np.sin(np.radians(degree)) * analytic_signal.imag + + +def get_phase_shifted_tracks(track, phase_shift): + if phase_shift == 180: + return [track, -track] + + step = phase_shift + end = 180 - (180 % step) if 180 % step == 0 else 181 + phase_range = range(step, end, step) + + flipped_list = [track, -track] + for i in phase_range: + flipped_list.extend([phase_shift_hilbert(track, i), phase_shift_hilbert(track, -i)]) + + return flipped_list + + +def time_correction(mix: np.ndarray, instrumental: np.ndarray, seconds_length, align_window, db_analysis, sr=44100, progress_bar=None, unique_sources=None, phase_shifts=NONE_P): + # Function to align two tracks using cross-correlation + + def align_tracks(track1, track2): + # A dictionary to store each version of track2_shifted and its mean absolute value + shifted_tracks = {} + + # Loop to adjust dB of track2 + track2 = track2 * np.power(10, db_analysis[0] / 20) + db_range = db_analysis[1] + + if phase_shifts == 190: + track2_flipped = [track2] + else: + track2_flipped = get_phase_shifted_tracks(track2, phase_shifts) + + for db_adjustment in db_range: + for t in track2_flipped: + # Adjust the dB of track2 + track2_adjusted = t * (10 ** (db_adjustment / 20)) + corr = correlate(track1, track2_adjusted) + delay = np.argmax(np.abs(corr)) - (len(track1) - 1) + track2_shifted = np.roll(track2_adjusted, shift=delay) + + # Compute the mean absolute value of track2_shifted + track2_shifted_sub = track1 - track2_shifted + mean_abs_value = np.abs(track2_shifted_sub).mean() + + # Store track2_shifted and its mean absolute value in the dictionary + 
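# Minimal sketch of spec_utils.phase_shift_hilbert() defined above: it rotates a signal's phase
# via the analytic (Hilbert) signal, which get_phase_shifted_tracks() uses to try several phase
# candidates during alignment.
import numpy as np
from audio_separator.separator.uvr_lib_v5 import spec_utils

t = np.linspace(0, 1, 44100, endpoint=False)
tone = np.sin(2 * np.pi * 440 * t)

flipped = spec_utils.phase_shift_hilbert(tone, 180)    # polarity inversion
print(np.allclose(flipped, -tone))                     # True (up to float error)
quarter = spec_utils.phase_shift_hilbert(tone, 90)     # ~90 degree shift: the 440 Hz sine becomes roughly a cosine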
shifted_tracks[mean_abs_value] = track2_shifted + + # Return the version of track2_shifted with the smallest mean absolute value + + return shifted_tracks[min(shifted_tracks.keys())] + + # Make sure the audio files have the same shape + + assert mix.shape == instrumental.shape, f"Audio files must have the same shape - Mix: {mix.shape}, Inst: {instrumental.shape}" + + seconds_length = seconds_length // 2 + + sub_mapper = {} + + progress_update_interval = 120 + total_iterations = 0 + + if len(align_window) > 2: + progress_update_interval = 320 + + for secs in align_window: + step = secs / 2 + window_size = int(sr * secs) + step_size = int(sr * step) + + if len(mix.shape) == 1: + total_mono = (len(range(0, len(mix) - window_size, step_size)) // progress_update_interval) * unique_sources + total_iterations += total_mono + else: + total_stereo_ = len(range(0, len(mix[:, 0]) - window_size, step_size)) * 2 + total_stereo = (total_stereo_ // progress_update_interval) * unique_sources + total_iterations += total_stereo + + # print(total_iterations) + + for secs in align_window: + sub = np.zeros_like(mix) + divider = np.zeros_like(mix) + step = secs / 2 + window_size = int(sr * secs) + step_size = int(sr * step) + window = np.hanning(window_size) + + # For the mono case: + if len(mix.shape) == 1: + # The files are mono + counter = 0 + for i in range(0, len(mix) - window_size, step_size): + counter += 1 + if counter % progress_update_interval == 0: + progress_bar(total_iterations) + window_mix = mix[i : i + window_size] * window + window_instrumental = instrumental[i : i + window_size] * window + window_instrumental_aligned = align_tracks(window_mix, window_instrumental) + sub[i : i + window_size] += window_mix - window_instrumental_aligned + divider[i : i + window_size] += window + else: + # The files are stereo + counter = 0 + for ch in range(mix.shape[1]): + for i in range(0, len(mix[:, ch]) - window_size, step_size): + counter += 1 + if counter % progress_update_interval == 0: + progress_bar(total_iterations) + window_mix = mix[i : i + window_size, ch] * window + window_instrumental = instrumental[i : i + window_size, ch] * window + window_instrumental_aligned = align_tracks(window_mix, window_instrumental) + sub[i : i + window_size, ch] += window_mix - window_instrumental_aligned + divider[i : i + window_size, ch] += window + + # Normalize the result by the overlap count + sub = np.where(divider > 1e-6, sub / divider, sub) + sub_size = np.abs(sub).mean() + sub_mapper = {**sub_mapper, **{sub_size: sub}} + + # print("SUB_LEN", len(list(sub_mapper.values()))) + + sub = ensemble_wav(list(sub_mapper.values()), split_size=12) + + return sub + + +def ensemble_wav(waveforms, split_size=240): + # Create a dictionary to hold the thirds of each waveform and their mean absolute values + waveform_thirds = {i: np.array_split(waveform, split_size) for i, waveform in enumerate(waveforms)} + + # Initialize the final waveform + final_waveform = [] + + # For chunk + for third_idx in range(split_size): + # Compute the mean absolute value of each third from each waveform + means = [np.abs(waveform_thirds[i][third_idx]).mean() for i in range(len(waveforms))] + + # Find the index of the waveform with the lowest mean absolute value for this third + min_index = np.argmin(means) + + # Add the least noisy third to the final waveform + final_waveform.append(waveform_thirds[min_index][third_idx]) + + # Concatenate all the thirds to create the final waveform + final_waveform = np.concatenate(final_waveform) + + return 
final_waveform + + +def ensemble_wav_min(waveforms): + for i in range(1, len(waveforms)): + if i == 1: + wave = waveforms[0] + + ln = min(len(wave), len(waveforms[i])) + wave = wave[:ln] + waveforms[i] = waveforms[i][:ln] + + wave = np.where(np.abs(waveforms[i]) <= np.abs(wave), waveforms[i], wave) + + return wave + + +def align_audio_test(wav1, wav2, sr1=44100): + def get_diff(a, b): + corr = np.correlate(a, b, "full") + diff = corr.argmax() - (b.shape[0] - 1) + return diff + + # read tracks + wav1 = wav1.transpose() + wav2 = wav2.transpose() + + # print(f"Audio file shapes: {wav1.shape} / {wav2.shape}\n") + + wav2_org = wav2.copy() + + # pick a position at 1 second in and get diff + index = sr1 # *seconds_length # 1 second in, assuming sr1 = sr2 = 44100 + samp1 = wav1[index : index + sr1, 0] # currently use left channel + samp2 = wav2[index : index + sr1, 0] + diff = get_diff(samp1, samp2) + + # make aligned track 2 + if diff > 0: + wav2_aligned = np.append(np.zeros((diff, 1)), wav2_org, axis=0) + elif diff < 0: + wav2_aligned = wav2_org[-diff:] + else: + wav2_aligned = wav2_org + + return wav2_aligned + + +def load_audio(audio_file): + wav, sr = librosa.load(audio_file, sr=44100, mono=False) + + if wav.ndim == 1: + wav = np.asfortranarray([wav, wav]) + + return wav + + +def rerun_mp3(audio_file): + with audioread.audio_open(audio_file) as f: + track_length = int(f.duration) + + return track_length diff --git a/audio_separator/separator/stft.py b/audio_separator/separator/uvr_lib_v5/stft.py similarity index 93% rename from audio_separator/separator/stft.py rename to audio_separator/separator/uvr_lib_v5/stft.py index c1dd2c3..f440395 100644 --- a/audio_separator/separator/stft.py +++ b/audio_separator/separator/uvr_lib_v5/stft.py @@ -1,10 +1,13 @@ import torch -# These functions perform the Short-Time Fourier Transform (stft) and its inverse (istft). -# They are essential for converting the audio between the time domain and the frequency domain, -# which is a crucial aspect of audio processing in neural networks. class STFT: + """ + This class performs the Short-Time Fourier Transform (STFT) and its inverse (ISTFT). + These functions are essential for converting the audio between the time domain and the frequency domain, + which is a crucial aspect of audio processing in neural networks. + """ + def __init__(self, logger, n_fft, hop_length, dim_f, device): self.logger = logger self.n_fft = n_fft @@ -35,9 +38,7 @@ def __call__(self, input_tensor): reshaped_tensor = input_tensor.reshape([-1, time_dim]) # Perform the Short-Time Fourier Transform (STFT) on the reshaped tensor. - stft_output = torch.stft( - reshaped_tensor, n_fft=self.n_fft, hop_length=self.hop_length, window=stft_window, center=True, return_complex=False - ) + stft_output = torch.stft(reshaped_tensor, n_fft=self.n_fft, hop_length=self.hop_length, window=stft_window, center=True, return_complex=False) # Rearrange the dimensions of the STFT output to bring the frequency dimension forward. 
permuted_stft_output = stft_output.permute([0, 3, 1, 2]) diff --git a/audio_separator/separator/uvr_lib_v5/tfc_tdf_v3.py b/audio_separator/separator/uvr_lib_v5/tfc_tdf_v3.py new file mode 100644 index 0000000..eba006c --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/tfc_tdf_v3.py @@ -0,0 +1,253 @@ +import torch +import torch.nn as nn +from functools import partial + +class STFT: + def __init__(self, n_fft, hop_length, dim_f, device): + self.n_fft = n_fft + self.hop_length = hop_length + self.window = torch.hann_window(window_length=self.n_fft, periodic=True) + self.dim_f = dim_f + self.device = device + + def __call__(self, x): + + x_is_mps = not x.device.type in ["cuda", "cpu"] + if x_is_mps: + x = x.cpu() + + window = self.window.to(x.device) + batch_dims = x.shape[:-2] + c, t = x.shape[-2:] + x = x.reshape([-1, t]) + x = torch.stft(x, n_fft=self.n_fft, hop_length=self.hop_length, window=window, center=True,return_complex=False) + x = x.permute([0, 3, 1, 2]) + x = x.reshape([*batch_dims, c, 2, -1, x.shape[-1]]).reshape([*batch_dims, c * 2, -1, x.shape[-1]]) + + if x_is_mps: + x = x.to(self.device) + + return x[..., :self.dim_f, :] + + def inverse(self, x): + + x_is_mps = not x.device.type in ["cuda", "cpu"] + if x_is_mps: + x = x.cpu() + + window = self.window.to(x.device) + batch_dims = x.shape[:-3] + c, f, t = x.shape[-3:] + n = self.n_fft // 2 + 1 + f_pad = torch.zeros([*batch_dims, c, n - f, t]).to(x.device) + x = torch.cat([x, f_pad], -2) + x = x.reshape([*batch_dims, c // 2, 2, n, t]).reshape([-1, 2, n, t]) + x = x.permute([0, 2, 3, 1]) + x = x[..., 0] + x[..., 1] * 1.j + x = torch.istft(x, n_fft=self.n_fft, hop_length=self.hop_length, window=window, center=True) + x = x.reshape([*batch_dims, 2, -1]) + + if x_is_mps: + x = x.to(self.device) + + return x + +def get_norm(norm_type): + def norm(c, norm_type): + if norm_type == 'BatchNorm': + return nn.BatchNorm2d(c) + elif norm_type == 'InstanceNorm': + return nn.InstanceNorm2d(c, affine=True) + elif 'GroupNorm' in norm_type: + g = int(norm_type.replace('GroupNorm', '')) + return nn.GroupNorm(num_groups=g, num_channels=c) + else: + return nn.Identity() + + return partial(norm, norm_type=norm_type) + + +def get_act(act_type): + if act_type == 'gelu': + return nn.GELU() + elif act_type == 'relu': + return nn.ReLU() + elif act_type[:3] == 'elu': + alpha = float(act_type.replace('elu', '')) + return nn.ELU(alpha) + else: + raise Exception + + +class Upscale(nn.Module): + def __init__(self, in_c, out_c, scale, norm, act): + super().__init__() + self.conv = nn.Sequential( + norm(in_c), + act, + nn.ConvTranspose2d(in_channels=in_c, out_channels=out_c, kernel_size=scale, stride=scale, bias=False) + ) + + def forward(self, x): + return self.conv(x) + + +class Downscale(nn.Module): + def __init__(self, in_c, out_c, scale, norm, act): + super().__init__() + self.conv = nn.Sequential( + norm(in_c), + act, + nn.Conv2d(in_channels=in_c, out_channels=out_c, kernel_size=scale, stride=scale, bias=False) + ) + + def forward(self, x): + return self.conv(x) + + +class TFC_TDF(nn.Module): + def __init__(self, in_c, c, l, f, bn, norm, act): + super().__init__() + + self.blocks = nn.ModuleList() + for i in range(l): + block = nn.Module() + + block.tfc1 = nn.Sequential( + norm(in_c), + act, + nn.Conv2d(in_c, c, 3, 1, 1, bias=False), + ) + block.tdf = nn.Sequential( + norm(c), + act, + nn.Linear(f, f // bn, bias=False), + norm(c), + act, + nn.Linear(f // bn, f, bias=False), + ) + block.tfc2 = nn.Sequential( + norm(c), + act, + nn.Conv2d(c, c, 3, 1, 1, 
bias=False), + ) + block.shortcut = nn.Conv2d(in_c, c, 1, 1, 0, bias=False) + + self.blocks.append(block) + in_c = c + + def forward(self, x): + for block in self.blocks: + s = block.shortcut(x) + x = block.tfc1(x) + x = x + block.tdf(x) + x = block.tfc2(x) + x = x + s + return x + + +class TFC_TDF_net(nn.Module): + def __init__(self, config, device): + super().__init__() + self.config = config + self.device = device + + norm = get_norm(norm_type=config.model.norm) + act = get_act(act_type=config.model.act) + + self.num_target_instruments = 1 if config.training.target_instrument else len(config.training.instruments) + self.num_subbands = config.model.num_subbands + + dim_c = self.num_subbands * config.audio.num_channels * 2 + n = config.model.num_scales + scale = config.model.scale + l = config.model.num_blocks_per_scale + c = config.model.num_channels + g = config.model.growth + bn = config.model.bottleneck_factor + f = config.audio.dim_f // self.num_subbands + + self.first_conv = nn.Conv2d(dim_c, c, 1, 1, 0, bias=False) + + self.encoder_blocks = nn.ModuleList() + for i in range(n): + block = nn.Module() + block.tfc_tdf = TFC_TDF(c, c, l, f, bn, norm, act) + block.downscale = Downscale(c, c + g, scale, norm, act) + f = f // scale[1] + c += g + self.encoder_blocks.append(block) + + self.bottleneck_block = TFC_TDF(c, c, l, f, bn, norm, act) + + self.decoder_blocks = nn.ModuleList() + for i in range(n): + block = nn.Module() + block.upscale = Upscale(c, c - g, scale, norm, act) + f = f * scale[1] + c -= g + block.tfc_tdf = TFC_TDF(2 * c, c, l, f, bn, norm, act) + self.decoder_blocks.append(block) + + self.final_conv = nn.Sequential( + nn.Conv2d(c + dim_c, c, 1, 1, 0, bias=False), + act, + nn.Conv2d(c, self.num_target_instruments * dim_c, 1, 1, 0, bias=False) + ) + + self.stft = STFT(config.audio.n_fft, config.audio.hop_length, config.audio.dim_f, self.device) + + def cac2cws(self, x): + k = self.num_subbands + b, c, f, t = x.shape + x = x.reshape(b, c, k, f // k, t) + x = x.reshape(b, c * k, f // k, t) + return x + + def cws2cac(self, x): + k = self.num_subbands + b, c, f, t = x.shape + x = x.reshape(b, c // k, k, f, t) + x = x.reshape(b, c // k, f * k, t) + return x + + def forward(self, x): + + x = self.stft(x) + + mix = x = self.cac2cws(x) + + first_conv_out = x = self.first_conv(x) + + x = x.transpose(-1, -2) + + encoder_outputs = [] + for block in self.encoder_blocks: + x = block.tfc_tdf(x) + encoder_outputs.append(x) + x = block.downscale(x) + + x = self.bottleneck_block(x) + + for block in self.decoder_blocks: + x = block.upscale(x) + x = torch.cat([x, encoder_outputs.pop()], 1) + x = block.tfc_tdf(x) + + x = x.transpose(-1, -2) + + x = x * first_conv_out # reduce artifacts + + x = self.final_conv(torch.cat([mix, x], 1)) + + x = self.cws2cac(x) + + if self.num_target_instruments > 1: + b, c, f, t = x.shape + x = x.reshape(b, self.num_target_instruments, -1, f, t) + + x = self.stft.inverse(x) + + return x + + diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/__init__.py b/audio_separator/separator/uvr_lib_v5/vr_network/__init__.py new file mode 100644 index 0000000..361b708 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/__init__.py @@ -0,0 +1 @@ +# VR init. 
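The subband packing that TFC_TDF_net wraps around its convolutional stack (cac2cws before the first convolution, cws2cac after the final one) is just a pair of reshapes, so it can be sanity-checked in isolation. The sketch below is a standalone illustration with made-up shapes (k=4 subbands, a stereo real/imaginary spectrogram packed as 4 channels); it mirrors the two reshape methods above rather than importing the class, and confirms the round trip is lossless.

```
import torch


def cac2cws(x, k):
    # Pack k frequency subbands into the channel axis: (b, c, f, t) -> (b, c*k, f//k, t)
    b, c, f, t = x.shape
    return x.reshape(b, c, k, f // k, t).reshape(b, c * k, f // k, t)


def cws2cac(x, k):
    # Inverse reshape: unpack the subbands back onto the frequency axis
    b, c, f, t = x.shape
    return x.reshape(b, c // k, k, f, t).reshape(b, c // k, f * k, t)


k = 4                                 # num_subbands, an arbitrary value for this sketch
spec = torch.randn(1, 4, 256, 100)    # (batch, stereo real/imag as channels, dim_f, frames)

packed = cac2cws(spec, k)
restored = cws2cac(packed, k)

print(packed.shape)                   # torch.Size([1, 16, 64, 100])
assert torch.equal(restored, spec)    # pure reshapes, so the round trip is exact
```

Because both directions are plain reshapes over contiguous memory, no information is lost; the packing simply shortens the frequency axis seen by the 2D convolutions while keeping per-subband content separated across channels.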
diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/layers.py b/audio_separator/separator/uvr_lib_v5/vr_network/layers.py new file mode 100644 index 0000000..7526447 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/layers.py @@ -0,0 +1,294 @@ +import torch +from torch import nn +import torch.nn.functional as F + +from audio_separator.separator.uvr_lib_v5 import spec_utils + + +class Conv2DBNActiv(nn.Module): + """ + This class implements a convolutional layer followed by batch normalization and an activation function. + It is a common pattern in deep learning for processing images or feature maps. The convolutional layer + applies a set of learnable filters to the input. Batch normalization then normalizes the output of the + convolution, and finally, an activation function introduces non-linearity to the model, allowing it to + learn more complex patterns. + + Attributes: + conv (nn.Sequential): A sequential container of Conv2d, BatchNorm2d, and an activation layer. + + Args: + num_input_channels (int): Number of input channels. + num_output_channels (int): Number of output channels. + kernel_size (int, optional): Size of the kernel. Defaults to 3. + stride_length (int, optional): Stride of the convolution. Defaults to 1. + padding_size (int, optional): Padding added to all sides of the input. Defaults to 1. + dilation_rate (int, optional): Spacing between kernel elements. Defaults to 1. + activation_function (callable, optional): The activation function to use. Defaults to nn.ReLU. + """ + + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): + super(Conv2DBNActiv, self).__init__() + + # The nn.Sequential container allows us to stack the Conv2d, BatchNorm2d, and activation layers + # into a single module, simplifying the forward pass. + self.conv = nn.Sequential(nn.Conv2d(nin, nout, kernel_size=ksize, stride=stride, padding=pad, dilation=dilation, bias=False), nn.BatchNorm2d(nout), activ()) + + def __call__(self, input_tensor): + # Defines the computation performed at every call. + # Simply passes the input through the sequential container. + return self.conv(input_tensor) + + +class SeperableConv2DBNActiv(nn.Module): + """ + This class implements a separable convolutional layer followed by batch normalization and an activation function. + Separable convolutions are a type of convolution that splits the convolution operation into two simpler operations: + a depthwise convolution and a pointwise convolution. This can reduce the number of parameters and computational cost, + making the network more efficient while maintaining similar performance. + + The depthwise convolution applies a single filter per input channel (input depth). The pointwise convolution, + which follows, applies a 1x1 convolution to combine the outputs of the depthwise convolution across channels. + Batch normalization is then applied to stabilize learning and reduce internal covariate shift. Finally, + an activation function introduces non-linearity, allowing the network to learn complex patterns. + Attributes: + conv (nn.Sequential): A sequential container of depthwise Conv2d, pointwise Conv2d, BatchNorm2d, and an activation layer. + + Args: + num_input_channels (int): Number of input channels. + num_output_channels (int): Number of output channels. + kernel_size (int, optional): Size of the kernel for the depthwise convolution. Defaults to 3. + stride_length (int, optional): Stride of the convolution. Defaults to 1. 
+ padding_size (int, optional): Padding added to all sides of the input for the depthwise convolution. Defaults to 1. + dilation_rate (int, optional): Spacing between kernel elements for the depthwise convolution. Defaults to 1. + activation_function (callable, optional): The activation function to use. Defaults to nn.ReLU. + """ + + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): + super(SeperableConv2DBNActiv, self).__init__() + + # Initialize the sequential container with the depthwise convolution. + # The number of groups in the depthwise convolution is set to num_input_channels, which means each input channel is treated separately. + # The pointwise convolution then combines these separate channels into num_output_channels channels. + # Batch normalization is applied to the output of the pointwise convolution. + # Finally, the activation function is applied to introduce non-linearity. + self.conv = nn.Sequential( + nn.Conv2d( + nin, + nin, # For depthwise convolution, in_channels = out_channels = num_input_channels + kernel_size=ksize, + stride=stride, + padding=pad, + dilation=dilation, + groups=nin, # This makes it a depthwise convolution + bias=False, # Bias is not used because it will be handled by BatchNorm2d + ), + nn.Conv2d( + nin, + nout, # Pointwise convolution to combine channels + kernel_size=1, # Kernel size of 1 for pointwise convolution + bias=False, # Bias is not used because it will be handled by BatchNorm2d + ), + nn.BatchNorm2d(nout), # Normalize the output of the pointwise convolution + activ(), # Apply the activation function + ) + + def __call__(self, input_tensor): + # Pass the input through the sequential container. + # This performs the depthwise convolution, followed by the pointwise convolution, + # batch normalization, and finally applies the activation function. + return self.conv(input_tensor) + + +class Encoder(nn.Module): + """ + The Encoder class is a part of the neural network architecture that is responsible for processing the input data. + It consists of two convolutional layers, each followed by batch normalization and an activation function. + The purpose of the Encoder is to transform the input data into a higher-level, abstract representation. + This is achieved by applying filters (through convolutions) that can capture patterns or features in the data. + The Encoder can be thought of as a feature extractor that prepares the data for further processing by the network. + Attributes: + conv1 (Conv2DBNActiv): The first convolutional layer in the encoder. + conv2 (Conv2DBNActiv): The second convolutional layer in the encoder. + + Args: + number_of_input_channels (int): Number of input channels for the first convolutional layer. + number_of_output_channels (int): Number of output channels for the convolutional layers. + kernel_size (int): Kernel size for the convolutional layers. + stride_length (int): Stride for the convolutional operations. + padding_size (int): Padding added to all sides of the input for the convolutional layers. + activation_function (callable): The activation function to use after each convolutional layer. + """ + + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): + super(Encoder, self).__init__() + + # The first convolutional layer takes the input and applies a convolution, + # followed by batch normalization and an activation function specified by `activation_function`. + # This layer is responsible for capturing the initial set of features from the input data. 
+ self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) + + # The second convolutional layer further processes the output from the first layer, + # applying another set of convolution, batch normalization, and activation. + # This layer helps in capturing more complex patterns in the data by building upon the initial features extracted by conv1. + self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) + + def __call__(self, input_tensor): + # The input data `input_tensor` is passed through the first convolutional layer. + # The output of this layer serves as a 'skip connection' that can be used later in the network to preserve spatial information. + skip = self.conv1(input_tensor) + + # The output from the first layer is then passed through the second convolutional layer. + # This processed data `hidden` is the final output of the Encoder, representing the abstracted features of the input. + hidden = self.conv2(skip) + + # The Encoder returns two outputs: `hidden`, the abstracted feature representation, and `skip`, the intermediate representation from conv1. + return hidden, skip + + +class Decoder(nn.Module): + """ + The Decoder class is part of the neural network architecture, specifically designed to perform the inverse operation of an encoder. + Its main role is to reconstruct or generate data from encoded representations, which is crucial in tasks like image segmentation or audio processing. + This class uses upsampling, convolution, optional dropout for regularization, and concatenation of skip connections to achieve its goal. + + Attributes: + convolution (Conv2DBNActiv): A convolutional layer with batch normalization and activation function. + dropout_layer (nn.Dropout2d): An optional dropout layer for regularization to prevent overfitting. + + Args: + input_channels (int): Number of input channels for the convolutional layer. + output_channels (int): Number of output channels for the convolutional layer. + kernel_size (int): Kernel size for the convolutional layer. + stride (int): Stride for the convolutional operations. + padding (int): Padding added to all sides of the input for the convolutional layer. + activation_function (callable): The activation function to use after the convolutional layer. + include_dropout (bool): Whether to include a dropout layer for regularization. + """ + + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): + super(Decoder, self).__init__() + + # Initialize the convolutional layer with specified parameters. + self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) + + # Initialize the dropout layer if include_dropout is set to True + self.dropout = nn.Dropout2d(0.1) if dropout else None + + def __call__(self, input_tensor, skip=None): + # Upsample the input tensor to a higher resolution using bilinear interpolation. + input_tensor = F.interpolate(input_tensor, scale_factor=2, mode="bilinear", align_corners=True) + # If a skip connection is provided, crop it to match the size of input_tensor and concatenate them along the channel dimension. + if skip is not None: + skip = spec_utils.crop_center(skip, input_tensor) # Crop skip_connection to match input_tensor's dimensions. + input_tensor = torch.cat([input_tensor, skip], dim=1) # Concatenate input_tensor and skip_connection along the channel dimension. + + # Pass the concatenated tensor (or just input_tensor if no skip_connection is provided) through the convolutional layer. 
+ output_tensor = self.conv(input_tensor) + + # If dropout is enabled, apply it to the output of the convolutional layer. + if self.dropout is not None: + output_tensor = self.dropout(output_tensor) + + # Return the final output tensor. + return output_tensor + + +class ASPPModule(nn.Module): + """ + Atrous Spatial Pyramid Pooling (ASPP) Module is designed for capturing multi-scale context by applying + atrous convolution at multiple rates. This is particularly useful in segmentation tasks where capturing + objects at various scales is beneficial. The module applies several parallel dilated convolutions with + different dilation rates to the input feature map, allowing it to efficiently capture information at + multiple scales. + + Attributes: + conv1 (nn.Sequential): Applies adaptive average pooling followed by a 1x1 convolution. + nn_architecture (int): Identifier for the neural network architecture being used. + six_layer (list): List containing architecture identifiers that require six layers. + seven_layer (list): List containing architecture identifiers that require seven layers. + conv2-conv7 (nn.Module): Convolutional layers with varying dilation rates for multi-scale feature extraction. + bottleneck (nn.Sequential): A 1x1 convolutional layer that combines all features followed by dropout for regularization. + """ + + def __init__(self, nn_architecture, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): + """ + Initializes the ASPP module with specified parameters. + + Args: + nn_architecture (int): Identifier for the neural network architecture. + input_channels (int): Number of input channels. + output_channels (int): Number of output channels. + dilations (tuple): Tuple of dilation rates for the atrous convolutions. + activation (callable): Activation function to use after convolutional layers. + """ + super(ASPPModule, self).__init__() + + # Adaptive average pooling reduces the spatial dimensions to 1x1, focusing on global context, + # followed by a 1x1 convolution to project back to the desired channel dimension. + self.conv1 = nn.Sequential(nn.AdaptiveAvgPool2d((1, None)), Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)) + + self.nn_architecture = nn_architecture + # Architecture identifiers for models requiring additional layers. + self.six_layer = [129605] + self.seven_layer = [537238, 537227, 33966] + + # Extra convolutional layer used for six and seven layer configurations. + extra_conv = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + + # Standard 1x1 convolution for channel reduction. + self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) + + # Separable convolutions with different dilation rates for multi-scale feature extraction. + self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) + self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) + self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + + # Depending on the architecture, include the extra convolutional layers. + if self.nn_architecture in self.six_layer: + self.conv6 = extra_conv + nin_x = 6 + elif self.nn_architecture in self.seven_layer: + self.conv6 = extra_conv + self.conv7 = extra_conv + nin_x = 7 + else: + nin_x = 5 + + # Bottleneck layer combines all the multi-scale features into the desired number of output channels. 
+ self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * nin_x, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)) + + def forward(self, input_tensor): + """ + Forward pass of the ASPP module. + + Args: + input_tensor (Tensor): Input tensor. + + Returns: + Tensor: Output tensor after applying ASPP. + """ + _, _, h, w = input_tensor.size() + + # Apply the first convolutional sequence and upsample to the original resolution. + feat1 = F.interpolate(self.conv1(input_tensor), size=(h, w), mode="bilinear", align_corners=True) + + # Apply the remaining convolutions directly on the input. + feat2 = self.conv2(input_tensor) + feat3 = self.conv3(input_tensor) + feat4 = self.conv4(input_tensor) + feat5 = self.conv5(input_tensor) + + # Concatenate features from all layers. Depending on the architecture, include the extra features. + if self.nn_architecture in self.six_layer: + feat6 = self.conv6(input_tensor) + out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6), dim=1) + elif self.nn_architecture in self.seven_layer: + feat6 = self.conv6(input_tensor) + feat7 = self.conv7(input_tensor) + out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) + else: + out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) + + # Apply the bottleneck layer to combine and reduce the channel dimensions. + bottleneck_output = self.bottleneck(out) + return bottleneck_output diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/layers_new.py b/audio_separator/separator/uvr_lib_v5/vr_network/layers_new.py new file mode 100644 index 0000000..56b7d45 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/layers_new.py @@ -0,0 +1,149 @@ +import torch +from torch import nn +import torch.nn.functional as F + +from audio_separator.separator.uvr_lib_v5 import spec_utils + + +class Conv2DBNActiv(nn.Module): + """ + Conv2DBNActiv Class: + This class implements a convolutional layer followed by batch normalization and an activation function. + It is a fundamental building block for constructing neural networks, especially useful in image and audio processing tasks. + The class encapsulates the pattern of applying a convolution, normalizing the output, and then applying a non-linear activation. + """ + + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): + super(Conv2DBNActiv, self).__init__() + + # Sequential model combining Conv2D, BatchNorm, and activation function into a single module + self.conv = nn.Sequential(nn.Conv2d(nin, nout, kernel_size=ksize, stride=stride, padding=pad, dilation=dilation, bias=False), nn.BatchNorm2d(nout), activ()) + + def __call__(self, input_tensor): + # Forward pass through the sequential model + return self.conv(input_tensor) + + +class Encoder(nn.Module): + """ + Encoder Class: + This class defines an encoder module typically used in autoencoder architectures. + It consists of two convolutional layers, each followed by batch normalization and an activation function. 
+ """ + + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): + super(Encoder, self).__init__() + + # First convolutional layer of the encoder + self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ) + # Second convolutional layer of the encoder + self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) + + def __call__(self, input_tensor): + # Applying the first and then the second convolutional layers + hidden = self.conv1(input_tensor) + hidden = self.conv2(hidden) + + return hidden + + +class Decoder(nn.Module): + """ + Decoder Class: + This class defines a decoder module, which is the counterpart of the Encoder class in autoencoder architectures. + It applies a convolutional layer followed by batch normalization and an activation function, with an optional dropout layer for regularization. + """ + + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): + super(Decoder, self).__init__() + # Convolutional layer with optional dropout for regularization + self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) + # self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) + self.dropout = nn.Dropout2d(0.1) if dropout else None + + def __call__(self, input_tensor, skip=None): + # Forward pass through the convolutional layer and optional dropout + input_tensor = F.interpolate(input_tensor, scale_factor=2, mode="bilinear", align_corners=True) + + if skip is not None: + skip = spec_utils.crop_center(skip, input_tensor) + input_tensor = torch.cat([input_tensor, skip], dim=1) + + hidden = self.conv1(input_tensor) + # hidden = self.conv2(hidden) + + if self.dropout is not None: + hidden = self.dropout(hidden) + + return hidden + + +class ASPPModule(nn.Module): + """ + ASPPModule Class: + This class implements the Atrous Spatial Pyramid Pooling (ASPP) module, which is useful for semantic image segmentation tasks. + It captures multi-scale contextual information by applying convolutions at multiple dilation rates. 
+ """ + + def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False): + super(ASPPModule, self).__init__() + + # Global context convolution captures the overall context + self.conv1 = nn.Sequential(nn.AdaptiveAvgPool2d((1, None)), Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ)) + self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ) + self.conv3 = Conv2DBNActiv(nin, nout, 3, 1, dilations[0], dilations[0], activ=activ) + self.conv4 = Conv2DBNActiv(nin, nout, 3, 1, dilations[1], dilations[1], activ=activ) + self.conv5 = Conv2DBNActiv(nin, nout, 3, 1, dilations[2], dilations[2], activ=activ) + self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ) + self.dropout = nn.Dropout2d(0.1) if dropout else None + + def forward(self, input_tensor): + _, _, h, w = input_tensor.size() + + # Upsample global context to match input size and combine with local and multi-scale features + feat1 = F.interpolate(self.conv1(input_tensor), size=(h, w), mode="bilinear", align_corners=True) + feat2 = self.conv2(input_tensor) + feat3 = self.conv3(input_tensor) + feat4 = self.conv4(input_tensor) + feat5 = self.conv5(input_tensor) + out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) + out = self.bottleneck(out) + + if self.dropout is not None: + out = self.dropout(out) + + return out + + +class LSTMModule(nn.Module): + """ + LSTMModule Class: + This class defines a module that combines convolutional feature extraction with a bidirectional LSTM for sequence modeling. + It is useful for tasks that require understanding temporal dynamics in data, such as speech and audio processing. + """ + + def __init__(self, nin_conv, nin_lstm, nout_lstm): + super(LSTMModule, self).__init__() + # Convolutional layer for initial feature extraction + self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0) + + # Bidirectional LSTM for capturing temporal dynamics + self.lstm = nn.LSTM(input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True) + + # Dense layer for output dimensionality matching + self.dense = nn.Sequential(nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU()) + + def forward(self, input_tensor): + N, _, nbins, nframes = input_tensor.size() + + # Extract features and prepare for LSTM + hidden = self.conv(input_tensor)[:, 0] # N, nbins, nframes + hidden = hidden.permute(2, 0, 1) # nframes, N, nbins + h, _ = self.lstm(h) + + # Apply dense layer and reshape to match expected output format + hidden = self.dense(h.reshape(-1, hidden.size()[-1])) # nframes * N, nbins + hidden = hidden.reshape(nframes, N, 1, nbins) + hidden = hidden.permute(1, 2, 3, 0) + + return hidden diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/model_param_init.py b/audio_separator/separator/uvr_lib_v5/vr_network/model_param_init.py new file mode 100644 index 0000000..8bba702 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/model_param_init.py @@ -0,0 +1,71 @@ +import json + +default_param = {} +default_param["bins"] = -1 +default_param["unstable_bins"] = -1 # training only +default_param["stable_bins"] = -1 # training only +default_param["sr"] = 44100 +default_param["pre_filter_start"] = -1 +default_param["pre_filter_stop"] = -1 +default_param["band"] = {} + +N_BINS = "n_bins" + + +def int_keys(d): + """ + Converts string keys that represent integers into actual integer keys in a list. + + This function is particularly useful when dealing with JSON data that may represent + integer keys as strings due to the nature of JSON encoding. 
By converting these keys + back to integers, it ensures that the data can be used in a manner consistent with + its original representation, especially in contexts where the distinction between + string and integer keys is important. + + Args: + input_list (list of tuples): A list of (key, value) pairs where keys are strings + that may represent integers. + + Returns: + dict: A dictionary with keys converted to integers where applicable. + """ + # Initialize an empty dictionary to hold the converted key-value pairs. + result_dict = {} + # Iterate through each key-value pair in the input list. + for key, value in d: + # Check if the key is a digit (i.e., represents an integer). + if key.isdigit(): + # Convert the key from a string to an integer. + key = int(key) + result_dict[key] = value + return result_dict + + +class ModelParameters(object): + """ + A class to manage model parameters, including loading from a configuration file. + + Attributes: + param (dict): Dictionary holding all parameters for the model. + """ + + def __init__(self, config_path=""): + """ + Initializes the ModelParameters object by loading parameters from a JSON configuration file. + + Args: + config_path (str): Path to the JSON configuration file. + """ + + # Load parameters from the given configuration file path. + with open(config_path, "r") as f: + self.param = json.loads(f.read(), object_pairs_hook=int_keys) + + # Ensure certain parameters are set to False if not specified in the configuration. + for k in ["mid_side", "mid_side_b", "mid_side_b2", "stereo_w", "stereo_n", "reverse"]: + if not k in self.param: + self.param[k] = False + + # If 'n_bins' is specified in the parameters, it's used as the value for 'bins'. + if N_BINS in self.param: + self.param["bins"] = self.param[N_BINS] diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr16000_hl512.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr16000_hl512.json new file mode 100644 index 0000000..72cb449 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr16000_hl512.json @@ -0,0 +1,19 @@ +{ + "bins": 1024, + "unstable_bins": 0, + "reduction_bins": 0, + "band": { + "1": { + "sr": 16000, + "hl": 512, + "n_fft": 2048, + "crop_start": 0, + "crop_stop": 1024, + "hpf_start": -1, + "res_type": "sinc_best" + } + }, + "sr": 16000, + "pre_filter_start": 1023, + "pre_filter_stop": 1024 +} \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr32000_hl512.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr32000_hl512.json new file mode 100644 index 0000000..3c00ecf --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr32000_hl512.json @@ -0,0 +1,19 @@ +{ + "bins": 1024, + "unstable_bins": 0, + "reduction_bins": 0, + "band": { + "1": { + "sr": 32000, + "hl": 512, + "n_fft": 2048, + "crop_start": 0, + "crop_stop": 1024, + "hpf_start": -1, + "res_type": "kaiser_fast" + } + }, + "sr": 32000, + "pre_filter_start": 1000, + "pre_filter_stop": 1021 +} \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr33075_hl384.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr33075_hl384.json new file mode 100644 index 0000000..55666ac --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr33075_hl384.json @@ -0,0 +1,19 @@ +{ + "bins": 1024, + "unstable_bins": 0, + "reduction_bins": 
0, + "band": { + "1": { + "sr": 33075, + "hl": 384, + "n_fft": 2048, + "crop_start": 0, + "crop_stop": 1024, + "hpf_start": -1, + "res_type": "sinc_best" + } + }, + "sr": 33075, + "pre_filter_start": 1000, + "pre_filter_stop": 1021 +} \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl1024.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl1024.json new file mode 100644 index 0000000..665abe2 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl1024.json @@ -0,0 +1,19 @@ +{ + "bins": 1024, + "unstable_bins": 0, + "reduction_bins": 0, + "band": { + "1": { + "sr": 44100, + "hl": 1024, + "n_fft": 2048, + "crop_start": 0, + "crop_stop": 1024, + "hpf_start": -1, + "res_type": "sinc_best" + } + }, + "sr": 44100, + "pre_filter_start": 1023, + "pre_filter_stop": 1024 +} \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl256.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl256.json new file mode 100644 index 0000000..0e8b16f --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl256.json @@ -0,0 +1,19 @@ +{ + "bins": 256, + "unstable_bins": 0, + "reduction_bins": 0, + "band": { + "1": { + "sr": 44100, + "hl": 256, + "n_fft": 512, + "crop_start": 0, + "crop_stop": 256, + "hpf_start": -1, + "res_type": "sinc_best" + } + }, + "sr": 44100, + "pre_filter_start": 256, + "pre_filter_stop": 256 +} \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl512.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl512.json new file mode 100644 index 0000000..3b38fca --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl512.json @@ -0,0 +1,19 @@ +{ + "bins": 1024, + "unstable_bins": 0, + "reduction_bins": 0, + "band": { + "1": { + "sr": 44100, + "hl": 512, + "n_fft": 2048, + "crop_start": 0, + "crop_stop": 1024, + "hpf_start": -1, + "res_type": "sinc_best" + } + }, + "sr": 44100, + "pre_filter_start": 1023, + "pre_filter_stop": 1024 +} \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl512_cut.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl512_cut.json new file mode 100644 index 0000000..630df35 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl512_cut.json @@ -0,0 +1,19 @@ +{ + "bins": 1024, + "unstable_bins": 0, + "reduction_bins": 0, + "band": { + "1": { + "sr": 44100, + "hl": 512, + "n_fft": 2048, + "crop_start": 0, + "crop_stop": 700, + "hpf_start": -1, + "res_type": "sinc_best" + } + }, + "sr": 44100, + "pre_filter_start": 1023, + "pre_filter_stop": 700 +} \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl512_nf1024.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl512_nf1024.json new file mode 100644 index 0000000..120ef1a --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/1band_sr44100_hl512_nf1024.json @@ -0,0 +1,19 @@ +{ + "bins": 512, + "unstable_bins": 0, + "reduction_bins": 0, + "band": { + "1": { + "sr": 44100, + "hl": 512, + "n_fft": 1024, + "crop_start": 0, + "crop_stop": 512, + "hpf_start": -1, + "res_type": 
"sinc_best" + } + }, + "sr": 44100, + "pre_filter_start": 511, + "pre_filter_stop": 512 +} \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/2band_32000.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/2band_32000.json new file mode 100644 index 0000000..ab9cf11 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/2band_32000.json @@ -0,0 +1,30 @@ +{ + "bins": 768, + "unstable_bins": 7, + "reduction_bins": 705, + "band": { + "1": { + "sr": 6000, + "hl": 66, + "n_fft": 512, + "crop_start": 0, + "crop_stop": 240, + "lpf_start": 60, + "lpf_stop": 118, + "res_type": "sinc_fastest" + }, + "2": { + "sr": 32000, + "hl": 352, + "n_fft": 1024, + "crop_start": 22, + "crop_stop": 505, + "hpf_start": 44, + "hpf_stop": 23, + "res_type": "sinc_medium" + } + }, + "sr": 32000, + "pre_filter_start": 710, + "pre_filter_stop": 731 +} diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/2band_44100_lofi.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/2band_44100_lofi.json new file mode 100644 index 0000000..7faa216 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/2band_44100_lofi.json @@ -0,0 +1,30 @@ +{ + "bins": 512, + "unstable_bins": 7, + "reduction_bins": 510, + "band": { + "1": { + "sr": 11025, + "hl": 160, + "n_fft": 768, + "crop_start": 0, + "crop_stop": 192, + "lpf_start": 41, + "lpf_stop": 139, + "res_type": "sinc_fastest" + }, + "2": { + "sr": 44100, + "hl": 640, + "n_fft": 1024, + "crop_start": 10, + "crop_stop": 320, + "hpf_start": 47, + "hpf_stop": 15, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 510, + "pre_filter_stop": 512 +} diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/2band_48000.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/2band_48000.json new file mode 100644 index 0000000..be075f5 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/2band_48000.json @@ -0,0 +1,30 @@ +{ + "bins": 768, + "unstable_bins": 7, + "reduction_bins": 705, + "band": { + "1": { + "sr": 6000, + "hl": 66, + "n_fft": 512, + "crop_start": 0, + "crop_stop": 240, + "lpf_start": 60, + "lpf_stop": 240, + "res_type": "sinc_fastest" + }, + "2": { + "sr": 48000, + "hl": 528, + "n_fft": 1536, + "crop_start": 22, + "crop_stop": 505, + "hpf_start": 82, + "hpf_stop": 22, + "res_type": "sinc_medium" + } + }, + "sr": 48000, + "pre_filter_start": 710, + "pre_filter_stop": 731 +} \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/3band_44100.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/3band_44100.json new file mode 100644 index 0000000..d99e239 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/3band_44100.json @@ -0,0 +1,42 @@ +{ + "bins": 768, + "unstable_bins": 5, + "reduction_bins": 733, + "band": { + "1": { + "sr": 11025, + "hl": 128, + "n_fft": 768, + "crop_start": 0, + "crop_stop": 278, + "lpf_start": 28, + "lpf_stop": 140, + "res_type": "polyphase" + }, + "2": { + "sr": 22050, + "hl": 256, + "n_fft": 768, + "crop_start": 14, + "crop_stop": 322, + "hpf_start": 70, + "hpf_stop": 14, + "lpf_start": 283, + "lpf_stop": 314, + "res_type": "polyphase" + }, + "3": { + "sr": 44100, + "hl": 512, + "n_fft": 768, + "crop_start": 131, + "crop_stop": 313, + "hpf_start": 154, + "hpf_stop": 141, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 757, + 
"pre_filter_stop": 768 +} diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/3band_44100_mid.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/3band_44100_mid.json new file mode 100644 index 0000000..fc2c487 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/3band_44100_mid.json @@ -0,0 +1,43 @@ +{ + "mid_side": true, + "bins": 768, + "unstable_bins": 5, + "reduction_bins": 733, + "band": { + "1": { + "sr": 11025, + "hl": 128, + "n_fft": 768, + "crop_start": 0, + "crop_stop": 278, + "lpf_start": 28, + "lpf_stop": 140, + "res_type": "polyphase" + }, + "2": { + "sr": 22050, + "hl": 256, + "n_fft": 768, + "crop_start": 14, + "crop_stop": 322, + "hpf_start": 70, + "hpf_stop": 14, + "lpf_start": 283, + "lpf_stop": 314, + "res_type": "polyphase" + }, + "3": { + "sr": 44100, + "hl": 512, + "n_fft": 768, + "crop_start": 131, + "crop_stop": 313, + "hpf_start": 154, + "hpf_stop": 141, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 757, + "pre_filter_stop": 768 +} diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/3band_44100_msb2.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/3band_44100_msb2.json new file mode 100644 index 0000000..33b0877 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/3band_44100_msb2.json @@ -0,0 +1,43 @@ +{ + "mid_side_b2": true, + "bins": 640, + "unstable_bins": 7, + "reduction_bins": 565, + "band": { + "1": { + "sr": 11025, + "hl": 108, + "n_fft": 1024, + "crop_start": 0, + "crop_stop": 187, + "lpf_start": 92, + "lpf_stop": 186, + "res_type": "polyphase" + }, + "2": { + "sr": 22050, + "hl": 216, + "n_fft": 768, + "crop_start": 0, + "crop_stop": 212, + "hpf_start": 68, + "hpf_stop": 34, + "lpf_start": 174, + "lpf_stop": 209, + "res_type": "polyphase" + }, + "3": { + "sr": 44100, + "hl": 432, + "n_fft": 640, + "crop_start": 66, + "crop_stop": 307, + "hpf_start": 86, + "hpf_stop": 72, + "res_type": "kaiser_fast" + } + }, + "sr": 44100, + "pre_filter_start": 639, + "pre_filter_stop": 640 +} diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100.json new file mode 100644 index 0000000..4ae850a --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100.json @@ -0,0 +1,54 @@ +{ + "bins": 768, + "unstable_bins": 7, + "reduction_bins": 668, + "band": { + "1": { + "sr": 11025, + "hl": 128, + "n_fft": 1024, + "crop_start": 0, + "crop_stop": 186, + "lpf_start": 37, + "lpf_stop": 73, + "res_type": "polyphase" + }, + "2": { + "sr": 11025, + "hl": 128, + "n_fft": 512, + "crop_start": 4, + "crop_stop": 185, + "hpf_start": 36, + "hpf_stop": 18, + "lpf_start": 93, + "lpf_stop": 185, + "res_type": "polyphase" + }, + "3": { + "sr": 22050, + "hl": 256, + "n_fft": 512, + "crop_start": 46, + "crop_stop": 186, + "hpf_start": 93, + "hpf_stop": 46, + "lpf_start": 164, + "lpf_stop": 186, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 512, + "n_fft": 768, + "crop_start": 121, + "crop_stop": 382, + "hpf_start": 138, + "hpf_stop": 123, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 740, + "pre_filter_stop": 768 +} diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100_mid.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100_mid.json new file mode 100644 index 0000000..6346701 --- /dev/null +++ 
b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100_mid.json @@ -0,0 +1,55 @@ +{ + "bins": 768, + "unstable_bins": 7, + "mid_side": true, + "reduction_bins": 668, + "band": { + "1": { + "sr": 11025, + "hl": 128, + "n_fft": 1024, + "crop_start": 0, + "crop_stop": 186, + "lpf_start": 37, + "lpf_stop": 73, + "res_type": "polyphase" + }, + "2": { + "sr": 11025, + "hl": 128, + "n_fft": 512, + "crop_start": 4, + "crop_stop": 185, + "hpf_start": 36, + "hpf_stop": 18, + "lpf_start": 93, + "lpf_stop": 185, + "res_type": "polyphase" + }, + "3": { + "sr": 22050, + "hl": 256, + "n_fft": 512, + "crop_start": 46, + "crop_stop": 186, + "hpf_start": 93, + "hpf_stop": 46, + "lpf_start": 164, + "lpf_stop": 186, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 512, + "n_fft": 768, + "crop_start": 121, + "crop_stop": 382, + "hpf_start": 138, + "hpf_stop": 123, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 740, + "pre_filter_stop": 768 +} diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100_msb.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100_msb.json new file mode 100644 index 0000000..0bf4771 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100_msb.json @@ -0,0 +1,55 @@ +{ + "mid_side_b": true, + "bins": 768, + "unstable_bins": 7, + "reduction_bins": 668, + "band": { + "1": { + "sr": 11025, + "hl": 128, + "n_fft": 1024, + "crop_start": 0, + "crop_stop": 186, + "lpf_start": 37, + "lpf_stop": 73, + "res_type": "polyphase" + }, + "2": { + "sr": 11025, + "hl": 128, + "n_fft": 512, + "crop_start": 4, + "crop_stop": 185, + "hpf_start": 36, + "hpf_stop": 18, + "lpf_start": 93, + "lpf_stop": 185, + "res_type": "polyphase" + }, + "3": { + "sr": 22050, + "hl": 256, + "n_fft": 512, + "crop_start": 46, + "crop_stop": 186, + "hpf_start": 93, + "hpf_stop": 46, + "lpf_start": 164, + "lpf_stop": 186, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 512, + "n_fft": 768, + "crop_start": 121, + "crop_stop": 382, + "hpf_start": 138, + "hpf_stop": 123, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 740, + "pre_filter_stop": 768 +} \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100_msb2.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100_msb2.json new file mode 100644 index 0000000..0bf4771 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100_msb2.json @@ -0,0 +1,55 @@ +{ + "mid_side_b": true, + "bins": 768, + "unstable_bins": 7, + "reduction_bins": 668, + "band": { + "1": { + "sr": 11025, + "hl": 128, + "n_fft": 1024, + "crop_start": 0, + "crop_stop": 186, + "lpf_start": 37, + "lpf_stop": 73, + "res_type": "polyphase" + }, + "2": { + "sr": 11025, + "hl": 128, + "n_fft": 512, + "crop_start": 4, + "crop_stop": 185, + "hpf_start": 36, + "hpf_stop": 18, + "lpf_start": 93, + "lpf_stop": 185, + "res_type": "polyphase" + }, + "3": { + "sr": 22050, + "hl": 256, + "n_fft": 512, + "crop_start": 46, + "crop_stop": 186, + "hpf_start": 93, + "hpf_stop": 46, + "lpf_start": 164, + "lpf_stop": 186, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 512, + "n_fft": 768, + "crop_start": 121, + "crop_stop": 382, + "hpf_start": 138, + "hpf_stop": 123, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 740, + "pre_filter_stop": 768 +} \ No newline at end of file diff --git 
a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100_reverse.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100_reverse.json new file mode 100644 index 0000000..779a1c9 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100_reverse.json @@ -0,0 +1,55 @@ +{ + "reverse": true, + "bins": 768, + "unstable_bins": 7, + "reduction_bins": 668, + "band": { + "1": { + "sr": 11025, + "hl": 128, + "n_fft": 1024, + "crop_start": 0, + "crop_stop": 186, + "lpf_start": 37, + "lpf_stop": 73, + "res_type": "polyphase" + }, + "2": { + "sr": 11025, + "hl": 128, + "n_fft": 512, + "crop_start": 4, + "crop_stop": 185, + "hpf_start": 36, + "hpf_stop": 18, + "lpf_start": 93, + "lpf_stop": 185, + "res_type": "polyphase" + }, + "3": { + "sr": 22050, + "hl": 256, + "n_fft": 512, + "crop_start": 46, + "crop_stop": 186, + "hpf_start": 93, + "hpf_stop": 46, + "lpf_start": 164, + "lpf_stop": 186, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 512, + "n_fft": 768, + "crop_start": 121, + "crop_stop": 382, + "hpf_start": 138, + "hpf_stop": 123, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 740, + "pre_filter_stop": 768 +} \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100_sw.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100_sw.json new file mode 100644 index 0000000..1fefd4a --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_44100_sw.json @@ -0,0 +1,55 @@ +{ + "stereo_w": true, + "bins": 768, + "unstable_bins": 7, + "reduction_bins": 668, + "band": { + "1": { + "sr": 11025, + "hl": 128, + "n_fft": 1024, + "crop_start": 0, + "crop_stop": 186, + "lpf_start": 37, + "lpf_stop": 73, + "res_type": "polyphase" + }, + "2": { + "sr": 11025, + "hl": 128, + "n_fft": 512, + "crop_start": 4, + "crop_stop": 185, + "hpf_start": 36, + "hpf_stop": 18, + "lpf_start": 93, + "lpf_stop": 185, + "res_type": "polyphase" + }, + "3": { + "sr": 22050, + "hl": 256, + "n_fft": 512, + "crop_start": 46, + "crop_stop": 186, + "hpf_start": 93, + "hpf_stop": 46, + "lpf_start": 164, + "lpf_stop": 186, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 512, + "n_fft": 768, + "crop_start": 121, + "crop_stop": 382, + "hpf_start": 138, + "hpf_stop": 123, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 740, + "pre_filter_stop": 768 +} \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_v2.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_v2.json new file mode 100644 index 0000000..af79810 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_v2.json @@ -0,0 +1,54 @@ +{ + "bins": 672, + "unstable_bins": 8, + "reduction_bins": 637, + "band": { + "1": { + "sr": 7350, + "hl": 80, + "n_fft": 640, + "crop_start": 0, + "crop_stop": 85, + "lpf_start": 25, + "lpf_stop": 53, + "res_type": "polyphase" + }, + "2": { + "sr": 7350, + "hl": 80, + "n_fft": 320, + "crop_start": 4, + "crop_stop": 87, + "hpf_start": 25, + "hpf_stop": 12, + "lpf_start": 31, + "lpf_stop": 62, + "res_type": "polyphase" + }, + "3": { + "sr": 14700, + "hl": 160, + "n_fft": 512, + "crop_start": 17, + "crop_stop": 216, + "hpf_start": 48, + "hpf_stop": 24, + "lpf_start": 139, + "lpf_stop": 210, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 480, + "n_fft": 960, + "crop_start": 78, + 
"crop_stop": 383, + "hpf_start": 130, + "hpf_stop": 86, + "res_type": "kaiser_fast" + } + }, + "sr": 44100, + "pre_filter_start": 668, + "pre_filter_stop": 672 +} \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_v2_sn.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_v2_sn.json new file mode 100644 index 0000000..319b998 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_v2_sn.json @@ -0,0 +1,55 @@ +{ + "bins": 672, + "unstable_bins": 8, + "reduction_bins": 637, + "band": { + "1": { + "sr": 7350, + "hl": 80, + "n_fft": 640, + "crop_start": 0, + "crop_stop": 85, + "lpf_start": 25, + "lpf_stop": 53, + "res_type": "polyphase" + }, + "2": { + "sr": 7350, + "hl": 80, + "n_fft": 320, + "crop_start": 4, + "crop_stop": 87, + "hpf_start": 25, + "hpf_stop": 12, + "lpf_start": 31, + "lpf_stop": 62, + "res_type": "polyphase" + }, + "3": { + "sr": 14700, + "hl": 160, + "n_fft": 512, + "crop_start": 17, + "crop_stop": 216, + "hpf_start": 48, + "hpf_stop": 24, + "lpf_start": 139, + "lpf_stop": 210, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 480, + "n_fft": 960, + "crop_start": 78, + "crop_stop": 383, + "hpf_start": 130, + "hpf_stop": 86, + "convert_channels": "stereo_n", + "res_type": "kaiser_fast" + } + }, + "sr": 44100, + "pre_filter_start": 668, + "pre_filter_stop": 672 +} \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_v3.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_v3.json new file mode 100644 index 0000000..2a73bc9 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_v3.json @@ -0,0 +1,54 @@ +{ + "bins": 672, + "unstable_bins": 8, + "reduction_bins": 530, + "band": { + "1": { + "sr": 7350, + "hl": 80, + "n_fft": 640, + "crop_start": 0, + "crop_stop": 85, + "lpf_start": 25, + "lpf_stop": 53, + "res_type": "polyphase" + }, + "2": { + "sr": 7350, + "hl": 80, + "n_fft": 320, + "crop_start": 4, + "crop_stop": 87, + "hpf_start": 25, + "hpf_stop": 12, + "lpf_start": 31, + "lpf_stop": 62, + "res_type": "polyphase" + }, + "3": { + "sr": 14700, + "hl": 160, + "n_fft": 512, + "crop_start": 17, + "crop_stop": 216, + "hpf_start": 48, + "hpf_stop": 24, + "lpf_start": 139, + "lpf_stop": 210, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 480, + "n_fft": 960, + "crop_start": 78, + "crop_stop": 383, + "hpf_start": 130, + "hpf_stop": 86, + "res_type": "kaiser_fast" + } + }, + "sr": 44100, + "pre_filter_start": 668, + "pre_filter_stop": 672 +} \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_v3_sn.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_v3_sn.json new file mode 100644 index 0000000..6680a06 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/4band_v3_sn.json @@ -0,0 +1,55 @@ +{ + "n_bins": 672, + "unstable_bins": 8, + "stable_bins": 530, + "band": { + "1": { + "sr": 7350, + "hl": 80, + "n_fft": 640, + "crop_start": 0, + "crop_stop": 85, + "lpf_start": 25, + "lpf_stop": 53, + "res_type": "polyphase" + }, + "2": { + "sr": 7350, + "hl": 80, + "n_fft": 320, + "crop_start": 4, + "crop_stop": 87, + "hpf_start": 25, + "hpf_stop": 12, + "lpf_start": 31, + "lpf_stop": 62, + "res_type": "polyphase" + }, + "3": { + "sr": 14700, + "hl": 160, + "n_fft": 512, + "crop_start": 17, + "crop_stop": 216, + "hpf_start": 48, + "hpf_stop": 24, + 
"lpf_start": 139, + "lpf_stop": 210, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 480, + "n_fft": 960, + "crop_start": 78, + "crop_stop": 383, + "hpf_start": 130, + "hpf_stop": 86, + "convert_channels": "stereo_n", + "res_type": "kaiser_fast" + } + }, + "sr": 44100, + "pre_filter_start": 668, + "pre_filter_stop": 672 +} \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/ensemble.json b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/ensemble.json new file mode 100644 index 0000000..ca96bf1 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/modelparams/ensemble.json @@ -0,0 +1,43 @@ +{ + "mid_side_b2": true, + "bins": 1280, + "unstable_bins": 7, + "reduction_bins": 565, + "band": { + "1": { + "sr": 11025, + "hl": 108, + "n_fft": 2048, + "crop_start": 0, + "crop_stop": 374, + "lpf_start": 92, + "lpf_stop": 186, + "res_type": "polyphase" + }, + "2": { + "sr": 22050, + "hl": 216, + "n_fft": 1536, + "crop_start": 0, + "crop_stop": 424, + "hpf_start": 68, + "hpf_stop": 34, + "lpf_start": 348, + "lpf_stop": 418, + "res_type": "polyphase" + }, + "3": { + "sr": 44100, + "hl": 432, + "n_fft": 1280, + "crop_start": 132, + "crop_stop": 614, + "hpf_start": 172, + "hpf_stop": 144, + "res_type": "polyphase" + } + }, + "sr": 44100, + "pre_filter_start": 1280, + "pre_filter_stop": 1280 +} \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/nets.py b/audio_separator/separator/uvr_lib_v5/vr_network/nets.py new file mode 100644 index 0000000..5e1cfbc --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/nets.py @@ -0,0 +1,175 @@ +import torch +from torch import nn +import torch.nn.functional as F + +from . import layers + + +class BaseASPPNet(nn.Module): + """ + BaseASPPNet Class: + This class defines the base architecture for an Atrous Spatial Pyramid Pooling (ASPP) network. + It is designed to extract features from input data at multiple scales by using dilated convolutions. + This is particularly useful for tasks that benefit from understanding context at different resolutions, + such as semantic segmentation. The network consists of a series of encoder layers for downsampling and feature extraction, + followed by an ASPP module for multi-scale feature extraction, and finally a series of decoder layers for upsampling. + """ + + def __init__(self, nn_architecture, nin, ch, dilations=(4, 8, 16)): + super(BaseASPPNet, self).__init__() + self.nn_architecture = nn_architecture + + # Encoder layers progressively increase the number of channels while reducing spatial dimensions. + self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) + self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) + self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) + self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) + + # Depending on the network architecture, an additional encoder layer and a specific ASPP module are initialized. + if self.nn_architecture == 129605: + self.enc5 = layers.Encoder(ch * 8, ch * 16, 3, 2, 1) + self.aspp = layers.ASPPModule(nn_architecture, ch * 16, ch * 32, dilations) + self.dec5 = layers.Decoder(ch * (16 + 32), ch * 16, 3, 1, 1) + else: + self.aspp = layers.ASPPModule(nn_architecture, ch * 8, ch * 16, dilations) + + # Decoder layers progressively decrease the number of channels while increasing spatial dimensions. 
+ self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) + self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) + self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) + self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) + + def __call__(self, input_tensor): + # The input tensor is passed through a series of encoder layers. + hidden_state, encoder_output1 = self.enc1(input_tensor) + hidden_state, encoder_output2 = self.enc2(hidden_state) + hidden_state, encoder_output3 = self.enc3(hidden_state) + hidden_state, encoder_output4 = self.enc4(hidden_state) + + # Depending on the network architecture, the hidden state is processed by an additional encoder layer and the ASPP module. + if self.nn_architecture == 129605: + hidden_state, encoder_output5 = self.enc5(hidden_state) + hidden_state = self.aspp(hidden_state) + # The decoder layers use skip connections from the encoder layers for better feature integration. + hidden_state = self.dec5(hidden_state, encoder_output5) + else: + hidden_state = self.aspp(hidden_state) + + # The hidden state is further processed by the decoder layers, using skip connections for feature integration. + hidden_state = self.dec4(hidden_state, encoder_output4) + hidden_state = self.dec3(hidden_state, encoder_output3) + hidden_state = self.dec2(hidden_state, encoder_output2) + hidden_state = self.dec1(hidden_state, encoder_output1) + + return hidden_state + + +def determine_model_capacity(n_fft_bins, nn_architecture): + """ + The determine_model_capacity function is designed to select the appropriate model configuration + based on the frequency bins and network architecture. It maps specific architectures to predefined + model capacities, which dictate the structure and parameters of the CascadedASPPNet model. + """ + + # Predefined model architectures categorized by their precision level. + sp_model_arch = [31191, 33966, 129605] + hp_model_arch = [123821, 123812] + hp2_model_arch = [537238, 537227] + + # Mapping network architectures to their corresponding model capacity data. + if nn_architecture in sp_model_arch: + model_capacity_data = [(2, 16), (2, 16), (18, 8, 1, 1, 0), (8, 16), (34, 16, 1, 1, 0), (16, 32), (32, 2, 1), (16, 2, 1), (16, 2, 1)] + + if nn_architecture in hp_model_arch: + model_capacity_data = [(2, 32), (2, 32), (34, 16, 1, 1, 0), (16, 32), (66, 32, 1, 1, 0), (32, 64), (64, 2, 1), (32, 2, 1), (32, 2, 1)] + + if nn_architecture in hp2_model_arch: + model_capacity_data = [(2, 64), (2, 64), (66, 32, 1, 1, 0), (32, 64), (130, 64, 1, 1, 0), (64, 128), (128, 2, 1), (64, 2, 1), (64, 2, 1)] + + # Initializing the CascadedASPPNet model with the selected model capacity data. + cascaded = CascadedASPPNet + model = cascaded(n_fft_bins, model_capacity_data, nn_architecture) + + return model + + +class CascadedASPPNet(nn.Module): + """ + CascadedASPPNet Class: + This class implements a cascaded version of the ASPP network, designed for processing audio signals + for tasks such as vocal removal. It consists of multiple stages, each with its own ASPP network, + to process different frequency bands of the input signal. This allows the model to effectively + handle the full spectrum of audio frequencies by focusing on different frequency bands separately. + """ + + def __init__(self, n_fft, model_capacity_data, nn_architecture): + super(CascadedASPPNet, self).__init__() + # The first stage processes the low and high frequency bands separately. 
+ self.stg1_low_band_net = BaseASPPNet(nn_architecture, *model_capacity_data[0]) + self.stg1_high_band_net = BaseASPPNet(nn_architecture, *model_capacity_data[1]) + + # Bridge layers connect different stages of the network. + self.stg2_bridge = layers.Conv2DBNActiv(*model_capacity_data[2]) + self.stg2_full_band_net = BaseASPPNet(nn_architecture, *model_capacity_data[3]) + + self.stg3_bridge = layers.Conv2DBNActiv(*model_capacity_data[4]) + self.stg3_full_band_net = BaseASPPNet(nn_architecture, *model_capacity_data[5]) + + # Output layers for the final mask prediction and auxiliary outputs. + self.out = nn.Conv2d(*model_capacity_data[6], bias=False) + self.aux1_out = nn.Conv2d(*model_capacity_data[7], bias=False) + self.aux2_out = nn.Conv2d(*model_capacity_data[8], bias=False) + + # Parameters for handling the frequency bins of the input signal. + self.max_bin = n_fft // 2 + self.output_bin = n_fft // 2 + 1 + + self.offset = 128 + + def forward(self, input_tensor): + # The forward pass processes the input tensor through each stage of the network, + # combining the outputs of different frequency bands and stages to produce the final mask. + mix = input_tensor.detach() + input_tensor = input_tensor.clone() + + # Preparing the input tensor by cropping it to the maximum frequency bin. + input_tensor = input_tensor[:, :, : self.max_bin] + + # Processing the low and high frequency bands separately in the first stage. + bandwidth = input_tensor.size()[2] // 2 + aux1 = torch.cat([self.stg1_low_band_net(input_tensor[:, :, :bandwidth]), self.stg1_high_band_net(input_tensor[:, :, bandwidth:])], dim=2) + + # Combining the outputs of the first stage and passing through the second stage. + hidden_state = torch.cat([input_tensor, aux1], dim=1) + aux2 = self.stg2_full_band_net(self.stg2_bridge(hidden_state)) + + # Further processing the combined outputs through the third stage. + hidden_state = torch.cat([input_tensor, aux1, aux2], dim=1) + hidden_state = self.stg3_full_band_net(self.stg3_bridge(hidden_state)) + + # Applying the final output layer to produce the mask. + mask = torch.sigmoid(self.out(hidden_state)) + + # Padding the mask to match the output frequency bin size. + mask = F.pad(input=mask, pad=(0, 0, 0, self.output_bin - mask.size()[2]), mode="replicate") + + # During training, auxiliary outputs are also produced and padded accordingly. + if self.training: + aux1 = torch.sigmoid(self.aux1_out(aux1)) + aux1 = F.pad(input=aux1, pad=(0, 0, 0, self.output_bin - aux1.size()[2]), mode="replicate") + aux2 = torch.sigmoid(self.aux2_out(aux2)) + aux2 = F.pad(input=aux2, pad=(0, 0, 0, self.output_bin - aux2.size()[2]), mode="replicate") + return mask * mix, aux1 * mix, aux2 * mix + else: + return mask # * mix + + def predict_mask(self, input_tensor): + # This method predicts the mask for the input tensor by calling the forward method + # and applying any necessary padding adjustments. + mask = self.forward(input_tensor) + + # Adjusting the mask by removing padding offsets if present. + if self.offset > 0: + mask = mask[:, :, :, self.offset : -self.offset] + + return mask diff --git a/audio_separator/separator/uvr_lib_v5/vr_network/nets_new.py b/audio_separator/separator/uvr_lib_v5/vr_network/nets_new.py new file mode 100644 index 0000000..f49065f --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/vr_network/nets_new.py @@ -0,0 +1,160 @@ +import torch +from torch import nn +import torch.nn.functional as F +from .
import layers_new as layers + + +class BaseNet(nn.Module): + """ + BaseNet Class: + This class defines the base network architecture for vocal removal. It includes a series of encoders for feature extraction, + an ASPP module for capturing multi-scale context, and a series of decoders for reconstructing the output. Additionally, + it incorporates an LSTM module for capturing temporal dependencies. + """ + + def __init__(self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6))): + super(BaseNet, self).__init__() + # Initialize the encoder layers with increasing output channels for hierarchical feature extraction. + self.enc1 = layers.Conv2DBNActiv(nin, nout, 3, 1, 1) + self.enc2 = layers.Encoder(nout, nout * 2, 3, 2, 1) + self.enc3 = layers.Encoder(nout * 2, nout * 4, 3, 2, 1) + self.enc4 = layers.Encoder(nout * 4, nout * 6, 3, 2, 1) + self.enc5 = layers.Encoder(nout * 6, nout * 8, 3, 2, 1) + + # ASPP module for capturing multi-scale features with different dilation rates. + self.aspp = layers.ASPPModule(nout * 8, nout * 8, dilations, dropout=True) + + # Decoder layers for upscaling and merging features from different levels of the encoder and ASPP module. + self.dec4 = layers.Decoder(nout * (6 + 8), nout * 6, 3, 1, 1) + self.dec3 = layers.Decoder(nout * (4 + 6), nout * 4, 3, 1, 1) + self.dec2 = layers.Decoder(nout * (2 + 4), nout * 2, 3, 1, 1) + + # LSTM module for capturing temporal dependencies in the sequence of features. + self.lstm_dec2 = layers.LSTMModule(nout * 2, nin_lstm, nout_lstm) + self.dec1 = layers.Decoder(nout * (1 + 2) + 1, nout * 1, 3, 1, 1) + + def __call__(self, input_tensor): + # Sequentially pass the input through the encoder layers. + encoded1 = self.enc1(input_tensor) + encoded2 = self.enc2(encoded1) + encoded3 = self.enc3(encoded2) + encoded4 = self.enc4(encoded3) + encoded5 = self.enc5(encoded4) + + # Pass the deepest encoder output through the ASPP module. + bottleneck = self.aspp(encoded5) + + # Sequentially upscale and merge the features using the decoder layers. + bottleneck = self.dec4(bottleneck, encoded4) + bottleneck = self.dec3(bottleneck, encoded3) + bottleneck = self.dec2(bottleneck, encoded2) + # Concatenate the LSTM module output for temporal feature enhancement. + bottleneck = torch.cat([bottleneck, self.lstm_dec2(bottleneck)], dim=1) + bottleneck = self.dec1(bottleneck, encoded1) + + return bottleneck + + +class CascadedNet(nn.Module): + """ + CascadedNet Class: + This class defines a cascaded network architecture that processes input in multiple stages, each stage focusing on different frequency bands. + It utilizes the BaseNet for processing, and combines outputs from different stages to produce the final mask for vocal removal. + """ + + def __init__(self, n_fft, nn_arch_size=51000, nout=32, nout_lstm=128): + super(CascadedNet, self).__init__() + # Calculate frequency bins based on FFT size. + self.max_bin = n_fft // 2 + self.output_bin = n_fft // 2 + 1 + self.nin_lstm = self.max_bin // 2 + self.offset = 64 + # Adjust output channels based on the architecture size. + nout = 64 if nn_arch_size == 218409 else nout + + # print(nout, nout_lstm, n_fft) + + # Initialize the network stages, each focusing on different frequency bands and progressively refining the output. 
+ self.stg1_low_band_net = nn.Sequential(BaseNet(2, nout // 2, self.nin_lstm // 2, nout_lstm), layers.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0)) + self.stg1_high_band_net = BaseNet(2, nout // 4, self.nin_lstm // 2, nout_lstm // 2) + + self.stg2_low_band_net = nn.Sequential(BaseNet(nout // 4 + 2, nout, self.nin_lstm // 2, nout_lstm), layers.Conv2DBNActiv(nout, nout // 2, 1, 1, 0)) + self.stg2_high_band_net = BaseNet(nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2) + + self.stg3_full_band_net = BaseNet(3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm) + + # Output layer for generating the final mask. + self.out = nn.Conv2d(nout, 2, 1, bias=False) + # Auxiliary output layer for intermediate supervision during training. + self.aux_out = nn.Conv2d(3 * nout // 4, 2, 1, bias=False) + + def forward(self, input_tensor): + # Preprocess input tensor to match the maximum frequency bin. + input_tensor = input_tensor[:, :, : self.max_bin] + + # Split the input into low and high frequency bands. + bandw = input_tensor.size()[2] // 2 + l1_in = input_tensor[:, :, :bandw] + h1_in = input_tensor[:, :, bandw:] + + # Process each band through the first stage networks. + l1 = self.stg1_low_band_net(l1_in) + h1 = self.stg1_high_band_net(h1_in) + + # Combine the outputs for auxiliary supervision. + aux1 = torch.cat([l1, h1], dim=2) + + # Prepare inputs for the second stage by concatenating the original and processed bands. + l2_in = torch.cat([l1_in, l1], dim=1) + h2_in = torch.cat([h1_in, h1], dim=1) + + # Process through the second stage networks. + l2 = self.stg2_low_band_net(l2_in) + h2 = self.stg2_high_band_net(h2_in) + + # Combine the outputs for auxiliary supervision. + aux2 = torch.cat([l2, h2], dim=2) + + # Prepare input for the third stage by concatenating all previous outputs with the original input. + f3_in = torch.cat([input_tensor, aux1, aux2], dim=1) + + # Process through the third stage network. + f3 = self.stg3_full_band_net(f3_in) + + # Apply the output layer to generate the final mask and apply sigmoid for normalization. + mask = torch.sigmoid(self.out(f3)) + + # Pad the mask to match the output frequency bin size. + mask = F.pad(input=mask, pad=(0, 0, 0, self.output_bin - mask.size()[2]), mode="replicate") + + # During training, generate and pad the auxiliary output for additional supervision. + if self.training: + aux = torch.cat([aux1, aux2], dim=1) + aux = torch.sigmoid(self.aux_out(aux)) + aux = F.pad(input=aux, pad=(0, 0, 0, self.output_bin - aux.size()[2]), mode="replicate") + return mask, aux + else: + return mask + + # Method for predicting the mask given an input tensor. + def predict_mask(self, input_tensor): + mask = self.forward(input_tensor) + + # If an offset is specified, crop the mask to remove edge artifacts. + if self.offset > 0: + mask = mask[:, :, :, self.offset : -self.offset] + assert mask.size()[3] > 0 + + return mask + + # Method for applying the predicted mask to the input tensor to obtain the predicted magnitude. + def predict(self, input_tensor): + mask = self.forward(input_tensor) + pred_mag = input_tensor * mask + + # If an offset is specified, crop the predicted magnitude to remove edge artifacts.
+ if self.offset > 0: + pred_mag = pred_mag[:, :, :, self.offset : -self.offset] + assert pred_mag.size()[3] > 0 + + return pred_mag diff --git a/audio_separator/utils/cli.py b/audio_separator/utils/cli.py index 060e7cc..c8b4a35 100755 --- a/audio_separator/utils/cli.py +++ b/audio_separator/utils/cli.py @@ -1,6 +1,7 @@ #!/usr/bin/env python import argparse import logging +import json from importlib import metadata @@ -11,106 +12,59 @@ def main(): log_handler.setFormatter(log_formatter) logger.addHandler(log_handler) - parser = argparse.ArgumentParser( - description="Separate audio file into different stems.", - formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, max_help_position=45), - ) + parser = argparse.ArgumentParser(description="Separate audio file into different stems.", formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, max_help_position=45)) parser.add_argument("audio_file", nargs="?", help="The audio file path to separate, in any common format.", default=argparse.SUPPRESS) package_version = metadata.distribution("audio-separator").version parser.add_argument("-v", "--version", action="version", version=f"%(prog)s {package_version}") - parser.add_argument( - "--log_level", - default="info", - help="Optional: logging level, e.g. info, debug, warning (default: %(default)s). Example: --log_level=debug", - ) + parser.add_argument("--log_level", default="info", help="Optional: logging level, e.g. info, debug, warning (default: %(default)s). Example: --log_level=debug") - parser.add_argument( - "--model_name", - default="UVR-MDX-NET-Inst_HQ_3", - help="Optional: model name to be used for separation (default: %(default)s). Example: --model_name=UVR_MDXNET_KARA_2", - ) + parser.add_argument("--list_models", action="store_true", help="List all supported models and exit.") parser.add_argument( - "--model_file_dir", - default="/tmp/audio-separator-models/", - help="Optional: model files directory (default: %(default)s). Example: --model_file_dir=/app/models", + "--model_filename", default="2_HP-UVR.pth", help="Optional: model filename to be used for separation (default: %(default)s). Example: --model_filename=UVR_MDXNET_KARA_2.onnx" ) - parser.add_argument( - "--output_dir", - default=None, - help="Optional: directory to write output files (default: ). Example: --output_dir=/app/separated", - ) + parser.add_argument("--model_file_dir", default="/tmp/audio-separator-models/", help="Optional: model files directory (default: %(default)s). Example: --model_file_dir=/app/models") - parser.add_argument( - "--output_format", - default="FLAC", - help="Optional: output format for separated files, any common format (default: %(default)s). Example: --output_format=MP3", - ) + parser.add_argument("--output_dir", default=None, help="Optional: directory to write output files (default: ). Example: --output_dir=/app/separated") - parser.add_argument( - "--denoise", - type=lambda x: (str(x).lower() == "true"), - default=False, - help="Optional: enable or disable denoising during separation (default: %(default)s). Example: --denoise=True", - ) + parser.add_argument("--output_format", default="FLAC", help="Optional: output format for separated files, any common format (default: %(default)s). Example: --output_format=MP3") parser.add_argument( - "--normalization_threshold", - type=float, - default=0.9, - help="Optional: max peak amplitude to normalize input and output audio to (default: %(default)s). 
Example: --normalization_threshold=0.7", + "--denoise", type=lambda x: (str(x).lower() == "true"), default=False, help="Optional: enable or disable denoising during separation (default: %(default)s). Example: --denoise=True" ) parser.add_argument( - "--single_stem", - default=None, - help="Optional: output only single stem, either instrumental or vocals. Example: --single_stem=instrumental", + "--normalization_threshold", type=float, default=0.9, help="Optional: max peak amplitude to normalize input and output audio to (default: %(default)s). Example: --normalization_threshold=0.7" ) - parser.add_argument( - "--invert_spect", - type=lambda x: (str(x).lower() == "true"), - default=False, - help="Optional: invert secondary stem using spectogram (default: %(default)s). Example: --invert_spect=True", - ) + parser.add_argument("--single_stem", default=None, help="Optional: output only single stem, either instrumental or vocals. Example: --single_stem=instrumental") parser.add_argument( - "--sample_rate", - type=int, - default=44100, - help="Optional: sample_rate (default: %(default)s). Example: --sample_rate=44100", + "--invert_spect", type=lambda x: (str(x).lower() == "true"), default=False, help="Optional: invert secondary stem using spectogram (default: %(default)s). Example: --invert_spect=True" ) - parser.add_argument( - "--hop_length", - type=int, - default=1024, - help="Optional: hop_length (default: %(default)s). Example: --hop_length=1024", - ) + parser.add_argument("--sample_rate", type=int, default=44100, help="Optional: sample_rate (default: %(default)s). Example: --sample_rate=44100") - parser.add_argument( - "--segment_size", - type=int, - default=256, - help="Optional: segment_size (default: %(default)s). Example: --segment_size=256", - ) + parser.add_argument("--mdx_hop_length", type=int, default=1024, help="Optional: mdx_hop_length (default: %(default)s). Example: --mdx_hop_length=1024") + parser.add_argument("--mdx_segment_size", type=int, default=256, help="Optional: mdx_segment_size (default: %(default)s). Example: --mdx_segment_size=256") + parser.add_argument("--mdx_overlap", type=float, default=0.25, help="Optional: mdx_overlap (default: %(default)s). Example: --mdx_overlap=0.25") + parser.add_argument("--mdx_batch_size", type=int, default=1, help="Optional: mdx_batch_size (default: %(default)s). Example: --mdx_batch_size=4") + + parser.add_argument("--vr_batch_size", type=int, default=4, help="Optional: vr_batch_size (default: %(default)s). Example: --vr_batch_size=16") + parser.add_argument("--vr_window_size", type=int, default=512, help="Optional: vr_window_size (default: %(default)s). Example: --vr_window_size=256") + parser.add_argument("--vr_aggression", type=int, default=5, help="Optional: vr_aggression (default: %(default)s). Example: --vr_aggression=2") + parser.add_argument("--vr_enable_tta", type=lambda x: (str(x).lower() == "true"), default=False, help="Optional: vr_enable_tta (default: %(default)s). Example: --vr_enable_tta=True") parser.add_argument( - "--overlap", - type=float, - default=0.25, - help="Optional: overlap (default: %(default)s). Example: --overlap=0.25", + "--vr_enable_post_process", type=lambda x: (str(x).lower() == "true"), default=False, help="Optional: vr_enable_post_process (default: %(default)s). Example: --vr_enable_post_process=True" ) - + parser.add_argument("--vr_post_process_threshold", type=float, default=0.2, help="Optional: vr_post_process_threshold (default: %(default)s). 
Example: --vr_post_process_threshold=0.1") parser.add_argument( - "--batch_size", - type=int, - default=1, - help="Optional: batch_size (default: %(default)s). Example: --batch_size=1", + "--vr_high_end_process", type=lambda x: (str(x).lower() == "true"), default=False, help="Optional: vr_high_end_process (default: %(default)s). Example: --vr_high_end_process=True" ) args = parser.parse_args() @@ -118,6 +72,13 @@ def main(): log_level = getattr(logging, args.log_level.upper()) logger.setLevel(log_level) + if args.list_models: + from audio_separator.separator import Separator + + separator = Separator() + print(json.dumps(separator.list_supported_model_files(), indent=4, sort_keys=True)) + exit(0) + if not hasattr(args, "audio_file"): parser.print_help() exit(1) @@ -133,18 +94,24 @@ def main(): model_file_dir=args.model_file_dir, output_dir=args.output_dir, output_format=args.output_format, - denoise_enabled=args.denoise, + enable_denoise=args.denoise, normalization_threshold=args.normalization_threshold, output_single_stem=args.single_stem, invert_using_spec=args.invert_spect, sample_rate=args.sample_rate, - hop_length=args.hop_length, - segment_size=args.segment_size, - overlap=args.overlap, - batch_size=args.batch_size, - ) - - separator.load_model(args.model_name) + mdx_params={"hop_length": args.mdx_hop_length, "segment_size": args.mdx_segment_size, "overlap": args.mdx_overlap, "batch_size": args.mdx_batch_size}, + vr_params={ + "batch_size": args.vr_batch_size, + "window_size": args.vr_window_size, + "aggression": args.vr_aggression, + "enable_tta": args.vr_enable_tta, + "enable_post_process": args.vr_enable_post_process, + "post_process_threshold": args.vr_post_process_threshold, + "high_end_process": args.vr_high_end_process, + }, + ) + + separator.load_model(args.model_filename) output_files = separator.separate(args.audio_file) diff --git a/poetry.lock b/poetry.lock index 30be610..ed0356b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -62,13 +62,13 @@ uvloop = ["uvloop (>=0.15.2)"] [[package]] name = "certifi" -version = "2023.11.17" +version = "2024.2.2" description = "Python package for providing Mozilla's CA Bundle." 
optional = false python-versions = ">=3.6" files = [ - {file = "certifi-2023.11.17-py3-none-any.whl", hash = "sha256:e036ab49d5b79556f99cfc2d9320b34cfbe5be05c5871b51de9329f0603b0474"}, - {file = "certifi-2023.11.17.tar.gz", hash = "sha256:9b469f3a900bf28dc19b8cfbf8019bf47f7fdd1a65a1d4ffb98fc14166beb4d1"}, + {file = "certifi-2024.2.2-py3-none-any.whl", hash = "sha256:dc383c07b76109f368f6106eee2b593b04a011ea4d55f652c6ca24a754d1cdd1"}, + {file = "certifi-2024.2.2.tar.gz", hash = "sha256:0569859f95fc761b18b45ef421b1290a0f65f147e92a1e5eb3e635f9a5e4e66f"}, ] [[package]] @@ -542,104 +542,101 @@ tests = ["matplotlib (>=3.3.0)", "packaging (>=20.0)", "pytest", "pytest-cov", " [[package]] name = "llvmlite" -version = "0.41.1" +version = "0.42.0" description = "lightweight wrapper around basic LLVM functionality" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "llvmlite-0.41.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c1e1029d47ee66d3a0c4d6088641882f75b93db82bd0e6178f7bd744ebce42b9"}, - {file = "llvmlite-0.41.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:150d0bc275a8ac664a705135e639178883293cf08c1a38de3bbaa2f693a0a867"}, - {file = "llvmlite-0.41.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1eee5cf17ec2b4198b509272cf300ee6577229d237c98cc6e63861b08463ddc6"}, - {file = "llvmlite-0.41.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0dd0338da625346538f1173a17cabf21d1e315cf387ca21b294ff209d176e244"}, - {file = "llvmlite-0.41.1-cp310-cp310-win32.whl", hash = "sha256:fa1469901a2e100c17eb8fe2678e34bd4255a3576d1a543421356e9c14d6e2ae"}, - {file = "llvmlite-0.41.1-cp310-cp310-win_amd64.whl", hash = "sha256:2b76acee82ea0e9304be6be9d4b3840208d050ea0dcad75b1635fa06e949a0ae"}, - {file = "llvmlite-0.41.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:210e458723436b2469d61b54b453474e09e12a94453c97ea3fbb0742ba5a83d8"}, - {file = "llvmlite-0.41.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:855f280e781d49e0640aef4c4af586831ade8f1a6c4df483fb901cbe1a48d127"}, - {file = "llvmlite-0.41.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b67340c62c93a11fae482910dc29163a50dff3dfa88bc874872d28ee604a83be"}, - {file = "llvmlite-0.41.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2181bb63ef3c607e6403813421b46982c3ac6bfc1f11fa16a13eaafb46f578e6"}, - {file = "llvmlite-0.41.1-cp311-cp311-win_amd64.whl", hash = "sha256:9564c19b31a0434f01d2025b06b44c7ed422f51e719ab5d24ff03b7560066c9a"}, - {file = "llvmlite-0.41.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5940bc901fb0325970415dbede82c0b7f3e35c2d5fd1d5e0047134c2c46b3281"}, - {file = "llvmlite-0.41.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:8b0a9a47c28f67a269bb62f6256e63cef28d3c5f13cbae4fab587c3ad506778b"}, - {file = "llvmlite-0.41.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f8afdfa6da33f0b4226af8e64cfc2b28986e005528fbf944d0a24a72acfc9432"}, - {file = "llvmlite-0.41.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8454c1133ef701e8c050a59edd85d238ee18bb9a0eb95faf2fca8b909ee3c89a"}, - {file = "llvmlite-0.41.1-cp38-cp38-win32.whl", hash = "sha256:2d92c51e6e9394d503033ffe3292f5bef1566ab73029ec853861f60ad5c925d0"}, - {file = "llvmlite-0.41.1-cp38-cp38-win_amd64.whl", hash = "sha256:df75594e5a4702b032684d5481db3af990b69c249ccb1d32687b8501f0689432"}, - {file = "llvmlite-0.41.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:04725975e5b2af416d685ea0769f4ecc33f97be541e301054c9f741003085802"}, - {file = "llvmlite-0.41.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:bf14aa0eb22b58c231243dccf7e7f42f7beec48970f2549b3a6acc737d1a4ba4"}, - {file = "llvmlite-0.41.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:92c32356f669e036eb01016e883b22add883c60739bc1ebee3a1cc0249a50828"}, - {file = "llvmlite-0.41.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:24091a6b31242bcdd56ae2dbea40007f462260bc9bdf947953acc39dffd54f8f"}, - {file = "llvmlite-0.41.1-cp39-cp39-win32.whl", hash = "sha256:880cb57ca49e862e1cd077104375b9d1dfdc0622596dfa22105f470d7bacb309"}, - {file = "llvmlite-0.41.1-cp39-cp39-win_amd64.whl", hash = "sha256:92f093986ab92e71c9ffe334c002f96defc7986efda18397d0f08534f3ebdc4d"}, - {file = "llvmlite-0.41.1.tar.gz", hash = "sha256:f19f767a018e6ec89608e1f6b13348fa2fcde657151137cb64e56d48598a92db"}, + {file = "llvmlite-0.42.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:3366938e1bf63d26c34fbfb4c8e8d2ded57d11e0567d5bb243d89aab1eb56098"}, + {file = "llvmlite-0.42.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c35da49666a21185d21b551fc3caf46a935d54d66969d32d72af109b5e7d2b6f"}, + {file = "llvmlite-0.42.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:70f44ccc3c6220bd23e0ba698a63ec2a7d3205da0d848804807f37fc243e3f77"}, + {file = "llvmlite-0.42.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:763f8d8717a9073b9e0246998de89929071d15b47f254c10eef2310b9aac033d"}, + {file = "llvmlite-0.42.0-cp310-cp310-win_amd64.whl", hash = "sha256:8d90edf400b4ceb3a0e776b6c6e4656d05c7187c439587e06f86afceb66d2be5"}, + {file = "llvmlite-0.42.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ae511caed28beaf1252dbaf5f40e663f533b79ceb408c874c01754cafabb9cbf"}, + {file = "llvmlite-0.42.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81e674c2fe85576e6c4474e8c7e7aba7901ac0196e864fe7985492b737dbab65"}, + {file = "llvmlite-0.42.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb3975787f13eb97629052edb5017f6c170eebc1c14a0433e8089e5db43bcce6"}, + {file = "llvmlite-0.42.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c5bece0cdf77f22379f19b1959ccd7aee518afa4afbd3656c6365865f84903f9"}, + {file = "llvmlite-0.42.0-cp311-cp311-win_amd64.whl", hash = "sha256:7e0c4c11c8c2aa9b0701f91b799cb9134a6a6de51444eff5a9087fc7c1384275"}, + {file = "llvmlite-0.42.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:08fa9ab02b0d0179c688a4216b8939138266519aaa0aa94f1195a8542faedb56"}, + {file = "llvmlite-0.42.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b2fce7d355068494d1e42202c7aff25d50c462584233013eb4470c33b995e3ee"}, + {file = "llvmlite-0.42.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ebe66a86dc44634b59a3bc860c7b20d26d9aaffcd30364ebe8ba79161a9121f4"}, + {file = "llvmlite-0.42.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d47494552559e00d81bfb836cf1c4d5a5062e54102cc5767d5aa1e77ccd2505c"}, + {file = "llvmlite-0.42.0-cp312-cp312-win_amd64.whl", hash = "sha256:05cb7e9b6ce69165ce4d1b994fbdedca0c62492e537b0cc86141b6e2c78d5888"}, + {file = "llvmlite-0.42.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bdd3888544538a94d7ec99e7c62a0cdd8833609c85f0c23fcb6c5c591aec60ad"}, + {file = "llvmlite-0.42.0-cp39-cp39-macosx_11_0_arm64.whl", hash = 
"sha256:d0936c2067a67fb8816c908d5457d63eba3e2b17e515c5fe00e5ee2bace06040"}, + {file = "llvmlite-0.42.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a78ab89f1924fc11482209f6799a7a3fc74ddc80425a7a3e0e8174af0e9e2301"}, + {file = "llvmlite-0.42.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d7599b65c7af7abbc978dbf345712c60fd596aa5670496561cc10e8a71cebfb2"}, + {file = "llvmlite-0.42.0-cp39-cp39-win_amd64.whl", hash = "sha256:43d65cc4e206c2e902c1004dd5418417c4efa6c1d04df05c6c5675a27e8ca90e"}, + {file = "llvmlite-0.42.0.tar.gz", hash = "sha256:f92b09243c0cc3f457da8b983f67bd8e1295d0f5b3746c7a1861d7a99403854a"}, ] [[package]] name = "markupsafe" -version = "2.1.4" +version = "2.1.5" description = "Safely add untrusted strings to HTML/XML markup." optional = false python-versions = ">=3.7" files = [ - {file = "MarkupSafe-2.1.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:de8153a7aae3835484ac168a9a9bdaa0c5eee4e0bc595503c95d53b942879c84"}, - {file = "MarkupSafe-2.1.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e888ff76ceb39601c59e219f281466c6d7e66bd375b4ec1ce83bcdc68306796b"}, - {file = "MarkupSafe-2.1.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0b838c37ba596fcbfca71651a104a611543077156cb0a26fe0c475e1f152ee8"}, - {file = "MarkupSafe-2.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dac1ebf6983148b45b5fa48593950f90ed6d1d26300604f321c74a9ca1609f8e"}, - {file = "MarkupSafe-2.1.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0fbad3d346df8f9d72622ac71b69565e621ada2ce6572f37c2eae8dacd60385d"}, - {file = "MarkupSafe-2.1.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d5291d98cd3ad9a562883468c690a2a238c4a6388ab3bd155b0c75dd55ece858"}, - {file = "MarkupSafe-2.1.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a7cc49ef48a3c7a0005a949f3c04f8baa5409d3f663a1b36f0eba9bfe2a0396e"}, - {file = "MarkupSafe-2.1.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b83041cda633871572f0d3c41dddd5582ad7d22f65a72eacd8d3d6d00291df26"}, - {file = "MarkupSafe-2.1.4-cp310-cp310-win32.whl", hash = "sha256:0c26f67b3fe27302d3a412b85ef696792c4a2386293c53ba683a89562f9399b0"}, - {file = "MarkupSafe-2.1.4-cp310-cp310-win_amd64.whl", hash = "sha256:a76055d5cb1c23485d7ddae533229039b850db711c554a12ea64a0fd8a0129e2"}, - {file = "MarkupSafe-2.1.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9e9e3c4020aa2dc62d5dd6743a69e399ce3de58320522948af6140ac959ab863"}, - {file = "MarkupSafe-2.1.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0042d6a9880b38e1dd9ff83146cc3c9c18a059b9360ceae207805567aacccc69"}, - {file = "MarkupSafe-2.1.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:55d03fea4c4e9fd0ad75dc2e7e2b6757b80c152c032ea1d1de487461d8140efc"}, - {file = "MarkupSafe-2.1.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ab3a886a237f6e9c9f4f7d272067e712cdb4efa774bef494dccad08f39d8ae6"}, - {file = "MarkupSafe-2.1.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:abf5ebbec056817057bfafc0445916bb688a255a5146f900445d081db08cbabb"}, - {file = "MarkupSafe-2.1.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e1a0d1924a5013d4f294087e00024ad25668234569289650929ab871231668e7"}, - {file = "MarkupSafe-2.1.4-cp311-cp311-musllinux_1_1_i686.whl", hash = 
"sha256:e7902211afd0af05fbadcc9a312e4cf10f27b779cf1323e78d52377ae4b72bea"}, - {file = "MarkupSafe-2.1.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c669391319973e49a7c6230c218a1e3044710bc1ce4c8e6eb71f7e6d43a2c131"}, - {file = "MarkupSafe-2.1.4-cp311-cp311-win32.whl", hash = "sha256:31f57d64c336b8ccb1966d156932f3daa4fee74176b0fdc48ef580be774aae74"}, - {file = "MarkupSafe-2.1.4-cp311-cp311-win_amd64.whl", hash = "sha256:54a7e1380dfece8847c71bf7e33da5d084e9b889c75eca19100ef98027bd9f56"}, - {file = "MarkupSafe-2.1.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:a76cd37d229fc385738bd1ce4cba2a121cf26b53864c1772694ad0ad348e509e"}, - {file = "MarkupSafe-2.1.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:987d13fe1d23e12a66ca2073b8d2e2a75cec2ecb8eab43ff5624ba0ad42764bc"}, - {file = "MarkupSafe-2.1.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5244324676254697fe5c181fc762284e2c5fceeb1c4e3e7f6aca2b6f107e60dc"}, - {file = "MarkupSafe-2.1.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78bc995e004681246e85e28e068111a4c3f35f34e6c62da1471e844ee1446250"}, - {file = "MarkupSafe-2.1.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a4d176cfdfde84f732c4a53109b293d05883e952bbba68b857ae446fa3119b4f"}, - {file = "MarkupSafe-2.1.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:f9917691f410a2e0897d1ef99619fd3f7dd503647c8ff2475bf90c3cf222ad74"}, - {file = "MarkupSafe-2.1.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:f06e5a9e99b7df44640767842f414ed5d7bedaaa78cd817ce04bbd6fd86e2dd6"}, - {file = "MarkupSafe-2.1.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:396549cea79e8ca4ba65525470d534e8a41070e6b3500ce2414921099cb73e8d"}, - {file = "MarkupSafe-2.1.4-cp312-cp312-win32.whl", hash = "sha256:f6be2d708a9d0e9b0054856f07ac7070fbe1754be40ca8525d5adccdbda8f475"}, - {file = "MarkupSafe-2.1.4-cp312-cp312-win_amd64.whl", hash = "sha256:5045e892cfdaecc5b4c01822f353cf2c8feb88a6ec1c0adef2a2e705eef0f656"}, - {file = "MarkupSafe-2.1.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:7a07f40ef8f0fbc5ef1000d0c78771f4d5ca03b4953fc162749772916b298fc4"}, - {file = "MarkupSafe-2.1.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d18b66fe626ac412d96c2ab536306c736c66cf2a31c243a45025156cc190dc8a"}, - {file = "MarkupSafe-2.1.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:698e84142f3f884114ea8cf83e7a67ca8f4ace8454e78fe960646c6c91c63bfa"}, - {file = "MarkupSafe-2.1.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:49a3b78a5af63ec10d8604180380c13dcd870aba7928c1fe04e881d5c792dc4e"}, - {file = "MarkupSafe-2.1.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:15866d7f2dc60cfdde12ebb4e75e41be862348b4728300c36cdf405e258415ec"}, - {file = "MarkupSafe-2.1.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:6aa5e2e7fc9bc042ae82d8b79d795b9a62bd8f15ba1e7594e3db243f158b5565"}, - {file = "MarkupSafe-2.1.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:54635102ba3cf5da26eb6f96c4b8c53af8a9c0d97b64bdcb592596a6255d8518"}, - {file = "MarkupSafe-2.1.4-cp37-cp37m-win32.whl", hash = "sha256:3583a3a3ab7958e354dc1d25be74aee6228938312ee875a22330c4dc2e41beb0"}, - {file = "MarkupSafe-2.1.4-cp37-cp37m-win_amd64.whl", hash = "sha256:d6e427c7378c7f1b2bef6a344c925b8b63623d3321c09a237b7cc0e77dd98ceb"}, - {file = "MarkupSafe-2.1.4-cp38-cp38-macosx_10_9_universal2.whl", hash 
= "sha256:bf1196dcc239e608605b716e7b166eb5faf4bc192f8a44b81e85251e62584bd2"}, - {file = "MarkupSafe-2.1.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:4df98d4a9cd6a88d6a585852f56f2155c9cdb6aec78361a19f938810aa020954"}, - {file = "MarkupSafe-2.1.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b835aba863195269ea358cecc21b400276747cc977492319fd7682b8cd2c253d"}, - {file = "MarkupSafe-2.1.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23984d1bdae01bee794267424af55eef4dfc038dc5d1272860669b2aa025c9e3"}, - {file = "MarkupSafe-2.1.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1c98c33ffe20e9a489145d97070a435ea0679fddaabcafe19982fe9c971987d5"}, - {file = "MarkupSafe-2.1.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:9896fca4a8eb246defc8b2a7ac77ef7553b638e04fbf170bff78a40fa8a91474"}, - {file = "MarkupSafe-2.1.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:b0fe73bac2fed83839dbdbe6da84ae2a31c11cfc1c777a40dbd8ac8a6ed1560f"}, - {file = "MarkupSafe-2.1.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:c7556bafeaa0a50e2fe7dc86e0382dea349ebcad8f010d5a7dc6ba568eaaa789"}, - {file = "MarkupSafe-2.1.4-cp38-cp38-win32.whl", hash = "sha256:fc1a75aa8f11b87910ffd98de62b29d6520b6d6e8a3de69a70ca34dea85d2a8a"}, - {file = "MarkupSafe-2.1.4-cp38-cp38-win_amd64.whl", hash = "sha256:3a66c36a3864df95e4f62f9167c734b3b1192cb0851b43d7cc08040c074c6279"}, - {file = "MarkupSafe-2.1.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:765f036a3d00395a326df2835d8f86b637dbaf9832f90f5d196c3b8a7a5080cb"}, - {file = "MarkupSafe-2.1.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:21e7af8091007bf4bebf4521184f4880a6acab8df0df52ef9e513d8e5db23411"}, - {file = "MarkupSafe-2.1.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d5c31fe855c77cad679b302aabc42d724ed87c043b1432d457f4976add1c2c3e"}, - {file = "MarkupSafe-2.1.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7653fa39578957bc42e5ebc15cf4361d9e0ee4b702d7d5ec96cdac860953c5b4"}, - {file = "MarkupSafe-2.1.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:47bb5f0142b8b64ed1399b6b60f700a580335c8e1c57f2f15587bd072012decc"}, - {file = "MarkupSafe-2.1.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:fe8512ed897d5daf089e5bd010c3dc03bb1bdae00b35588c49b98268d4a01e00"}, - {file = "MarkupSafe-2.1.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:36d7626a8cca4d34216875aee5a1d3d654bb3dac201c1c003d182283e3205949"}, - {file = "MarkupSafe-2.1.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:b6f14a9cd50c3cb100eb94b3273131c80d102e19bb20253ac7bd7336118a673a"}, - {file = "MarkupSafe-2.1.4-cp39-cp39-win32.whl", hash = "sha256:c8f253a84dbd2c63c19590fa86a032ef3d8cc18923b8049d91bcdeeb2581fbf6"}, - {file = "MarkupSafe-2.1.4-cp39-cp39-win_amd64.whl", hash = "sha256:8b570a1537367b52396e53325769608f2a687ec9a4363647af1cded8928af959"}, - {file = "MarkupSafe-2.1.4.tar.gz", hash = "sha256:3aae9af4cac263007fd6309c64c6ab4506dd2b79382d9d19a1994f9240b8db4f"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a17a92de5231666cfbe003f0e4b9b3a7ae3afb1ec2845aadc2bacc93ff85febc"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72b6be590cc35924b02c78ef34b467da4ba07e4e0f0454a2c5907f473fc50ce5"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:e61659ba32cf2cf1481e575d0462554625196a1f2fc06a1c777d3f48e8865d46"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae2ad8ae6ebee9d2d94b17fb62763125f3f374c25618198f40cbb8b525411900"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:075202fa5b72c86ad32dc7d0b56024ebdbcf2048c0ba09f1cde31bfdd57bcfff"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:598e3276b64aff0e7b3451b72e94fa3c238d452e7ddcd893c3ab324717456bad"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fce659a462a1be54d2ffcacea5e3ba2d74daa74f30f5f143fe0c58636e355fdd"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-win32.whl", hash = "sha256:d9fad5155d72433c921b782e58892377c44bd6252b5af2f67f16b194987338a4"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-win_amd64.whl", hash = "sha256:bf50cd79a75d181c9181df03572cdce0fbb75cc353bc350712073108cba98de5"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:629ddd2ca402ae6dbedfceeba9c46d5f7b2a61d9749597d4307f943ef198fc1f"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5b7b716f97b52c5a14bffdf688f971b2d5ef4029127f1ad7a513973cfd818df2"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ec585f69cec0aa07d945b20805be741395e28ac1627333b1c5b0105962ffced"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b91c037585eba9095565a3556f611e3cbfaa42ca1e865f7b8015fe5c7336d5a5"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7502934a33b54030eaf1194c21c692a534196063db72176b0c4028e140f8f32c"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0e397ac966fdf721b2c528cf028494e86172b4feba51d65f81ffd65c63798f3f"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c061bb86a71b42465156a3ee7bd58c8c2ceacdbeb95d05a99893e08b8467359a"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3a57fdd7ce31c7ff06cdfbf31dafa96cc533c21e443d57f5b1ecc6cdc668ec7f"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-win32.whl", hash = "sha256:397081c1a0bfb5124355710fe79478cdbeb39626492b15d399526ae53422b906"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-win_amd64.whl", hash = "sha256:2b7c57a4dfc4f16f7142221afe5ba4e093e09e728ca65c51f5620c9aaeb9a617"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:8dec4936e9c3100156f8a2dc89c4b88d5c435175ff03413b443469c7c8c5f4d1"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:3c6b973f22eb18a789b1460b4b91bf04ae3f0c4234a0a6aa6b0a92f6f7b951d4"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ac07bad82163452a6884fe8fa0963fb98c2346ba78d779ec06bd7a6262132aee"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f5dfb42c4604dddc8e4305050aa6deb084540643ed5804d7455b5df8fe16f5e5"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:ea3d8a3d18833cf4304cd2fc9cbb1efe188ca9b5efef2bdac7adc20594a0e46b"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d050b3361367a06d752db6ead6e7edeb0009be66bc3bae0ee9d97fb326badc2a"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bec0a414d016ac1a18862a519e54b2fd0fc8bbfd6890376898a6c0891dd82e9f"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:58c98fee265677f63a4385256a6d7683ab1832f3ddd1e66fe948d5880c21a169"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-win32.whl", hash = "sha256:8590b4ae07a35970728874632fed7bd57b26b0102df2d2b233b6d9d82f6c62ad"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-win_amd64.whl", hash = "sha256:823b65d8706e32ad2df51ed89496147a42a2a6e01c13cfb6ffb8b1e92bc910bb"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c8b29db45f8fe46ad280a7294f5c3ec36dbac9491f2d1c17345be8e69cc5928f"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec6a563cff360b50eed26f13adc43e61bc0c04d94b8be985e6fb24b81f6dcfdf"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a549b9c31bec33820e885335b451286e2969a2d9e24879f83fe904a5ce59d70a"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4f11aa001c540f62c6166c7726f71f7573b52c68c31f014c25cc7901deea0b52"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:7b2e5a267c855eea6b4283940daa6e88a285f5f2a67f2220203786dfa59b37e9"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:2d2d793e36e230fd32babe143b04cec8a8b3eb8a3122d2aceb4a371e6b09b8df"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ce409136744f6521e39fd8e2a24c53fa18ad67aa5bc7c2cf83645cce5b5c4e50"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-win32.whl", hash = "sha256:4096e9de5c6fdf43fb4f04c26fb114f61ef0bf2e5604b6ee3019d51b69e8c371"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-win_amd64.whl", hash = "sha256:4275d846e41ecefa46e2015117a9f491e57a71ddd59bbead77e904dc02b1bed2"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:656f7526c69fac7f600bd1f400991cc282b417d17539a1b228617081106feb4a"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:97cafb1f3cbcd3fd2b6fbfb99ae11cdb14deea0736fc2b0952ee177f2b813a46"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f3fbcb7ef1f16e48246f704ab79d79da8a46891e2da03f8783a5b6fa41a9532"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa9db3f79de01457b03d4f01b34cf91bc0048eb2c3846ff26f66687c2f6d16ab"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffee1f21e5ef0d712f9033568f8344d5da8cc2869dbd08d87c84656e6a2d2f68"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:5dedb4db619ba5a2787a94d877bc8ffc0566f92a01c0ef214865e54ecc9ee5e0"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:30b600cf0a7ac9234b2638fbc0fb6158ba5bdcdf46aeb631ead21248b9affbc4"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8dd717634f5a044f860435c1d8c16a270ddf0ef8588d4887037c5028b859b0c3"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-win32.whl", hash = 
"sha256:daa4ee5a243f0f20d528d939d06670a298dd39b1ad5f8a72a4275124a7819eff"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-win_amd64.whl", hash = "sha256:619bc166c4f2de5caa5a633b8b7326fbe98e0ccbfacabd87268a2b15ff73a029"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7a68b554d356a91cce1236aa7682dc01df0edba8d043fd1ce607c49dd3c1edcf"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:db0b55e0f3cc0be60c1f19efdde9a637c32740486004f20d1cff53c3c0ece4d2"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e53af139f8579a6d5f7b76549125f0d94d7e630761a2111bc431fd820e163b8"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17b950fccb810b3293638215058e432159d2b71005c74371d784862b7e4683f3"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c31f53cdae6ecfa91a77820e8b151dba54ab528ba65dfd235c80b086d68a465"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:bff1b4290a66b490a2f4719358c0cdcd9bafb6b8f061e45c7a2460866bf50c2e"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bc1667f8b83f48511b94671e0e441401371dfd0f0a795c7daa4a3cd1dde55bea"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5049256f536511ee3f7e1b3f87d1d1209d327e818e6ae1365e8653d7e3abb6a6"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-win32.whl", hash = "sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-win_amd64.whl", hash = "sha256:fa173ec60341d6bb97a89f5ea19c85c5643c1e7dedebc22f5181eb73573142c5"}, + {file = "MarkupSafe-2.1.5.tar.gz", hash = "sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b"}, ] [[package]] @@ -755,36 +752,36 @@ test = ["pytest (>=7.2)", "pytest-cov (>=4.0)"] [[package]] name = "numba" -version = "0.58.1" +version = "0.59.0" description = "compiling Python code using LLVM" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "numba-0.58.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:07f2fa7e7144aa6f275f27260e73ce0d808d3c62b30cff8906ad1dec12d87bbe"}, - {file = "numba-0.58.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7bf1ddd4f7b9c2306de0384bf3854cac3edd7b4d8dffae2ec1b925e4c436233f"}, - {file = "numba-0.58.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:bc2d904d0319d7a5857bd65062340bed627f5bfe9ae4a495aef342f072880d50"}, - {file = "numba-0.58.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4e79b6cc0d2bf064a955934a2e02bf676bc7995ab2db929dbbc62e4c16551be6"}, - {file = "numba-0.58.1-cp310-cp310-win_amd64.whl", hash = "sha256:81fe5b51532478149b5081311b0fd4206959174e660c372b94ed5364cfb37c82"}, - {file = "numba-0.58.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:bcecd3fb9df36554b342140a4d77d938a549be635d64caf8bd9ef6c47a47f8aa"}, - {file = "numba-0.58.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a1eaa744f518bbd60e1f7ccddfb8002b3d06bd865b94a5d7eac25028efe0e0ff"}, - {file = "numba-0.58.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:bf68df9c307fb0aa81cacd33faccd6e419496fdc621e83f1efce35cdc5e79cac"}, - {file = "numba-0.58.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:55a01e1881120e86d54efdff1be08381886fe9f04fc3006af309c602a72bc44d"}, - 
{file = "numba-0.58.1-cp311-cp311-win_amd64.whl", hash = "sha256:811305d5dc40ae43c3ace5b192c670c358a89a4d2ae4f86d1665003798ea7a1a"}, - {file = "numba-0.58.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ea5bfcf7d641d351c6a80e8e1826eb4a145d619870016eeaf20bbd71ef5caa22"}, - {file = "numba-0.58.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e63d6aacaae1ba4ef3695f1c2122b30fa3d8ba039c8f517784668075856d79e2"}, - {file = "numba-0.58.1-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6fe7a9d8e3bd996fbe5eac0683227ccef26cba98dae6e5cee2c1894d4b9f16c1"}, - {file = "numba-0.58.1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:898af055b03f09d33a587e9425500e5be84fc90cd2f80b3fb71c6a4a17a7e354"}, - {file = "numba-0.58.1-cp38-cp38-win_amd64.whl", hash = "sha256:d3e2fe81fe9a59fcd99cc572002101119059d64d31eb6324995ee8b0f144a306"}, - {file = "numba-0.58.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5c765aef472a9406a97ea9782116335ad4f9ef5c9f93fc05fd44aab0db486954"}, - {file = "numba-0.58.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9e9356e943617f5e35a74bf56ff6e7cc83e6b1865d5e13cee535d79bf2cae954"}, - {file = "numba-0.58.1-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:240e7a1ae80eb6b14061dc91263b99dc8d6af9ea45d310751b780888097c1aaa"}, - {file = "numba-0.58.1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:45698b995914003f890ad839cfc909eeb9c74921849c712a05405d1a79c50f68"}, - {file = "numba-0.58.1-cp39-cp39-win_amd64.whl", hash = "sha256:bd3dda77955be03ff366eebbfdb39919ce7c2620d86c906203bed92124989032"}, - {file = "numba-0.58.1.tar.gz", hash = "sha256:487ded0633efccd9ca3a46364b40006dbdaca0f95e99b8b83e778d1195ebcbaa"}, + {file = "numba-0.59.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8d061d800473fb8fef76a455221f4ad649a53f5e0f96e3f6c8b8553ee6fa98fa"}, + {file = "numba-0.59.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c086a434e7d3891ce5dfd3d1e7ee8102ac1e733962098578b507864120559ceb"}, + {file = "numba-0.59.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9e20736bf62e61f8353fb71b0d3a1efba636c7a303d511600fc57648b55823ed"}, + {file = "numba-0.59.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e86e6786aec31d2002122199486e10bbc0dc40f78d76364cded375912b13614c"}, + {file = "numba-0.59.0-cp310-cp310-win_amd64.whl", hash = "sha256:0307ee91b24500bb7e64d8a109848baf3a3905df48ce142b8ac60aaa406a0400"}, + {file = "numba-0.59.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d540f69a8245fb714419c2209e9af6104e568eb97623adc8943642e61f5d6d8e"}, + {file = "numba-0.59.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1192d6b2906bf3ff72b1d97458724d98860ab86a91abdd4cfd9328432b661e31"}, + {file = "numba-0.59.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:90efb436d3413809fcd15298c6d395cb7d98184350472588356ccf19db9e37c8"}, + {file = "numba-0.59.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cd3dac45e25d927dcb65d44fb3a973994f5add2b15add13337844afe669dd1ba"}, + {file = "numba-0.59.0-cp311-cp311-win_amd64.whl", hash = "sha256:753dc601a159861808cc3207bad5c17724d3b69552fd22768fddbf302a817a4c"}, + {file = "numba-0.59.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ce62bc0e6dd5264e7ff7f34f41786889fa81a6b860662f824aa7532537a7bee0"}, + {file = "numba-0.59.0-cp312-cp312-macosx_11_0_arm64.whl", hash = 
"sha256:8cbef55b73741b5eea2dbaf1b0590b14977ca95a13a07d200b794f8f6833a01c"}, + {file = "numba-0.59.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:70d26ba589f764be45ea8c272caa467dbe882b9676f6749fe6f42678091f5f21"}, + {file = "numba-0.59.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e125f7d69968118c28ec0eed9fbedd75440e64214b8d2eac033c22c04db48492"}, + {file = "numba-0.59.0-cp312-cp312-win_amd64.whl", hash = "sha256:4981659220b61a03c1e557654027d271f56f3087448967a55c79a0e5f926de62"}, + {file = "numba-0.59.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fe4d7562d1eed754a7511ed7ba962067f198f86909741c5c6e18c4f1819b1f47"}, + {file = "numba-0.59.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6feb1504bb432280f900deaf4b1dadcee68812209500ed3f81c375cbceab24dc"}, + {file = "numba-0.59.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:944faad25ee23ea9dda582bfb0189fb9f4fc232359a80ab2a028b94c14ce2b1d"}, + {file = "numba-0.59.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5516a469514bfae52a9d7989db4940653a5cbfac106f44cb9c50133b7ad6224b"}, + {file = "numba-0.59.0-cp39-cp39-win_amd64.whl", hash = "sha256:32bd0a41525ec0b1b853da244808f4e5333867df3c43c30c33f89cf20b9c2b63"}, + {file = "numba-0.59.0.tar.gz", hash = "sha256:12b9b064a3e4ad00e2371fc5212ef0396c80f41caec9b5ec391c8b04b6eaf2a8"}, ] [package.dependencies] -llvmlite = "==0.41.*" +llvmlite = "==0.42.*" numpy = ">=1.22,<1.27" [[package]] @@ -943,12 +940,12 @@ nvidia-nvjitlink-cu12 = "*" [[package]] name = "nvidia-nccl-cu12" -version = "2.18.1" +version = "2.19.3" description = "NVIDIA Collective Communication Library (NCCL) Runtime" optional = false python-versions = ">=3" files = [ - {file = "nvidia_nccl_cu12-2.18.1-py3-none-manylinux1_x86_64.whl", hash = "sha256:1a6c4acefcbebfa6de320f412bf7866de856e786e0462326ba1bac40de0b5e71"}, + {file = "nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl", hash = "sha256:a9734707a2c96443331c1e48c717024aa6678a0e2a4cb66b2c364d18cee6b48d"}, ] [[package]] @@ -1036,35 +1033,36 @@ dev = ["Pillow", "black", "googledrivedownloader", "isort", "onnxruntime", "pre- [[package]] name = "onnxruntime" -version = "1.16.3" +version = "1.17.0" description = "ONNX Runtime is a runtime accelerator for Machine Learning models" optional = true python-versions = "*" files = [ - {file = "onnxruntime-1.16.3-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:3bc41f323ac77acfed190be8ffdc47a6a75e4beeb3473fbf55eeb075ccca8df2"}, - {file = "onnxruntime-1.16.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:212741b519ee61a4822c79c47147d63a8b0ffde25cd33988d3d7be9fbd51005d"}, - {file = "onnxruntime-1.16.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f91f5497fe3df4ceee2f9e66c6148d9bfeb320cd6a71df361c66c5b8bac985a"}, - {file = "onnxruntime-1.16.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ef2b1fc269cabd27f129fb9058917d6fdc89b188c49ed8700f300b945c81f889"}, - {file = "onnxruntime-1.16.3-cp310-cp310-win32.whl", hash = "sha256:f36b56a593b49a3c430be008c2aea6658d91a3030115729609ec1d5ffbaab1b6"}, - {file = "onnxruntime-1.16.3-cp310-cp310-win_amd64.whl", hash = "sha256:3c467eaa3d2429c026b10c3d17b78b7f311f718ef9d2a0d6938e5c3c2611b0cf"}, - {file = "onnxruntime-1.16.3-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:a225bb683991001d111f75323d355b3590e75e16b5e0f07a0401e741a0143ea1"}, - {file = "onnxruntime-1.16.3-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:9aded21fe3d898edd86be8aa2eb995aa375e800ad3dfe4be9f618a20b8ee3630"}, - {file = "onnxruntime-1.16.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:00cccc37a5195c8fca5011b9690b349db435986bd508eb44c9fce432da9228a4"}, - {file = "onnxruntime-1.16.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e253e572021563226a86f1c024f8f70cdae28f2fb1cc8c3a9221e8b1ce37db5"}, - {file = "onnxruntime-1.16.3-cp311-cp311-win32.whl", hash = "sha256:a82a8f0b4c978d08f9f5c7a6019ae51151bced9fd91e5aaa0c20a9e4ac7a60b6"}, - {file = "onnxruntime-1.16.3-cp311-cp311-win_amd64.whl", hash = "sha256:78d81d9af457a1dc90db9a7da0d09f3ccb1288ea1236c6ab19f0ca61f3eee2d3"}, - {file = "onnxruntime-1.16.3-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:04ebcd29c20473596a1412e471524b2fb88d55e6301c40b98dd2407b5911595f"}, - {file = "onnxruntime-1.16.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9996bab0f202a6435ab867bc55598f15210d0b72794d5de83712b53d564084ae"}, - {file = "onnxruntime-1.16.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b8f5083f903408238883821dd8c775f8120cb4a604166dbdabe97f4715256d5"}, - {file = "onnxruntime-1.16.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c2dcf1b70f8434abb1116fe0975c00e740722aaf321997195ea3618cc00558e"}, - {file = "onnxruntime-1.16.3-cp38-cp38-win32.whl", hash = "sha256:d4a0151e1accd04da6711f6fd89024509602f82c65a754498e960b032359b02d"}, - {file = "onnxruntime-1.16.3-cp38-cp38-win_amd64.whl", hash = "sha256:e8aa5bba78afbd4d8a2654b14ec7462ff3ce4a6aad312a3c2d2c2b65009f2541"}, - {file = "onnxruntime-1.16.3-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:6829dc2a79d48c911fedaf4c0f01e03c86297d32718a3fdee7a282766dfd282a"}, - {file = "onnxruntime-1.16.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:76f876c53bfa912c6c242fc38213a6f13f47612d4360bc9d599bd23753e53161"}, - {file = "onnxruntime-1.16.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4137e5d443e2dccebe5e156a47f1d6d66f8077b03587c35f11ee0c7eda98b533"}, - {file = "onnxruntime-1.16.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c56695c1a343c7c008b647fff3df44da63741fbe7b6003ef576758640719be7b"}, - {file = "onnxruntime-1.16.3-cp39-cp39-win32.whl", hash = "sha256:985a029798744ce4743fcf8442240fed35c8e4d4d30ec7d0c2cdf1388cd44408"}, - {file = "onnxruntime-1.16.3-cp39-cp39-win_amd64.whl", hash = "sha256:28ff758b17ce3ca6bcad3d936ec53bd7f5482e7630a13f6dcae518eba8f71d85"}, + {file = "onnxruntime-1.17.0-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:d2b22a25a94109cc983443116da8d9805ced0256eb215c5e6bc6dcbabefeab96"}, + {file = "onnxruntime-1.17.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b4c87d83c6f58d1af2675fc99e3dc810f2dbdb844bcefd0c1b7573632661f6fc"}, + {file = "onnxruntime-1.17.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dba55723bf9b835e358f48c98a814b41692c393eb11f51e02ece0625c756b797"}, + {file = "onnxruntime-1.17.0-cp310-cp310-win32.whl", hash = "sha256:ee48422349cc500273beea7607e33c2237909f58468ae1d6cccfc4aecd158565"}, + {file = "onnxruntime-1.17.0-cp310-cp310-win_amd64.whl", hash = "sha256:f34cc46553359293854e38bdae2ab1be59543aad78a6317e7746d30e311110c3"}, + {file = "onnxruntime-1.17.0-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:16d26badd092c8c257fa57c458bb600d96dc15282c647ccad0ed7b2732e6c03b"}, + {file = 
"onnxruntime-1.17.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6f1273bebcdb47ed932d076c85eb9488bc4768fcea16d5f2747ca692fad4f9d3"}, + {file = "onnxruntime-1.17.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cb60fd3c2c1acd684752eb9680e89ae223e9801a9b0e0dc7b28adabe45a2e380"}, + {file = "onnxruntime-1.17.0-cp311-cp311-win32.whl", hash = "sha256:4b038324586bc905299e435f7c00007e6242389c856b82fe9357fdc3b1ef2bdc"}, + {file = "onnxruntime-1.17.0-cp311-cp311-win_amd64.whl", hash = "sha256:93d39b3fa1ee01f034f098e1c7769a811a21365b4883f05f96c14a2b60c6028b"}, + {file = "onnxruntime-1.17.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:90c0890e36f880281c6c698d9bc3de2afbeee2f76512725ec043665c25c67d21"}, + {file = "onnxruntime-1.17.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7466724e809a40e986b1637cba156ad9fc0d1952468bc00f79ef340bc0199552"}, + {file = "onnxruntime-1.17.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d47bee7557a8b99c8681b6882657a515a4199778d6d5e24e924d2aafcef55b0a"}, + {file = "onnxruntime-1.17.0-cp312-cp312-win32.whl", hash = "sha256:bb1bf1ee575c665b8bbc3813ab906e091a645a24ccc210be7932154b8260eca1"}, + {file = "onnxruntime-1.17.0-cp312-cp312-win_amd64.whl", hash = "sha256:ac2f286da3494b29b4186ca193c7d4e6a2c1f770c4184c7192c5da142c3dec28"}, + {file = "onnxruntime-1.17.0-cp38-cp38-macosx_11_0_universal2.whl", hash = "sha256:1ec485643b93e0a3896c655eb2426decd63e18a278bb7ccebc133b340723624f"}, + {file = "onnxruntime-1.17.0-cp38-cp38-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:83c35809cda898c5a11911c69ceac8a2ac3925911854c526f73bad884582f911"}, + {file = "onnxruntime-1.17.0-cp38-cp38-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fa464aa4d81df818375239e481887b656e261377d5b6b9a4692466f5f3261edc"}, + {file = "onnxruntime-1.17.0-cp38-cp38-win32.whl", hash = "sha256:b7b337cd0586f7836601623cbd30a443df9528ef23965860d11c753ceeb009f2"}, + {file = "onnxruntime-1.17.0-cp38-cp38-win_amd64.whl", hash = "sha256:fbb9faaf51d01aa2c147ef52524d9326744c852116d8005b9041809a71838878"}, + {file = "onnxruntime-1.17.0-cp39-cp39-macosx_11_0_universal2.whl", hash = "sha256:5a06ab84eaa350bf64b1d747b33ccf10da64221ed1f38f7287f15eccbec81603"}, + {file = "onnxruntime-1.17.0-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5d3d11db2c8242766212a68d0b139745157da7ce53bd96ba349a5c65e5a02357"}, + {file = "onnxruntime-1.17.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5632077c3ab8b0cd4f74b0af9c4e924be012b1a7bcd7daa845763c6c6bf14b7d"}, + {file = "onnxruntime-1.17.0-cp39-cp39-win32.whl", hash = "sha256:61a12732cba869b3ad2d4e29ab6cb62c7a96f61b8c213f7fcb961ba412b70b37"}, + {file = "onnxruntime-1.17.0-cp39-cp39-win_amd64.whl", hash = "sha256:461fa0fc7d9c392c352b6cccdedf44d818430f3d6eacd924bb804fdea2dcfd02"}, ] [package.dependencies] @@ -1077,19 +1075,21 @@ sympy = "*" [[package]] name = "onnxruntime-gpu" -version = "1.16.3" +version = "1.17.0" description = "ONNX Runtime is a runtime accelerator for Machine Learning models" optional = true python-versions = "*" files = [ - {file = "onnxruntime_gpu-1.16.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c14bc735ad2b2286be9eadeea09bc190df38e8bce17e37b601761019cc7cc24f"}, - {file = "onnxruntime_gpu-1.16.3-cp310-cp310-win_amd64.whl", hash = "sha256:8de5ccfc005ea5ec50fbd104b7210c97623a9f8c13de6e64ce559b55956b757f"}, - {file = 
"onnxruntime_gpu-1.16.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5703454521a9c080ff3ac79b5d266e959cc735d442a1d8796763c7f92d6069dc"}, - {file = "onnxruntime_gpu-1.16.3-cp311-cp311-win_amd64.whl", hash = "sha256:48bb615aed61f5620d1ad46b9005614e1a14de60f8218a1448cc9a643f23d399"}, - {file = "onnxruntime_gpu-1.16.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2811c8ea209aaedcc2600ca828025279c1b1242344af603122d28c2ea8ab26a4"}, - {file = "onnxruntime_gpu-1.16.3-cp38-cp38-win_amd64.whl", hash = "sha256:2e5a92770c9232776739f378804bf6fea20bae02878a50b7fe0f81e77a47ee92"}, - {file = "onnxruntime_gpu-1.16.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9305c7fc5981d7e04ad2afef1a403475fb84d658898567c91aa5a41c20ead356"}, - {file = "onnxruntime_gpu-1.16.3-cp39-cp39-win_amd64.whl", hash = "sha256:d3ad8e7fbb22493267c23d61e997a6b2ac6236a08aa6b58a3a91848124c9b037"}, + {file = "onnxruntime_gpu-1.17.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:1f2a4e0468ac0bd8246996c3d5dbba92cbbaca874bcd7f9cee4e99ce6eb27f5b"}, + {file = "onnxruntime_gpu-1.17.0-cp310-cp310-win_amd64.whl", hash = "sha256:0721b7930d7abed3730b2335e639e60d94ec411bb4d35a0347cc9c8b52c34540"}, + {file = "onnxruntime_gpu-1.17.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:be0314afe399943904de7c1ca797cbcc63e6fad60eb85d3df6422f81dd94e79e"}, + {file = "onnxruntime_gpu-1.17.0-cp311-cp311-win_amd64.whl", hash = "sha256:52125c24b21406d1431e43de1c98cea29c21e0cceba80db530b7e4c9216d86ea"}, + {file = "onnxruntime_gpu-1.17.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:bb802d8033885c412269f8bc8877d8779b0dc874df6fb9df8b796cba7276ad66"}, + {file = "onnxruntime_gpu-1.17.0-cp312-cp312-win_amd64.whl", hash = "sha256:8c43533e3e5335eaa78059fb86b849a4faded513a00c1feaaa205ca5af51c40f"}, + {file = "onnxruntime_gpu-1.17.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:1d461455bba160836d6c11c648c8fd4e4500d5c17096a13e6c2c9d22a4abd436"}, + {file = "onnxruntime_gpu-1.17.0-cp38-cp38-win_amd64.whl", hash = "sha256:1e4398f2175a92f4b35d95279a6294a89c462f24de058a2736ee1d498bab5a16"}, + {file = "onnxruntime_gpu-1.17.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:1d0e3805cd1c024aba7f4ae576fd08545fc27530a2aaad2b3c8ac0ee889fbd05"}, + {file = "onnxruntime_gpu-1.17.0-cp39-cp39-win_amd64.whl", hash = "sha256:fc1da5b93363ee600b5b220b04eeec51ad2c2b3e96f0b7615b16b8a173c88001"}, ] [package.dependencies] @@ -1230,18 +1230,18 @@ xmp = ["defusedxml"] [[package]] name = "platformdirs" -version = "4.1.0" +version = "4.2.0" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." 
optional = false python-versions = ">=3.8" files = [ - {file = "platformdirs-4.1.0-py3-none-any.whl", hash = "sha256:11c8f37bcca40db96d8144522d925583bdb7a31f7b0e37e3ed4318400a8e2380"}, - {file = "platformdirs-4.1.0.tar.gz", hash = "sha256:906d548203468492d432bcb294d4bc2fff751bf84971fbb2c10918cc206ee420"}, + {file = "platformdirs-4.2.0-py3-none-any.whl", hash = "sha256:0614df2a2f37e1a662acbd8e2b25b92ccf8632929bc6d43467e17fe89c75e068"}, + {file = "platformdirs-4.2.0.tar.gz", hash = "sha256:ef0cc731df711022c174543cb70a9b5bd22e5a9337c8624ef2c2ceb8ddad8768"}, ] [package.extras] -docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.1)", "sphinx-autodoc-typehints (>=1.24)"] -test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)"] +docs = ["furo (>=2023.9.10)", "proselint (>=0.13)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.25.2)"] +test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4.3)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)"] [[package]] name = "pluggy" @@ -1615,31 +1615,36 @@ files = [ [[package]] name = "torch" -version = "2.1.2" +version = "2.2.0" description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration" optional = false python-versions = ">=3.8.0" files = [ - {file = "torch-2.1.2-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:3a871edd6c02dae77ad810335c0833391c1a4ce49af21ea8cf0f6a5d2096eea8"}, - {file = "torch-2.1.2-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:bef6996c27d8f6e92ea4e13a772d89611da0e103b48790de78131e308cf73076"}, - {file = "torch-2.1.2-cp310-cp310-win_amd64.whl", hash = "sha256:0e13034fd5fb323cbbc29e56d0637a3791e50dd589616f40c79adfa36a5a35a1"}, - {file = "torch-2.1.2-cp310-none-macosx_10_9_x86_64.whl", hash = "sha256:d9b535cad0df3d13997dbe8bd68ac33e0e3ae5377639c9881948e40794a61403"}, - {file = "torch-2.1.2-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:f9a55d55af02826ebfbadf4e9b682f0f27766bc33df8236b48d28d705587868f"}, - {file = "torch-2.1.2-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:a6ebbe517097ef289cc7952783588c72de071d4b15ce0f8b285093f0916b1162"}, - {file = "torch-2.1.2-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:8f32ce591616a30304f37a7d5ea80b69ca9e1b94bba7f308184bf616fdaea155"}, - {file = "torch-2.1.2-cp311-cp311-win_amd64.whl", hash = "sha256:e0ee6cf90c8970e05760f898d58f9ac65821c37ffe8b04269ec787aa70962b69"}, - {file = "torch-2.1.2-cp311-none-macosx_10_9_x86_64.whl", hash = "sha256:76d37967c31c99548ad2c4d3f2cf191db48476f2e69b35a0937137116da356a1"}, - {file = "torch-2.1.2-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:e2d83f07b4aac983453ea5bf8f9aa9dacf2278a8d31247f5d9037f37befc60e4"}, - {file = "torch-2.1.2-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:f41fe0c7ecbf903a568c73486139a75cfab287a0f6c17ed0698fdea7a1e8641d"}, - {file = "torch-2.1.2-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:e3225f47d50bb66f756fe9196a768055d1c26b02154eb1f770ce47a2578d3aa7"}, - {file = "torch-2.1.2-cp38-cp38-win_amd64.whl", hash = "sha256:33d59cd03cb60106857f6c26b36457793637512998666ee3ce17311f217afe2b"}, - {file = "torch-2.1.2-cp38-none-macosx_10_9_x86_64.whl", hash = "sha256:8e221deccd0def6c2badff6be403e0c53491805ed9915e2c029adbcdb87ab6b5"}, - {file = "torch-2.1.2-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:05b18594f60a911a0c4f023f38a8bda77131fba5fd741bda626e97dcf5a3dd0a"}, - {file = "torch-2.1.2-cp39-cp39-manylinux1_x86_64.whl", hash = 
"sha256:9ca96253b761e9aaf8e06fb30a66ee301aecbf15bb5a303097de1969077620b6"}, - {file = "torch-2.1.2-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:d93ba70f67b08c2ae5598ee711cbc546a1bc8102cef938904b8c85c2089a51a0"}, - {file = "torch-2.1.2-cp39-cp39-win_amd64.whl", hash = "sha256:255b50bc0608db177e6a3cc118961d77de7e5105f07816585fa6f191f33a9ff3"}, - {file = "torch-2.1.2-cp39-none-macosx_10_9_x86_64.whl", hash = "sha256:6984cd5057c0c977b3c9757254e989d3f1124f4ce9d07caa6cb637783c71d42a"}, - {file = "torch-2.1.2-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:bc195d7927feabc0eb7c110e457c955ed2ab616f3c7c28439dd4188cf589699f"}, + {file = "torch-2.2.0-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:d366158d6503a3447e67f8c0ad1328d54e6c181d88572d688a625fac61b13a97"}, + {file = "torch-2.2.0-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:707f2f80402981e9f90d0038d7d481678586251e6642a7a6ef67fc93511cb446"}, + {file = "torch-2.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:15c8f0a105c66b28496092fca1520346082e734095f8eaf47b5786bac24b8a31"}, + {file = "torch-2.2.0-cp310-none-macosx_10_9_x86_64.whl", hash = "sha256:0ca4df4b728515ad009b79f5107b00bcb2c63dc202d991412b9eb3b6a4f24349"}, + {file = "torch-2.2.0-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:3d3eea2d5969b9a1c9401429ca79efc668120314d443d3463edc3289d7f003c7"}, + {file = "torch-2.2.0-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:0d1c580e379c0d48f0f0a08ea28d8e373295aa254de4f9ad0631f9ed8bc04c24"}, + {file = "torch-2.2.0-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:9328e3c1ce628a281d2707526b4d1080eae7c4afab4f81cea75bde1f9441dc78"}, + {file = "torch-2.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:03c8e660907ac1b8ee07f6d929c4e15cd95be2fb764368799cca02c725a212b8"}, + {file = "torch-2.2.0-cp311-none-macosx_10_9_x86_64.whl", hash = "sha256:da0cefe7f84ece3e3b56c11c773b59d1cb2c0fd83ddf6b5f7f1fd1a987b15c3e"}, + {file = "torch-2.2.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:f81d23227034221a4a4ff8ef24cc6cec7901edd98d9e64e32822778ff01be85e"}, + {file = "torch-2.2.0-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:dcbfb2192ac41ca93c756ebe9e2af29df0a4c14ee0e7a0dd78f82c67a63d91d4"}, + {file = "torch-2.2.0-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:9eeb42971619e24392c9088b5b6d387d896e267889d41d267b1fec334f5227c5"}, + {file = "torch-2.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:c718b2ca69a6cac28baa36d86d8c0ec708b102cebd1ceb1b6488e404cd9be1d1"}, + {file = "torch-2.2.0-cp312-none-macosx_10_9_x86_64.whl", hash = "sha256:f11d18fceb4f9ecb1ac680dde7c463c120ed29056225d75469c19637e9f98d12"}, + {file = "torch-2.2.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:ee1da852bfd4a7e674135a446d6074c2da7194c1b08549e31eae0b3138c6b4d2"}, + {file = "torch-2.2.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:0d819399819d0862268ac531cf12a501c253007df4f9e6709ede8a0148f1a7b8"}, + {file = "torch-2.2.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:08f53ccc38c49d839bc703ea1b20769cc8a429e0c4b20b56921a9f64949bf325"}, + {file = "torch-2.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:93bffe3779965a71dab25fc29787538c37c5d54298fd2f2369e372b6fb137d41"}, + {file = "torch-2.2.0-cp38-none-macosx_10_9_x86_64.whl", hash = "sha256:c17ec323da778efe8dad49d8fb534381479ca37af1bfc58efdbb8607a9d263a3"}, + {file = "torch-2.2.0-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:c02685118008834e878f676f81eab3a952b7936fa31f474ef8a5ff4b5c78b36d"}, + {file = "torch-2.2.0-cp39-cp39-manylinux1_x86_64.whl", hash = 
"sha256:d9f39d6f53cec240a0e3baa82cb697593340f9d4554cee6d3d6ca07925c2fac0"}, + {file = "torch-2.2.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:51770c065206250dc1222ea7c0eff3f88ab317d3e931cca2aee461b85fbc2472"}, + {file = "torch-2.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:008e4c6ad703de55af760c73bf937ecdd61a109f9b08f2bbb9c17e7c7017f194"}, + {file = "torch-2.2.0-cp39-none-macosx_10_9_x86_64.whl", hash = "sha256:de8680472dd14e316f42ceef2a18a301461a9058cd6e99a1f1b20f78f11412f1"}, + {file = "torch-2.2.0-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:99e1dcecb488e3fd25bcaac56e48cdb3539842904bdc8588b0b255fde03a254c"}, ] [package.dependencies] @@ -1656,78 +1661,101 @@ nvidia-cufft-cu12 = {version = "11.0.2.54", markers = "platform_system == \"Linu nvidia-curand-cu12 = {version = "10.3.2.106", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} nvidia-cusolver-cu12 = {version = "11.4.5.107", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} nvidia-cusparse-cu12 = {version = "12.1.0.106", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-nccl-cu12 = {version = "2.18.1", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-nccl-cu12 = {version = "2.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} nvidia-nvtx-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} sympy = "*" -triton = {version = "2.1.0", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -typing-extensions = "*" +triton = {version = "2.2.0", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +typing-extensions = ">=4.8.0" [package.extras] -dynamo = ["jinja2"] opt-einsum = ["opt-einsum (>=3.3)"] +optree = ["optree (>=0.9.1)"] [[package]] name = "torchvision" -version = "0.16.2" +version = "0.17.0" description = "image and video datasets and models for torch deep learning" optional = false python-versions = ">=3.8" files = [ - {file = "torchvision-0.16.2-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:bc86f2800cb2c0c1a09c581409cdd6bff66e62f103dc83fc63f73346264c3756"}, - {file = "torchvision-0.16.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b024bd412df6d3a007dcebf311a894eb3c5c21e1af80d12be382bbcb097a7c3a"}, - {file = "torchvision-0.16.2-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:e89f10f3c8351972b6e3fda95bc3e479ea8dbfc9dfcfd2c32902dbad4ba5cfc5"}, - {file = "torchvision-0.16.2-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:96c7583700112a410bdc4e1e4f118c429dab49c29c9a31a2cc3579bc9b08b19d"}, - {file = "torchvision-0.16.2-cp310-cp310-win_amd64.whl", hash = "sha256:9f4032ebb3277fb07ff6a9b818d50a547fb8fcd89d958cfd9e773322454bb688"}, - {file = "torchvision-0.16.2-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:67b1aaf8b8cb02ce75dd445f291a27c8036a502f8c0aa76e28c37a0faac2e153"}, - {file = "torchvision-0.16.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bef30d03e1d1c629761f4dca51d3b7d8a0dc0acce6f4068ab2a1634e8e7b64e0"}, - {file = "torchvision-0.16.2-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:e59cc7b2bd1ab5c0ce4ae382e4e37be8f1c174e8b5de2f6a23c170de9ae28495"}, - {file = "torchvision-0.16.2-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:e130b08cc9b3cc73a6c59d6edf032394a322f9579bfd21d14bc2e1d0999aa758"}, - {file = "torchvision-0.16.2-cp311-cp311-win_amd64.whl", hash = 
"sha256:8692ab1e48807e9604046a6f4beeb67b523294cee1b00828654bb0df2cfce2b2"}, - {file = "torchvision-0.16.2-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:b82732dcf876a37c852772342aa6ee3480c03bb3e2a802ae109fc5f7e28d26e9"}, - {file = "torchvision-0.16.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4b065143d1a720fe8a9077fd4be35d491f98819ec80b3dbbc3ec64d0b707a906"}, - {file = "torchvision-0.16.2-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:bc5f274e4ecd1b86062063cdf4fd385a1d39d147a3a2685fbbde9ff08bb720b8"}, - {file = "torchvision-0.16.2-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:335959c43b371c0474af34c1ef2a52efdc7603c45700d29e4475eeb02984170c"}, - {file = "torchvision-0.16.2-cp38-cp38-win_amd64.whl", hash = "sha256:7fd22d86e08eba321af70cad291020c2cdeac069b00ce88b923ca52e06174769"}, - {file = "torchvision-0.16.2-cp39-cp39-macosx_10_13_x86_64.whl", hash = "sha256:56115268b37f0b75364e3654e47ad9abc66ac34c1f9e5e3dfa89a22d6a40017a"}, - {file = "torchvision-0.16.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:82805f8445b094f9d1e770390ee6cc86855e89955e08ce34af2e2274fc0e5c45"}, - {file = "torchvision-0.16.2-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:3f4bd5fcbc361476e2e78016636ac7d5509e59d9962521f06eb98e6803898182"}, - {file = "torchvision-0.16.2-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:8199acdf8ab066a28b84a5b6f4d97b58976d9e164b1acc3a9d14fccfaf74bb3a"}, - {file = "torchvision-0.16.2-cp39-cp39-win_amd64.whl", hash = "sha256:41dd4fa9f176d563fe9f1b9adef3b7e582cdfb60ce8c9bc51b094a025be687c9"}, + {file = "torchvision-0.17.0-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:153882cd8ff8e3dbef5c5054fdd15df64e85420546805a90c0b2221f2f119c4a"}, + {file = "torchvision-0.17.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c55c2f86e3f3a21ddd92739a972366244e9b17916e836ec47167b0a0c083c65f"}, + {file = "torchvision-0.17.0-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:605950cdcefe6c5aef85709ade17b1525bcf171e122cce1df09e666d96525b90"}, + {file = "torchvision-0.17.0-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:3d86c212fc6379e9bec3ac647d062e34c2cf36c26b98840b66573eb9fbe1f1d9"}, + {file = "torchvision-0.17.0-cp310-cp310-win_amd64.whl", hash = "sha256:71b314813faf13cecb09a4a635b5e4b274e8df0b1921681038d491c529555bb6"}, + {file = "torchvision-0.17.0-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:10d276821f115fb369e6cf1f1b77b2cca60cda12cbb39a41513a9d3d0f2a93ae"}, + {file = "torchvision-0.17.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a3eef2daddadb5c21e802e0550dd7e3ee3d98c430f4aed212ae3ba0358558be1"}, + {file = "torchvision-0.17.0-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:acc0d098ab8c295a750f0218bf5bf7bfc2f2c21f9c2fe3fc30b695cd94f4c759"}, + {file = "torchvision-0.17.0-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:3d2e9552d72e4037f2db6f7d97989a2e2f95763aa1861963a3faf521bb1610c4"}, + {file = "torchvision-0.17.0-cp311-cp311-win_amd64.whl", hash = "sha256:f8e542cf71e1294fcb5635038eae6702df543dc90706f0836ec80e75efc511fc"}, + {file = "torchvision-0.17.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:816ae1a4506b1cb0f638e1827cae7ab768c731369ab23e86839f177926197143"}, + {file = "torchvision-0.17.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:be39874c239215a39b3c431c7016501f1a45bfbbebf2fe8e11d8339b5ea23bca"}, + {file = "torchvision-0.17.0-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:8fe14d580557aef2c45dd462c069ff936b6507b215c4b496f30973ae8cff917d"}, + {file = "torchvision-0.17.0-cp312-cp312-manylinux2014_aarch64.whl", hash = 
"sha256:4608ba3246c45c968ede40e7640e4eed64556210faa154cf1ffccb1cadabe445"}, + {file = "torchvision-0.17.0-cp312-cp312-win_amd64.whl", hash = "sha256:b755d6d3e021239d2408bf3794d0d3dcffbc629f1fd808c43d8b346045a098c4"}, + {file = "torchvision-0.17.0-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:870d7cda57420e44d20eb07bfe37bf5344a06434a7a6195b4c7f3dd55838587d"}, + {file = "torchvision-0.17.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:477f6e64a9d798c0f5adefc300acc220da6f17ef5c1e110d20108f66554fee4d"}, + {file = "torchvision-0.17.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:a54a15bd6f3dbb04ebd36c5a87530b2e090ee4b9b15eb89eda558ab3e50396a0"}, + {file = "torchvision-0.17.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:e041ce3336364413bab051a3966d884bab25c200f98ca8a065f0abe758c3005e"}, + {file = "torchvision-0.17.0-cp38-cp38-win_amd64.whl", hash = "sha256:7887f767670c72aa20f5237042d0ca1462da18f66a3ea8c36b6ba67ce26b82fc"}, + {file = "torchvision-0.17.0-cp39-cp39-macosx_10_13_x86_64.whl", hash = "sha256:b1ced438b81ef662a71c8c81debaf0c80455b35b811ca55a4c3c593d721b560a"}, + {file = "torchvision-0.17.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b53569c52bd4bd1176a1e49d8ea55883bcf57e1614cb97e2e8ce372768299b70"}, + {file = "torchvision-0.17.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:7f373507afcd9022ebd9f50b31da8dbac1ea6783ffb77d1f1ab8806425c0a83b"}, + {file = "torchvision-0.17.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:085251ab36340206dc7e1be59a15fa5e307d45ccd66889f5d7bf1ba5e7ecdc57"}, + {file = "torchvision-0.17.0-cp39-cp39-win_amd64.whl", hash = "sha256:4c0d4c0af58af2752aad235150bd794d0f324e6eeac5cd13c440bda5dce622d3"}, ] [package.dependencies] numpy = "*" pillow = ">=5.3.0,<8.3.dev0 || >=8.4.dev0" requests = "*" -torch = "2.1.2" +torch = "2.2.0" [package.extras] scipy = ["scipy"] +[[package]] +name = "tqdm" +version = "4.66.1" +description = "Fast, Extensible Progress Meter" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tqdm-4.66.1-py3-none-any.whl", hash = "sha256:d302b3c5b53d47bce91fea46679d9c3c6508cf6332229aa1e7d8653723793386"}, + {file = "tqdm-4.66.1.tar.gz", hash = "sha256:d88e651f9db8d8551a62556d3cff9e3034274ca5d66e93197cf2490e2dcb69c7"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[package.extras] +dev = ["pytest (>=6)", "pytest-cov", "pytest-timeout", "pytest-xdist"] +notebook = ["ipywidgets (>=6)"] +slack = ["slack-sdk"] +telegram = ["requests"] + [[package]] name = "triton" -version = "2.1.0" +version = "2.2.0" description = "A language and compiler for custom Deep Learning operations" optional = false python-versions = "*" files = [ - {file = "triton-2.1.0-0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:66439923a30d5d48399b08a9eae10370f6c261a5ec864a64983bae63152d39d7"}, - {file = "triton-2.1.0-0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:919b06453f0033ea52c13eaf7833de0e57db3178d23d4e04f9fc71c4f2c32bf8"}, - {file = "triton-2.1.0-0-cp37-cp37m-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ae4bb8a91de790e1866405211c4d618379781188f40d5c4c399766914e84cd94"}, - {file = "triton-2.1.0-0-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:39f6fb6bdccb3e98f3152e3fbea724f1aeae7d749412bbb1fa9c441d474eba26"}, - {file = "triton-2.1.0-0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:21544e522c02005a626c8ad63d39bdff2f31d41069592919ef281e964ed26446"}, - 
{file = "triton-2.1.0-0-pp37-pypy37_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:143582ca31dd89cd982bd3bf53666bab1c7527d41e185f9e3d8a3051ce1b663b"}, - {file = "triton-2.1.0-0-pp38-pypy38_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:82fc5aeeedf6e36be4e4530cbdcba81a09d65c18e02f52dc298696d45721f3bd"}, - {file = "triton-2.1.0-0-pp39-pypy39_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:81a96d110a738ff63339fc892ded095b31bd0d205e3aace262af8400d40b6fa8"}, + {file = "triton-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a2294514340cfe4e8f4f9e5c66c702744c4a117d25e618bd08469d0bfed1e2e5"}, + {file = "triton-2.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da58a152bddb62cafa9a857dd2bc1f886dbf9f9c90a2b5da82157cd2b34392b0"}, + {file = "triton-2.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0af58716e721460a61886668b205963dc4d1e4ac20508cc3f623aef0d70283d5"}, + {file = "triton-2.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8fe46d3ab94a8103e291bd44c741cc294b91d1d81c1a2888254cbf7ff846dab"}, + {file = "triton-2.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8ce26093e539d727e7cf6f6f0d932b1ab0574dc02567e684377630d86723ace"}, + {file = "triton-2.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:227cc6f357c5efcb357f3867ac2a8e7ecea2298cd4606a8ba1e931d1d5a947df"}, ] [package.dependencies] filelock = "*" [package.extras] -build = ["cmake (>=3.18)", "lit"] -tests = ["autopep8", "flake8", "isort", "numpy", "pytest", "scipy (>=1.7.1)"] -tutorials = ["matplotlib", "pandas", "tabulate"] +build = ["cmake (>=3.20)", "lit"] +tests = ["autopep8", "flake8", "isort", "numpy", "pytest", "scipy (>=1.7.1)", "torch"] +tutorials = ["matplotlib", "pandas", "tabulate", "torch"] [[package]] name = "typing-extensions" @@ -1742,17 +1770,18 @@ files = [ [[package]] name = "urllib3" -version = "2.1.0" +version = "2.2.0" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.8" files = [ - {file = "urllib3-2.1.0-py3-none-any.whl", hash = "sha256:55901e917a5896a349ff771be919f8bd99aff50b79fe58fec595eb37bbc56bb3"}, - {file = "urllib3-2.1.0.tar.gz", hash = "sha256:df7aa8afb0148fa78488e7899b2c59b5f4ffcfa82e6c54ccb9dd37c1d7b52d54"}, + {file = "urllib3-2.2.0-py3-none-any.whl", hash = "sha256:ce3711610ddce217e6d113a2732fafad960a03fd0318c91faa79481e35c11224"}, + {file = "urllib3-2.2.0.tar.gz", hash = "sha256:051d961ad0c62a94e50ecf1af379c3aba230c66c710493493560c0c223c49f20"}, ] [package.extras] brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] +h2 = ["h2 (>=4,<5)"] socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] @@ -1764,4 +1793,4 @@ silicon = ["onnxruntime-silicon"] [metadata] lock-version = "2.0" python-versions = ">=3.9" -content-hash = "3939b79300af24dbd367bef1fde0327eaadd6051bf12db2f067c43305102f4a1" +content-hash = "5880967d7da3b018ae3a7d221cf038c3c01e75e59c7cad081892895e5f189624" diff --git a/pyproject.toml b/pyproject.toml index 9aa33a9..a1540d5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "audio-separator" -version = "0.13.1" +version = "0.14.0" description = "Easy to use vocal separation, using MDX-Net models from UVR trained by @Anjok07" authors = ["Andrew Beveridge "] license = "MIT" diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py index f28ceee..5ecf240 100644 --- a/tests/unit/test_cli.py +++ b/tests/unit/test_cli.py @@ -15,15 +15,13 @@ def common_expected_args(): "model_file_dir": "/tmp/audio-separator-models/", "output_dir": None, "output_format": "FLAC", - "denoise_enabled": False, + "enable_denoise": False, "normalization_threshold": 0.9, "output_single_stem": None, "invert_using_spec": False, "sample_rate": 44100, - "hop_length": 1024, - "segment_size": 256, - "overlap": 0.25, - "batch_size": 1, + "mdx_params": {"hop_length": 1024, "segment_size": 256, "overlap": 0.25, "batch_size": 1}, + "vr_params": {"batch_size": 4, "window_size": 512, "aggression": 5, "enable_tta": False, "enable_post_process": False, "post_process_threshold": 0.2, "high_end_process": False}, } @@ -83,8 +81,8 @@ def test_cli_invalid_log_level(): # Test using model name argument -def test_cli_model_name_argument(common_expected_args): - test_args = ["cli.py", "test_audio.mp3", "--model_name=Custom_Model"] +def test_cli_model_filename_argument(common_expected_args): + test_args = ["cli.py", "test_audio.mp3", "--model_filename=Custom_Model.onnx"] with patch("sys.argv", test_args): with patch("audio_separator.separator.Separator") as mock_separator: mock_separator_instance = mock_separator.return_value @@ -93,7 +91,7 @@ def test_cli_model_name_argument(common_expected_args): # Assertions mock_separator.assert_called_once_with(**common_expected_args) - mock_separator_instance.load_model.assert_called_once_with("Custom_Model") + mock_separator_instance.load_model.assert_called_once_with("Custom_Model.onnx") # Test using output directory argument @@ -138,7 +136,7 @@ def test_cli_denoise_argument(common_expected_args): main() # Update expected args for this specific test - common_expected_args["denoise_enabled"] = True + common_expected_args["enable_denoise"] = True # Assertions mock_separator.assert_called_once_with(**common_expected_args) diff --git a/tests/unit/test_spec_utils.py b/tests/unit/test_spec_utils.py deleted file mode 100644 index 4996f5b..0000000 --- a/tests/unit/test_spec_utils.py +++ /dev/null @@ -1,41 +0,0 @@ 
-import unittest -import numpy as np -from audio_separator.separator.spec_utils import crop_center, preprocess, make_padding, wave_to_spectrogram, wave_to_spectrogram_mt - - -class TestSpecUtils(unittest.TestCase): - def test_preprocess(self): - X_spec = np.random.rand(10, 10) + 1j * np.random.rand(10, 10) - X_mag, X_phase = preprocess(X_spec) - self.assertEqual(X_mag.shape, X_spec.shape) - self.assertEqual(X_phase.shape, X_spec.shape) - - def test_make_padding(self): - width, cropsize, offset = 100, 50, 10 - left, right, roi_size = make_padding(width, cropsize, offset) - self.assertEqual(left, 10) - self.assertTrue(right >= left) - self.assertEqual(roi_size, 30) - - def test_preprocess_values(self): - X_spec = np.random.rand(10, 10) + 1j * np.random.rand(10, 10) - X_mag, X_phase = preprocess(X_spec) - self.assertTrue((X_mag >= 0).all()) - self.assertTrue((X_phase >= -np.pi).all() and (X_phase <= np.pi).all()) - - def test_make_padding_values(self): - width, cropsize, offset = 100, 50, 10 - left, right, roi_size = make_padding(width, cropsize, offset) - self.assertTrue(left >= 0) - self.assertTrue(right >= 0) - self.assertTrue(roi_size > 0) - - def test_preprocess_magnitude_phase(self): - X_spec = np.random.rand(5, 5) + 1j * np.random.rand(5, 5) - X_mag, X_phase = preprocess(X_spec) - self.assertTrue(np.all(X_mag >= 0)) - self.assertTrue(np.all(X_phase >= -np.pi) and np.all(X_phase <= np.pi)) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/unit/test_stft.py b/tests/unit/test_stft.py index 49ebe92..7ac92e5 100644 --- a/tests/unit/test_stft.py +++ b/tests/unit/test_stft.py @@ -2,7 +2,7 @@ import numpy as np import torch from unittest.mock import Mock, patch -from audio_separator.separator.stft import STFT +from audio_separator.separator.uvr_lib_v5.stft import STFT # Short-Time Fourier Transform (STFT) Process Overview: #