From 3816645691f9705f495f0e1aa96a88afbe063be0 Mon Sep 17 00:00:00 2001 From: Andrew Beveridge Date: Wed, 21 Feb 2024 02:13:28 -0500 Subject: [PATCH] Added initial WIP implementation of demucs, works for all stems initially, still a lot of cleanup and configuration to do --- .../architectures/demucs_separator.py | 241 ++++- .../separator/architectures/mdx_separator.py | 41 +- audio_separator/separator/common_separator.py | 58 +- audio_separator/separator/separator.py | 32 +- .../separator/uvr_lib_v5/demucs/__init__.py | 5 + .../separator/uvr_lib_v5/demucs/__main__.py | 272 ++++++ .../separator/uvr_lib_v5/demucs/apply.py | 296 ++++++ .../separator/uvr_lib_v5/demucs/demucs.py | 459 ++++++++++ .../separator/uvr_lib_v5/demucs/filtering.py | 502 +++++++++++ .../separator/uvr_lib_v5/demucs/hdemucs.py | 796 +++++++++++++++++ .../separator/uvr_lib_v5/demucs/htdemucs.py | 664 ++++++++++++++ .../separator/uvr_lib_v5/demucs/model.py | 218 +++++ .../separator/uvr_lib_v5/demucs/model_v2.py | 218 +++++ .../separator/uvr_lib_v5/demucs/pretrained.py | 180 ++++ .../separator/uvr_lib_v5/demucs/repo.py | 148 +++ .../separator/uvr_lib_v5/demucs/spec.py | 53 ++ .../separator/uvr_lib_v5/demucs/states.py | 148 +++ .../separator/uvr_lib_v5/demucs/tasnet.py | 447 ++++++++++ .../separator/uvr_lib_v5/demucs/tasnet_v2.py | 452 ++++++++++ .../uvr_lib_v5/demucs/transformer.py | 839 ++++++++++++++++++ .../separator/uvr_lib_v5/demucs/utils.py | 502 +++++++++++ poetry.lock | 225 ++++- pyproject.toml | 4 + 23 files changed, 6725 insertions(+), 75 deletions(-) create mode 100644 audio_separator/separator/uvr_lib_v5/demucs/__init__.py create mode 100644 audio_separator/separator/uvr_lib_v5/demucs/__main__.py create mode 100644 audio_separator/separator/uvr_lib_v5/demucs/apply.py create mode 100644 audio_separator/separator/uvr_lib_v5/demucs/demucs.py create mode 100644 audio_separator/separator/uvr_lib_v5/demucs/filtering.py create mode 100644 audio_separator/separator/uvr_lib_v5/demucs/hdemucs.py create mode 100644 audio_separator/separator/uvr_lib_v5/demucs/htdemucs.py create mode 100644 audio_separator/separator/uvr_lib_v5/demucs/model.py create mode 100644 audio_separator/separator/uvr_lib_v5/demucs/model_v2.py create mode 100644 audio_separator/separator/uvr_lib_v5/demucs/pretrained.py create mode 100644 audio_separator/separator/uvr_lib_v5/demucs/repo.py create mode 100644 audio_separator/separator/uvr_lib_v5/demucs/spec.py create mode 100644 audio_separator/separator/uvr_lib_v5/demucs/states.py create mode 100644 audio_separator/separator/uvr_lib_v5/demucs/tasnet.py create mode 100644 audio_separator/separator/uvr_lib_v5/demucs/tasnet_v2.py create mode 100644 audio_separator/separator/uvr_lib_v5/demucs/transformer.py create mode 100644 audio_separator/separator/uvr_lib_v5/demucs/utils.py diff --git a/audio_separator/separator/architectures/demucs_separator.py b/audio_separator/separator/architectures/demucs_separator.py index e6d913a..af1f060 100644 --- a/audio_separator/separator/architectures/demucs_separator.py +++ b/audio_separator/separator/architectures/demucs_separator.py @@ -1,5 +1,31 @@ +import os import torch +import numpy as np +from pathlib import Path from audio_separator.separator.common_separator import CommonSeparator +from audio_separator.separator.uvr_lib_v5.demucs.apply import apply_model, demucs_segments +from audio_separator.separator.uvr_lib_v5.demucs.hdemucs import HDemucs +from audio_separator.separator.uvr_lib_v5.demucs.pretrained import get_model as get_demucs_model +from 
audio_separator.separator.uvr_lib_v5 import spec_utils + +DEMUCS_2_SOURCE = ["instrumental", "vocals"] +DEMUCS_4_SOURCE = ["drums", "bass", "other", "vocals"] + +DEMUCS_2_SOURCE_MAPPER = {CommonSeparator.INST_STEM: 0, CommonSeparator.VOCAL_STEM: 1} + +DEMUCS_4_SOURCE_MAPPER = {CommonSeparator.BASS_STEM: 0, CommonSeparator.DRUM_STEM: 1, CommonSeparator.OTHER_STEM: 2, CommonSeparator.VOCAL_STEM: 3} + +DEMUCS_6_SOURCE_MAPPER = { + CommonSeparator.BASS_STEM: 0, + CommonSeparator.DRUM_STEM: 1, + CommonSeparator.OTHER_STEM: 2, + CommonSeparator.VOCAL_STEM: 3, + CommonSeparator.GUITAR_STEM: 4, + CommonSeparator.PIANO_STEM: 5, +} + +DEMUCS_4_SOURCE_LIST = [CommonSeparator.BASS_STEM, CommonSeparator.DRUM_STEM, CommonSeparator.OTHER_STEM, CommonSeparator.VOCAL_STEM] +DEMUCS_6_SOURCE_LIST = [CommonSeparator.BASS_STEM, CommonSeparator.DRUM_STEM, CommonSeparator.OTHER_STEM, CommonSeparator.VOCAL_STEM, CommonSeparator.GUITAR_STEM, CommonSeparator.PIANO_STEM] class DemucsSeparator(CommonSeparator): @@ -8,22 +34,207 @@ def __init__(self, common_config, arch_config): # e.g. user-specified functionality choices (self.output_single_stem) or common model parameters (self.primary_stem_name) super().__init__(config=common_config) - # Model data is basic overview metadata about the model, e.g. which stem is primary and whether it's a karaoke model - # It's loaded in from model_data_new.json in Separator.load_model and there are JSON examples in that method - # The instance variable self.model_data is passed through from Separator and set in CommonSeparator - self.logger.debug(f"Model data: {self.model_data}") + # Initializing user-configurable parameters, passed through with an mdx_from the CLI or Separator instance + + # 'Select a stem for extraction with the chosen model:\n\n' + # '• All Stems - Extracts all available stems.\n' + # '• Vocals - Only the "vocals" stem.\n' + # '• Other - Only the "other" stem.\n' + # '• Bass - Only the "bass" stem.\n' + # '• Drums - Only the "drums" stem.' + self.selected_stem = arch_config.get("selected_stem", [CommonSeparator.ALL_STEMS]) - # Initialize Demucs-specific parameters - self.model_path = arch_config["model_path"] - self.load_model() + # Adjust segments to manage RAM or V-RAM usage: + # - Smaller sizes consume less resources. + # - Bigger sizes consume more resources, but may provide better results. + # - "Default" picks the optimal size. + # DEMUCS_SEGMENTS = (DEF_OPT, '1', '5', '10', '15', '20', + # '25', '30', '35', '40', '45', '50', + # '55', '60', '65', '70', '75', '80', + # '85', '90', '95', '100') + self.segment_size = arch_config.get("segment_size", "Default") - def load_model(self): - # Load the Demucs model for inference - # This is a placeholder; actual implementation will depend on the model specifics - self.model = torch.load(self.model_path) - self.model.eval() + # Performs multiple predictions with random shifts of the input and averages them. + # The higher number of shifts, the longer the prediction will take. + # Not recommended unless you have a GPU. + # DEMUCS_SHIFTS = (0, 1, 2, 3, 4, 5, + # 6, 7, 8, 9, 10, 11, + # 12, 13, 14, 15, 16, 17, + # 18, 19, 20) + self.shifts = arch_config.get("shifts", 2) + + # This option controls the amount of overlap between prediction windows. + # - Higher values can provide better results, but will lead to longer processing times. + # - You can choose between 0.001-0.999 + # DEMUCS_OVERLAP = (0.25, 0.50, 0.75, 0.99) + self.overlap = arch_config.get("overlap", 0.25) + + # Enables "Segments". 
Deselecting this option is only recommended for those with powerful PCs. + self.segments_enabled = arch_config.get("segments_enabled", 2) + + self.demucs_source_list, self.demucs_source_map, self.demucs_stem_count = DEMUCS_4_SOURCE, DEMUCS_4_SOURCE_MAPPER, 4 + + self.primary_stem = CommonSeparator.PRIMARY_STEM if self.selected_stem == CommonSeparator.ALL_STEMS else self.selected_stem + self.secondary_stem = CommonSeparator.SECONDARY_STEM + + self.audio_file_path = None + self.audio_file_base = None + self.demucs_model_instance = None def separate(self, audio_file_path): - # Implement the separation logic using the Demucs model - # This is a placeholder; actual implementation will depend on the model specifics - pass + self.logger.debug("SeperateDemucs: Starting separation process...") + source = None + stem_source = None + inst_source = {} + + self.audio_file_path = audio_file_path + self.audio_file_base = os.path.splitext(os.path.basename(audio_file_path))[0] + + # Prepare the mix for processing + self.logger.debug("Preparing mix...") + mix = self.prepare_mix(self.audio_file_path) + + self.logger.debug(f"SeperateDemucs: Mix prepared for demixing. Shape: {mix.shape}") + + self.logger.debug("SeperateDemucs: Loading model for demixing...") + + self.demucs_model_instance = HDemucs(sources=self.demucs_source_list) + self.demucs_model_instance = get_demucs_model(name=os.path.splitext(os.path.basename(self.model_path))[0], repo=Path(os.path.dirname(self.model_path))) + self.demucs_model_instance = demucs_segments(self.segment_size, self.demucs_model_instance) + self.demucs_model_instance.to(self.torch_device) + self.demucs_model_instance.eval() + + self.logger.debug("SeperateDemucs: Model loaded and set to evaluation mode.") + + source = self.demix_demucs(mix) + + del self.demucs_model_instance + self.clear_gpu_cache() + self.logger.debug("SeperateDemucs: Model and GPU cache cleared after demixing.") + + if isinstance(inst_source, np.ndarray): + self.logger.debug("SeperateDemucs: Processing instance source...") + source_reshape = spec_utils.reshape_sources(inst_source[self.demucs_source_map[CommonSeparator.VOCAL_STEM]], source[self.demucs_source_map[CommonSeparator.VOCAL_STEM]]) + inst_source[self.demucs_source_map[CommonSeparator.VOCAL_STEM]] = source_reshape + source = inst_source + + if isinstance(source, np.ndarray): + self.logger.debug(f"SeperateDemucs: Processing source array, source length is {len(source)}") + if len(source) == 2: + self.logger.debug("SeperateDemucs: Setting source map to 2-stem...") + self.demucs_source_map = DEMUCS_2_SOURCE_MAPPER + else: + self.logger.debug("SeperateDemucs: Setting source map to 4 or 6-stem...") + self.demucs_source_map = DEMUCS_6_SOURCE_MAPPER if len(source) == 6 else DEMUCS_4_SOURCE_MAPPER + + # if self.selected_stem == CommonSeparator.ALL_STEMS: + + self.logger.debug("SeperateDemucs: Processing for all stems...") + for stem_name, stem_value in self.demucs_source_map.items(): + stem_path = os.path.join(f"{self.audio_file_base}_({stem_name})_{self.model_name}.{self.output_format.lower()}") + stem_source = source[stem_value].T + + self.final_process(stem_path, stem_source, stem_name) + + # else: + # def secondary_save(sec_stem_name, source, raw_mixture=None, is_inst_mixture=False): + # self.logger.debug(f"SeperateDemucs: Saving secondary stem: {sec_stem_name}") + # secondary_source = self.secondary_source if not is_inst_mixture else None + # secondary_stem_path = os.path.join(self.export_path, f"{self.audio_file_base}_({sec_stem_name}).wav") + # 
secondary_source_secondary = None + + # if not isinstance(secondary_source, np.ndarray): + # if self.is_demucs_combine_stems: + # source = list(source) + # if is_inst_mixture: + # source = [i for n, i in enumerate(source) if not n in [self.demucs_source_map[self.primary_stem], self.demucs_source_map[CommonSeparator.VOCAL_STEM]]] + # else: + # source.pop(self.demucs_source_map[self.primary_stem]) + + # source = source[: len(source) - 2] if is_no_piano_guitar else source + # secondary_source = np.zeros_like(source[0]) + # for i in source: + # secondary_source += i + # secondary_source = secondary_source.T + # else: + # if not isinstance(raw_mixture, np.ndarray): + # raw_mixture = self.prepare_mix(self.audio_file) + + # secondary_source = source[self.demucs_source_map[self.primary_stem]] + + # if self.is_invert_spec: + # secondary_source = spec_utils.invert_stem(raw_mixture, secondary_source) + # else: + # raw_mixture = spec_utils.reshape_sources(secondary_source, raw_mixture) + # secondary_source = -secondary_source.T + raw_mixture.T + + # if not is_inst_mixture: + # self.secondary_source = secondary_source + # secondary_source_secondary = self.secondary_source_secondary + # self.secondary_source = self.process_secondary_stem(secondary_source, secondary_source_secondary) + # self.secondary_source_map = {self.secondary_stem: self.secondary_source} + + # self.write_audio(secondary_stem_path, secondary_source, samplerate, stem_name=sec_stem_name) + + # secondary_save(self.secondary_stem, source, raw_mixture=mix) + + # if self.is_demucs_pre_proc_model_inst_mix and self.pre_proc_model and not self.is_4_stem_ensemble: + # secondary_save(f"{self.secondary_stem} {CommonSeparator.INST_STEM}", source, raw_mixture=inst_mix, is_inst_mixture=True) + + # if not self.is_secondary_stem_only: + # primary_stem_path = os.path.join(self.export_path, f"{self.audio_file_base}_({self.primary_stem}).wav") + # if not isinstance(self.primary_source, np.ndarray): + # self.primary_source = source[self.demucs_source_map[self.primary_stem]].T + + # self.primary_source_map = self.final_process(primary_stem_path, self.primary_source, self.secondary_source_primary, self.primary_stem, samplerate) + + # secondary_sources = {**self.primary_source_map, **self.secondary_source_map} + + # self.process_vocal_split_chain(secondary_sources) + + # if self.is_secondary_model: + # return secondary_sources + + def demix_demucs(self, mix): + """ + Demixes the input mix using the demucs model. 
+ """ + self.logger.debug("SeperateDemucs: Starting demixing process in demix_demucs...") + org_mix = mix + + # if self.is_pitch_change: + # self.logger.debug("SeperateDemucs: Applying pitch change...") + # mix, sr_pitched = spec_utils.change_pitch_semitones(mix, 44100, semitone_shift=-self.semitone_shift) + + processed = {} + mix = torch.tensor(mix, dtype=torch.float32) + ref = mix.mean(0) + mix = (mix - ref.mean()) / ref.std() + mix_infer = mix + + with torch.no_grad(): + self.logger.debug("SeperateDemucs: Running model inference...") + sources = apply_model( + model=self.demucs_model_instance, + mix=mix_infer[None], + shifts=self.shifts, + split=self.segments_enabled, + overlap=self.overlap, + static_shifts=1 if self.shifts == 0 else self.shifts, + set_progress_bar=None, + device=self.torch_device, + )[0] + + sources = (sources * ref.std() + ref.mean()).cpu().numpy() + sources[[0, 1]] = sources[[1, 0]] + processed[mix] = sources[:, :, 0:None].copy() + sources = list(processed.values()) + sources = [s[:, :, 0:None] for s in sources] + # sources = [self.pitch_fix(s[:,:,0:None], sr_pitched, org_mix) if self.is_pitch_change else s[:,:,0:None] for s in sources] + sources = np.concatenate(sources, axis=-1) + + # if self.is_pitch_change: + # self.logger.debug("SeperateDemucs: Fixing pitch post-demixing...") + # sources = np.stack([self.pitch_fix(stem, sr_pitched, org_mix) for stem in sources]) + + return sources diff --git a/audio_separator/separator/architectures/mdx_separator.py b/audio_separator/separator/architectures/mdx_separator.py index 17cdcba..cd0434d 100644 --- a/audio_separator/separator/architectures/mdx_separator.py +++ b/audio_separator/separator/architectures/mdx_separator.py @@ -2,7 +2,6 @@ import os import torch -import librosa import onnxruntime as ort import numpy as np import onnx2torch @@ -19,6 +18,8 @@ class MDXSeparator(CommonSeparator): """ def __init__(self, common_config, arch_config): + # Any configuration values which can be shared between architectures should be set already in CommonSeparator, + # e.g. user-specified functionality choices (self.output_single_stem) or common model parameters (self.primary_stem_name) super().__init__(config=common_config) # Initializing user-configurable parameters, passed through with an mdx_from the CLI or Separator instance @@ -434,41 +435,3 @@ def run_model(self, mix, is_match_mix=False): self.logger.debug(f"Inverse STFT applied. Returning result with shape: {result.shape}") return result - - def prepare_mix(self, mix): - """ - Prepares the mix for processing. This includes loading the audio from a file if necessary, - ensuring the mix is in the correct format, and converting mono to stereo if needed. - """ - # Store the original path or the mix itself for later checks - audio_path = mix - - # Check if the input is a file path (string) and needs to be loaded - if not isinstance(mix, np.ndarray): - self.logger.debug(f"Loading audio from file: {mix}") - mix, sr = librosa.load(mix, mono=False, sr=self.sample_rate) - self.logger.debug(f"Audio loaded. 
Sample rate: {sr}, Audio shape: {mix.shape}") - else: - # Transpose the mix if it's already an ndarray (expected shape: [channels, samples]) - self.logger.debug("Transposing the provided mix array.") - mix = mix.T - self.logger.debug(f"Transposed mix shape: {mix.shape}") - - # If the original input was a filepath, check if the loaded mix is empty - if isinstance(audio_path, str): - if not np.any(mix): - error_msg = f"Audio file {audio_path} is empty or not valid" - self.logger.error(error_msg) - raise ValueError(error_msg) - else: - self.logger.debug("Audio file is valid and contains data.") - - # Ensure the mix is in stereo format - if mix.ndim == 1: - self.logger.debug("Mix is mono. Converting to stereo.") - mix = np.asfortranarray([mix, mix]) - self.logger.debug("Converted to stereo mix.") - - # Final log indicating successful preparation of the mix - self.logger.debug("Mix preparation completed.") - return mix diff --git a/audio_separator/separator/common_separator.py b/audio_separator/separator/common_separator.py index 4954c30..a691dd1 100644 --- a/audio_separator/separator/common_separator.py +++ b/audio_separator/separator/common_separator.py @@ -2,7 +2,10 @@ from logging import Logger import os +import gc import numpy as np +import librosa +import torch from pydub import AudioSegment from audio_separator.separator.uvr_lib_v5 import spec_utils @@ -80,7 +83,7 @@ def __init__(self, config): self.sample_rate = config.get("sample_rate") # Model specific properties - self.primary_stem_name = self.model_data["primary_stem"] + self.primary_stem_name = self.model_data.get("primary_stem", "Vocals") self.secondary_stem_name = "Vocals" if self.primary_stem_name == "Instrumental" else "Instrumental" self.is_karaoke = self.model_data.get("is_karaoke", False) self.is_bv_model = self.model_data.get("is_bv_model", False) @@ -94,7 +97,7 @@ def __init__(self, config): self.logger.debug(f"Common params: invert_using_spec={self.invert_using_spec}, sample_rate={self.sample_rate}") self.logger.debug(f"Common params: primary_stem_name={self.primary_stem_name}, secondary_stem_name={self.secondary_stem_name}") - self.logger.debug(f"Common params: is_karaoke={self.is_karaoke}, is_bv_model={self.is_bv_model}, bv_model_rebalance={self.bv_model_rebalance}") + self.logger.debug(f"Common params: is_karaoke={self.is_karaoke}, is_bv_model={self.is_bv_model}, bv_model_rebalance={self.bv_model_rebalance}") self.cached_sources_map = {} @@ -156,6 +159,44 @@ def cached_model_source_holder(self, model_architecture, sources, model_name=Non """ self.cached_sources_map[model_architecture] = {**self.cached_sources_map.get(model_architecture, {}), **{model_name: sources}} + def prepare_mix(self, mix): + """ + Prepares the mix for processing. This includes loading the audio from a file if necessary, + ensuring the mix is in the correct format, and converting mono to stereo if needed. + """ + # Store the original path or the mix itself for later checks + audio_path = mix + + # Check if the input is a file path (string) and needs to be loaded + if not isinstance(mix, np.ndarray): + self.logger.debug(f"Loading audio from file: {mix}") + mix, sr = librosa.load(mix, mono=False, sr=self.sample_rate) + self.logger.debug(f"Audio loaded. 
Sample rate: {sr}, Audio shape: {mix.shape}") + else: + # Transpose the mix if it's already an ndarray (expected shape: [channels, samples]) + self.logger.debug("Transposing the provided mix array.") + mix = mix.T + self.logger.debug(f"Transposed mix shape: {mix.shape}") + + # If the original input was a filepath, check if the loaded mix is empty + if isinstance(audio_path, str): + if not np.any(mix): + error_msg = f"Audio file {audio_path} is empty or not valid" + self.logger.error(error_msg) + raise ValueError(error_msg) + else: + self.logger.debug("Audio file is valid and contains data.") + + # Ensure the mix is in stereo format + if mix.ndim == 1: + self.logger.debug("Mix is mono. Converting to stereo.") + mix = np.asfortranarray([mix, mix]) + self.logger.debug("Converted to stereo mix.") + + # Final log indicating successful preparation of the mix + self.logger.debug("Mix preparation completed.") + return mix + def write_audio(self, stem_path: str, stem_source): """ Writes the separated audio source to a file. @@ -212,3 +253,16 @@ def write_audio(self, stem_path: str, stem_source): self.logger.debug(f"Exported audio file successfully to {stem_path}") except (IOError, ValueError) as e: self.logger.error(f"Error exporting audio file: {e}") + + def clear_gpu_cache(self): + """ + This method clears the GPU cache to free up memory. + """ + self.logger.debug("Running garbage collection...") + gc.collect() + if self.torch_device == torch.device("mps"): + self.logger.debug("Clearing MPS cache...") + torch.mps.empty_cache() + if self.torch_device == torch.device("cuda"): + self.logger.debug("Clearing CUDA cache...") + torch.cuda.empty_cache() diff --git a/audio_separator/separator/separator.py b/audio_separator/separator/separator.py index 2bfb3bb..2142bf3 100644 --- a/audio_separator/separator/separator.py +++ b/audio_separator/separator/separator.py @@ -2,7 +2,6 @@ from importlib import metadata import os -import gc import platform import subprocess @@ -15,7 +14,6 @@ import requests import torch import onnxruntime as ort -from audio_separator.separator.architectures import MDXSeparator, VRSeparator, DemucsSeparator class Separator: @@ -300,19 +298,6 @@ def download_file_if_not_exists(self, url, output_path): else: raise RuntimeError(f"Failed to download file from {url}, response code: {response.status_code}") - def clear_gpu_cache(self): - """ - This method clears the GPU cache to free up memory. - """ - self.logger.debug("Running garbage collection...") - gc.collect() - if self.torch_device == torch.device("mps"): - self.logger.debug("Clearing MPS cache...") - torch.mps.empty_cache() - if self.torch_device == torch.device("cuda"): - self.logger.debug("Clearing CUDA cache...") - torch.cuda.empty_cache() - def list_supported_model_files(self): """ This method lists the supported model files for audio-separator, by fetching the same file UVR uses to list these. 
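
As a minimal sketch of how the new Demucs path introduced by this patch might be exercised end to end (the model filename is illustrative, and the shifts/overlap/segment_size defaults from DemucsSeparator.__init__ apply unless overridden elsewhere):

from audio_separator.separator import Separator

separator = Separator()

# Loading a Demucs v4 model makes load_model() (see the following hunks)
# instantiate DemucsSeparator rather than MDXSeparator.
separator.load_model("htdemucs_ft.yaml")

# DemucsSeparator.separate() writes one output file per stem
# (drums, bass, other and vocals for a 4-source model).
separator.separate("/path/to/mix.wav")
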
@@ -364,11 +349,14 @@ def list_supported_model_files(self): # } # } + # Only show Demucs v4 models as we've only implemented support for v4 + filtered_demucs_v4 = {key: value for key, value in model_downloads_list["demucs_download_list"].items() if key.startswith("Demucs v4")} + # Return object with list of model names, which are the keys in vr_download_list, mdx_download_list, demucs_download_list, mdx23_download_list, mdx23c_download_list, grouped by type: VR, MDX, Demucs, MDX23, MDX23C model_files_grouped_by_type = { "VR": model_downloads_list["vr_download_list"], "MDX": model_downloads_list["mdx_download_list"], - "Demucs": model_downloads_list["demucs_download_list"], + "Demucs": filtered_demucs_v4, "MDX23": model_downloads_list["mdx23_download_list"], "MDX23C": model_downloads_list["mdx23c_download_list"], } @@ -606,7 +594,15 @@ def load_model(self, model_filename="UVR-MDX-NET-Inst_HQ_3.onnx"): if model_type not in self.arch_specific_params: raise ValueError(f"Model type not supported (yet): {model_type}") - self.model_instance = MDXSeparator(common_config=common_params, arch_config=self.arch_specific_params[model_type]) + # Instantiate the appropriate separator class depending on the model type + separator_classes = {"MDX": "MDXSeparator", "VR": "VRSeparator", "Demucs": "DemucsSeparator"} + + if model_type not in separator_classes: + raise ValueError(f"Model type not supported (yet): {model_type}") + + module = __import__("audio_separator.separator.architectures", fromlist=[separator_classes[model_type]]) + separator_class = getattr(module, separator_classes[model_type]) + self.model_instance = separator_class(common_config=common_params, arch_config=self.arch_specific_params[model_type]) # Log the completion of the model load process self.logger.debug("Loading model completed.") @@ -641,7 +637,7 @@ def separate(self, audio_file_path): output_files = self.model_instance.separate(audio_file_path) # Clear GPU cache to free up memory - self.clear_gpu_cache() + self.model_instance.clear_gpu_cache() # Unset the audio file to prevent accidental re-separation of the same file self.logger.debug("Clearing audio file...") diff --git a/audio_separator/separator/uvr_lib_v5/demucs/__init__.py b/audio_separator/separator/uvr_lib_v5/demucs/__init__.py new file mode 100644 index 0000000..5656d59 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/demucs/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. diff --git a/audio_separator/separator/uvr_lib_v5/demucs/__main__.py b/audio_separator/separator/uvr_lib_v5/demucs/__main__.py new file mode 100644 index 0000000..5de878f --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/demucs/__main__.py @@ -0,0 +1,272 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
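+# NOTE: this vendored training entrypoint imports .augment, .compressed, .parser,
+# .raw, .test and .train, none of which are included in this patch, so it cannot be
+# imported or run as-is; DemucsSeparator itself only uses the apply, hdemucs and
+# pretrained modules.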
+ +import json +import os +import sys +import time +from dataclasses import dataclass, field +from fractions import Fraction + +import torch as th +from torch import distributed, nn +from torch.nn.parallel.distributed import DistributedDataParallel + +from .augment import FlipChannels, FlipSign, Remix, Shift +from .compressed import StemsSet, build_musdb_metadata, get_musdb_tracks +from .model import Demucs +from .parser import get_name, get_parser +from .raw import Rawset +from .tasnet import ConvTasNet +from .test import evaluate +from .train import train_model, validate_model +from .utils import human_seconds, load_model, save_model, sizeof_fmt + + +@dataclass +class SavedState: + metrics: list = field(default_factory=list) + last_state: dict = None + best_state: dict = None + optimizer: dict = None + + +def main(): + parser = get_parser() + args = parser.parse_args() + name = get_name(parser, args) + print(f"Experiment {name}") + + if args.musdb is None and args.rank == 0: + print( + "You must provide the path to the MusDB dataset with the --musdb flag. " + "To download the MusDB dataset, see https://sigsep.github.io/datasets/musdb.html.", + file=sys.stderr) + sys.exit(1) + + eval_folder = args.evals / name + eval_folder.mkdir(exist_ok=True, parents=True) + args.logs.mkdir(exist_ok=True) + metrics_path = args.logs / f"{name}.json" + eval_folder.mkdir(exist_ok=True, parents=True) + args.checkpoints.mkdir(exist_ok=True, parents=True) + args.models.mkdir(exist_ok=True, parents=True) + + if args.device is None: + device = "cpu" + if th.cuda.is_available(): + device = "cuda" + else: + device = args.device + + th.manual_seed(args.seed) + # Prevents too many threads to be started when running `museval` as it can be quite + # inefficient on NUMA architectures. 
+ os.environ["OMP_NUM_THREADS"] = "1" + + if args.world_size > 1: + if device != "cuda" and args.rank == 0: + print("Error: distributed training is only available with cuda device", file=sys.stderr) + sys.exit(1) + th.cuda.set_device(args.rank % th.cuda.device_count()) + distributed.init_process_group(backend="nccl", + init_method="tcp://" + args.master, + rank=args.rank, + world_size=args.world_size) + + checkpoint = args.checkpoints / f"{name}.th" + checkpoint_tmp = args.checkpoints / f"{name}.th.tmp" + if args.restart and checkpoint.exists(): + checkpoint.unlink() + + if args.test: + args.epochs = 1 + args.repeat = 0 + model = load_model(args.models / args.test) + elif args.tasnet: + model = ConvTasNet(audio_channels=args.audio_channels, samplerate=args.samplerate, X=args.X) + else: + model = Demucs( + audio_channels=args.audio_channels, + channels=args.channels, + context=args.context, + depth=args.depth, + glu=args.glu, + growth=args.growth, + kernel_size=args.kernel_size, + lstm_layers=args.lstm_layers, + rescale=args.rescale, + rewrite=args.rewrite, + sources=4, + stride=args.conv_stride, + upsample=args.upsample, + samplerate=args.samplerate + ) + model.to(device) + if args.show: + print(model) + size = sizeof_fmt(4 * sum(p.numel() for p in model.parameters())) + print(f"Model size {size}") + return + + optimizer = th.optim.Adam(model.parameters(), lr=args.lr) + + try: + saved = th.load(checkpoint, map_location='cpu') + except IOError: + saved = SavedState() + else: + model.load_state_dict(saved.last_state) + optimizer.load_state_dict(saved.optimizer) + + if args.save_model: + if args.rank == 0: + model.to("cpu") + model.load_state_dict(saved.best_state) + save_model(model, args.models / f"{name}.th") + return + + if args.rank == 0: + done = args.logs / f"{name}.done" + if done.exists(): + done.unlink() + + if args.augment: + augment = nn.Sequential(FlipSign(), FlipChannels(), Shift(args.data_stride), + Remix(group_size=args.remix_group_size)).to(device) + else: + augment = Shift(args.data_stride) + + if args.mse: + criterion = nn.MSELoss() + else: + criterion = nn.L1Loss() + + # Setting number of samples so that all convolution windows are full. + # Prevents hard to debug mistake with the prediction being shifted compared + # to the input mixture. 
+ samples = model.valid_length(args.samples) + print(f"Number of training samples adjusted to {samples}") + + if args.raw: + train_set = Rawset(args.raw / "train", + samples=samples + args.data_stride, + channels=args.audio_channels, + streams=[0, 1, 2, 3, 4], + stride=args.data_stride) + + valid_set = Rawset(args.raw / "valid", channels=args.audio_channels) + else: + if not args.metadata.is_file() and args.rank == 0: + build_musdb_metadata(args.metadata, args.musdb, args.workers) + if args.world_size > 1: + distributed.barrier() + metadata = json.load(open(args.metadata)) + duration = Fraction(samples + args.data_stride, args.samplerate) + stride = Fraction(args.data_stride, args.samplerate) + train_set = StemsSet(get_musdb_tracks(args.musdb, subsets=["train"], split="train"), + metadata, + duration=duration, + stride=stride, + samplerate=args.samplerate, + channels=args.audio_channels) + valid_set = StemsSet(get_musdb_tracks(args.musdb, subsets=["train"], split="valid"), + metadata, + samplerate=args.samplerate, + channels=args.audio_channels) + + best_loss = float("inf") + for epoch, metrics in enumerate(saved.metrics): + print(f"Epoch {epoch:03d}: " + f"train={metrics['train']:.8f} " + f"valid={metrics['valid']:.8f} " + f"best={metrics['best']:.4f} " + f"duration={human_seconds(metrics['duration'])}") + best_loss = metrics['best'] + + if args.world_size > 1: + dmodel = DistributedDataParallel(model, + device_ids=[th.cuda.current_device()], + output_device=th.cuda.current_device()) + else: + dmodel = model + + for epoch in range(len(saved.metrics), args.epochs): + begin = time.time() + model.train() + train_loss = train_model(epoch, + train_set, + dmodel, + criterion, + optimizer, + augment, + batch_size=args.batch_size, + device=device, + repeat=args.repeat, + seed=args.seed, + workers=args.workers, + world_size=args.world_size) + model.eval() + valid_loss = validate_model(epoch, + valid_set, + model, + criterion, + device=device, + rank=args.rank, + split=args.split_valid, + world_size=args.world_size) + + duration = time.time() - begin + if valid_loss < best_loss: + best_loss = valid_loss + saved.best_state = { + key: value.to("cpu").clone() + for key, value in model.state_dict().items() + } + saved.metrics.append({ + "train": train_loss, + "valid": valid_loss, + "best": best_loss, + "duration": duration + }) + if args.rank == 0: + json.dump(saved.metrics, open(metrics_path, "w")) + + saved.last_state = model.state_dict() + saved.optimizer = optimizer.state_dict() + if args.rank == 0 and not args.test: + th.save(saved, checkpoint_tmp) + checkpoint_tmp.rename(checkpoint) + + print(f"Epoch {epoch:03d}: " + f"train={train_loss:.8f} valid={valid_loss:.8f} best={best_loss:.4f} " + f"duration={human_seconds(duration)}") + + del dmodel + model.load_state_dict(saved.best_state) + if args.eval_cpu: + device = "cpu" + model.to(device) + model.eval() + evaluate(model, + args.musdb, + eval_folder, + rank=args.rank, + world_size=args.world_size, + device=device, + save=args.save, + split=args.split_valid, + shifts=args.shifts, + workers=args.eval_workers) + model.to("cpu") + save_model(model, args.models / f"{name}.th") + if args.rank == 0: + print("done") + done.write_text("done") + + +if __name__ == "__main__": + main() diff --git a/audio_separator/separator/uvr_lib_v5/demucs/apply.py b/audio_separator/separator/uvr_lib_v5/demucs/apply.py new file mode 100644 index 0000000..5481f49 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/demucs/apply.py @@ -0,0 +1,296 @@ +# Copyright (c) 
Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +""" +Code to apply a model to a mix. It will handle chunking with overlaps and +inteprolation between chunks, as well as the "shift trick". +""" +from concurrent.futures import ThreadPoolExecutor +import random +import typing as tp +from multiprocessing import Process, Queue, Pipe + +import torch as th +from torch import nn +from torch.nn import functional as F +import tqdm +import tkinter as tk + +from .demucs import Demucs +from .hdemucs import HDemucs +from .utils import center_trim, DummyPoolExecutor + +Model = tp.Union[Demucs, HDemucs] + +progress_bar_num = 0 + + +class BagOfModels(nn.Module): + def __init__(self, models: tp.List[Model], weights: tp.Optional[tp.List[tp.List[float]]] = None, segment: tp.Optional[float] = None): + """ + Represents a bag of models with specific weights. + You should call `apply_model` rather than calling directly the forward here for + optimal performance. + + Args: + models (list[nn.Module]): list of Demucs/HDemucs models. + weights (list[list[float]]): list of weights. If None, assumed to + be all ones, otherwise it should be a list of N list (N number of models), + each containing S floats (S number of sources). + segment (None or float): overrides the `segment` attribute of each model + (this is performed inplace, be careful if you reuse the models passed). + """ + + super().__init__() + assert len(models) > 0 + first = models[0] + for other in models: + assert other.sources == first.sources + assert other.samplerate == first.samplerate + assert other.audio_channels == first.audio_channels + if segment is not None: + other.segment = segment + + self.audio_channels = first.audio_channels + self.samplerate = first.samplerate + self.sources = first.sources + self.models = nn.ModuleList(models) + + if weights is None: + weights = [[1.0 for _ in first.sources] for _ in models] + else: + assert len(weights) == len(models) + for weight in weights: + assert len(weight) == len(first.sources) + self.weights = weights + + def forward(self, x): + raise NotImplementedError("Call `apply_model` on this.") + + +class TensorChunk: + def __init__(self, tensor, offset=0, length=None): + total_length = tensor.shape[-1] + assert offset >= 0 + assert offset < total_length + + if length is None: + length = total_length - offset + else: + length = min(total_length - offset, length) + + if isinstance(tensor, TensorChunk): + self.tensor = tensor.tensor + self.offset = offset + tensor.offset + else: + self.tensor = tensor + self.offset = offset + self.length = length + self.device = tensor.device + + @property + def shape(self): + shape = list(self.tensor.shape) + shape[-1] = self.length + return shape + + def padded(self, target_length): + delta = target_length - self.length + total_length = self.tensor.shape[-1] + assert delta >= 0 + + start = self.offset - delta // 2 + end = start + target_length + + correct_start = max(0, start) + correct_end = min(total_length, end) + + pad_left = correct_start - start + pad_right = end - correct_end + + out = F.pad(self.tensor[..., correct_start:correct_end], (pad_left, pad_right)) + assert out.shape[-1] == target_length + return out + + +def tensor_chunk(tensor_or_chunk): + if isinstance(tensor_or_chunk, TensorChunk): + return tensor_or_chunk + else: + assert isinstance(tensor_or_chunk, th.Tensor) + return TensorChunk(tensor_or_chunk) + + +def 
apply_model(model, mix, shifts=1, split=True, overlap=0.25, transition_power=1.0, static_shifts=1, set_progress_bar=None, device=None, progress=False, num_workers=0, pool=None): + """ + Apply model to a given mixture. + + Args: + shifts (int): if > 0, will shift in time `mix` by a random amount between 0 and 0.5 sec + and apply the oppositve shift to the output. This is repeated `shifts` time and + all predictions are averaged. This effectively makes the model time equivariant + and improves SDR by up to 0.2 points. + split (bool): if True, the input will be broken down in 8 seconds extracts + and predictions will be performed individually on each and concatenated. + Useful for model with large memory footprint like Tasnet. + progress (bool): if True, show a progress bar (requires split=True) + device (torch.device, str, or None): if provided, device on which to + execute the computation, otherwise `mix.device` is assumed. + When `device` is different from `mix.device`, only local computations will + be on `device`, while the entire tracks will be stored on `mix.device`. + """ + + global fut_length + global bag_num + global prog_bar + + if device is None: + device = mix.device + else: + device = th.device(device) + if pool is None: + if num_workers > 0 and device.type == "cpu": + pool = ThreadPoolExecutor(num_workers) + else: + pool = DummyPoolExecutor() + + kwargs = { + "shifts": shifts, + "split": split, + "overlap": overlap, + "transition_power": transition_power, + "progress": progress, + "device": device, + "pool": pool, + "set_progress_bar": set_progress_bar, + "static_shifts": static_shifts, + } + + if isinstance(model, BagOfModels): + # Special treatment for bag of model. + # We explicitely apply multiple times `apply_model` so that the random shifts + # are different for each model. + + estimates = 0 + totals = [0] * len(model.sources) + bag_num = len(model.models) + fut_length = 0 + prog_bar = 0 + current_model = 0 # (bag_num + 1) + for sub_model, weight in zip(model.models, model.weights): + original_model_device = next(iter(sub_model.parameters())).device + sub_model.to(device) + fut_length += fut_length + current_model += 1 + out = apply_model(sub_model, mix, **kwargs) + sub_model.to(original_model_device) + for k, inst_weight in enumerate(weight): + out[:, k, :, :] *= inst_weight + totals[k] += inst_weight + estimates += out + del out + + for k in range(estimates.shape[1]): + estimates[:, k, :, :] /= totals[k] + return estimates + + model.to(device) + model.eval() + assert transition_power >= 1, "transition_power < 1 leads to weird behavior." + batch, channels, length = mix.shape + + if shifts: + kwargs["shifts"] = 0 + max_shift = int(0.5 * model.samplerate) + mix = tensor_chunk(mix) + padded_mix = mix.padded(length + 2 * max_shift) + out = 0 + for _ in range(shifts): + offset = random.randint(0, max_shift) + shifted = TensorChunk(padded_mix, offset, length + max_shift - offset) + shifted_out = apply_model(model, shifted, **kwargs) + out += shifted_out[..., max_shift - offset :] + out /= shifts + return out + elif split: + kwargs["split"] = False + out = th.zeros(batch, len(model.sources), channels, length, device=mix.device) + sum_weight = th.zeros(length, device=mix.device) + segment = int(model.samplerate * model.segment) + stride = int((1 - overlap) * segment) + offsets = range(0, length, stride) + scale = float(format(stride / model.samplerate, ".2f")) + # We start from a triangle shaped weight, with maximal weight in the middle + # of the segment. 
Then we normalize and take to the power `transition_power`. + # Large values of transition power will lead to sharper transitions. + weight = th.cat([th.arange(1, segment // 2 + 1, device=device), th.arange(segment - segment // 2, 0, -1, device=device)]) + assert len(weight) == segment + # If the overlap < 50%, this will translate to linear transition when + # transition_power is 1. + weight = (weight / weight.max()) ** transition_power + futures = [] + for offset in offsets: + chunk = TensorChunk(mix, offset, segment) + future = pool.submit(apply_model, model, chunk, **kwargs) + futures.append((future, offset)) + offset += segment + if progress: + futures = tqdm.tqdm(futures, unit_scale=scale, ncols=120, unit="seconds") + for future, offset in futures: + if set_progress_bar: + fut_length = len(futures) * bag_num * static_shifts + prog_bar += 1 + set_progress_bar(0.1, (0.8 / fut_length * prog_bar)) + chunk_out = future.result() + chunk_length = chunk_out.shape[-1] + out[..., offset : offset + segment] += (weight[:chunk_length] * chunk_out).to(mix.device) + sum_weight[offset : offset + segment] += weight[:chunk_length].to(mix.device) + assert sum_weight.min() > 0 + out /= sum_weight + return out + else: + if hasattr(model, "valid_length"): + valid_length = model.valid_length(length) + else: + valid_length = length + mix = tensor_chunk(mix) + padded_mix = mix.padded(valid_length).to(device) + with th.no_grad(): + out = model(padded_mix) + return center_trim(out, length) + + +def demucs_segments(demucs_segment, demucs_model): + + if demucs_segment == "Default": + segment = None + if isinstance(demucs_model, BagOfModels): + if segment is not None: + for sub in demucs_model.models: + sub.segment = segment + else: + if segment is not None: + sub.segment = segment + else: + try: + segment = int(demucs_segment) + if isinstance(demucs_model, BagOfModels): + if segment is not None: + for sub in demucs_model.models: + sub.segment = segment + else: + if segment is not None: + sub.segment = segment + except: + segment = None + if isinstance(demucs_model, BagOfModels): + if segment is not None: + for sub in demucs_model.models: + sub.segment = segment + else: + if segment is not None: + sub.segment = segment + + return demucs_model diff --git a/audio_separator/separator/uvr_lib_v5/demucs/demucs.py b/audio_separator/separator/uvr_lib_v5/demucs/demucs.py new file mode 100644 index 0000000..d2c08e7 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/demucs/demucs.py @@ -0,0 +1,459 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import math +import typing as tp + +import julius +import torch +from torch import nn +from torch.nn import functional as F + +from .states import capture_init +from .utils import center_trim, unfold + + +class BLSTM(nn.Module): + """ + BiLSTM with same hidden units as input dim. + If `max_steps` is not None, input will be splitting in overlapping + chunks and the LSTM applied separately on each chunk. 
+ """ + def __init__(self, dim, layers=1, max_steps=None, skip=False): + super().__init__() + assert max_steps is None or max_steps % 4 == 0 + self.max_steps = max_steps + self.lstm = nn.LSTM(bidirectional=True, num_layers=layers, hidden_size=dim, input_size=dim) + self.linear = nn.Linear(2 * dim, dim) + self.skip = skip + + def forward(self, x): + B, C, T = x.shape + y = x + framed = False + if self.max_steps is not None and T > self.max_steps: + width = self.max_steps + stride = width // 2 + frames = unfold(x, width, stride) + nframes = frames.shape[2] + framed = True + x = frames.permute(0, 2, 1, 3).reshape(-1, C, width) + + x = x.permute(2, 0, 1) + + x = self.lstm(x)[0] + x = self.linear(x) + x = x.permute(1, 2, 0) + if framed: + out = [] + frames = x.reshape(B, -1, C, width) + limit = stride // 2 + for k in range(nframes): + if k == 0: + out.append(frames[:, k, :, :-limit]) + elif k == nframes - 1: + out.append(frames[:, k, :, limit:]) + else: + out.append(frames[:, k, :, limit:-limit]) + out = torch.cat(out, -1) + out = out[..., :T] + x = out + if self.skip: + x = x + y + return x + + +def rescale_conv(conv, reference): + """Rescale initial weight scale. It is unclear why it helps but it certainly does. + """ + std = conv.weight.std().detach() + scale = (std / reference)**0.5 + conv.weight.data /= scale + if conv.bias is not None: + conv.bias.data /= scale + + +def rescale_module(module, reference): + for sub in module.modules(): + if isinstance(sub, (nn.Conv1d, nn.ConvTranspose1d, nn.Conv2d, nn.ConvTranspose2d)): + rescale_conv(sub, reference) + + +class LayerScale(nn.Module): + """Layer scale from [Touvron et al 2021] (https://arxiv.org/pdf/2103.17239.pdf). + This rescales diagonaly residual outputs close to 0 initially, then learnt. + """ + def __init__(self, channels: int, init: float = 0): + super().__init__() + self.scale = nn.Parameter(torch.zeros(channels, requires_grad=True)) + self.scale.data[:] = init + + def forward(self, x): + return self.scale[:, None] * x + + +class DConv(nn.Module): + """ + New residual branches in each encoder layer. + This alternates dilated convolutions, potentially with LSTMs and attention. + Also before entering each residual branch, dimension is projected on a smaller subspace, + e.g. of dim `channels // compress`. + """ + def __init__(self, channels: int, compress: float = 4, depth: int = 2, init: float = 1e-4, + norm=True, attn=False, heads=4, ndecay=4, lstm=False, gelu=True, + kernel=3, dilate=True): + """ + Args: + channels: input/output channels for residual branch. + compress: amount of channel compression inside the branch. + depth: number of layers in the residual branch. Each layer has its own + projection, and potentially LSTM and attention. + init: initial scale for LayerNorm. + norm: use GroupNorm. + attn: use LocalAttention. + heads: number of heads for the LocalAttention. + ndecay: number of decay controls in the LocalAttention. + lstm: use LSTM. + gelu: Use GELU activation. + kernel: kernel size for the (dilated) convolutions. + dilate: if true, use dilation, increasing with the depth. 
+ """ + + super().__init__() + assert kernel % 2 == 1 + self.channels = channels + self.compress = compress + self.depth = abs(depth) + dilate = depth > 0 + + norm_fn: tp.Callable[[int], nn.Module] + norm_fn = lambda d: nn.Identity() # noqa + if norm: + norm_fn = lambda d: nn.GroupNorm(1, d) # noqa + + hidden = int(channels / compress) + + act: tp.Type[nn.Module] + if gelu: + act = nn.GELU + else: + act = nn.ReLU + + self.layers = nn.ModuleList([]) + for d in range(self.depth): + dilation = 2 ** d if dilate else 1 + padding = dilation * (kernel // 2) + mods = [ + nn.Conv1d(channels, hidden, kernel, dilation=dilation, padding=padding), + norm_fn(hidden), act(), + nn.Conv1d(hidden, 2 * channels, 1), + norm_fn(2 * channels), nn.GLU(1), + LayerScale(channels, init), + ] + if attn: + mods.insert(3, LocalState(hidden, heads=heads, ndecay=ndecay)) + if lstm: + mods.insert(3, BLSTM(hidden, layers=2, max_steps=200, skip=True)) + layer = nn.Sequential(*mods) + self.layers.append(layer) + + def forward(self, x): + for layer in self.layers: + x = x + layer(x) + return x + + +class LocalState(nn.Module): + """Local state allows to have attention based only on data (no positional embedding), + but while setting a constraint on the time window (e.g. decaying penalty term). + + Also a failed experiments with trying to provide some frequency based attention. + """ + def __init__(self, channels: int, heads: int = 4, nfreqs: int = 0, ndecay: int = 4): + super().__init__() + assert channels % heads == 0, (channels, heads) + self.heads = heads + self.nfreqs = nfreqs + self.ndecay = ndecay + self.content = nn.Conv1d(channels, channels, 1) + self.query = nn.Conv1d(channels, channels, 1) + self.key = nn.Conv1d(channels, channels, 1) + if nfreqs: + self.query_freqs = nn.Conv1d(channels, heads * nfreqs, 1) + if ndecay: + self.query_decay = nn.Conv1d(channels, heads * ndecay, 1) + # Initialize decay close to zero (there is a sigmoid), for maximum initial window. + self.query_decay.weight.data *= 0.01 + assert self.query_decay.bias is not None # stupid type checker + self.query_decay.bias.data[:] = -2 + self.proj = nn.Conv1d(channels + heads * nfreqs, channels, 1) + + def forward(self, x): + B, C, T = x.shape + heads = self.heads + indexes = torch.arange(T, device=x.device, dtype=x.dtype) + # left index are keys, right index are queries + delta = indexes[:, None] - indexes[None, :] + + queries = self.query(x).view(B, heads, -1, T) + keys = self.key(x).view(B, heads, -1, T) + # t are keys, s are queries + dots = torch.einsum("bhct,bhcs->bhts", keys, queries) + dots /= keys.shape[2]**0.5 + if self.nfreqs: + periods = torch.arange(1, self.nfreqs + 1, device=x.device, dtype=x.dtype) + freq_kernel = torch.cos(2 * math.pi * delta / periods.view(-1, 1, 1)) + freq_q = self.query_freqs(x).view(B, heads, -1, T) / self.nfreqs ** 0.5 + dots += torch.einsum("fts,bhfs->bhts", freq_kernel, freq_q) + if self.ndecay: + decays = torch.arange(1, self.ndecay + 1, device=x.device, dtype=x.dtype) + decay_q = self.query_decay(x).view(B, heads, -1, T) + decay_q = torch.sigmoid(decay_q) / 2 + decay_kernel = - decays.view(-1, 1, 1) * delta.abs() / self.ndecay**0.5 + dots += torch.einsum("fts,bhfs->bhts", decay_kernel, decay_q) + + # Kill self reference. 
+ dots.masked_fill_(torch.eye(T, device=dots.device, dtype=torch.bool), -100) + weights = torch.softmax(dots, dim=2) + + content = self.content(x).view(B, heads, -1, T) + result = torch.einsum("bhts,bhct->bhcs", weights, content) + if self.nfreqs: + time_sig = torch.einsum("bhts,fts->bhfs", weights, freq_kernel) + result = torch.cat([result, time_sig], 2) + result = result.reshape(B, -1, T) + return x + self.proj(result) + + +class Demucs(nn.Module): + @capture_init + def __init__(self, + sources, + # Channels + audio_channels=2, + channels=64, + growth=2., + # Main structure + depth=6, + rewrite=True, + lstm_layers=0, + # Convolutions + kernel_size=8, + stride=4, + context=1, + # Activations + gelu=True, + glu=True, + # Normalization + norm_starts=4, + norm_groups=4, + # DConv residual branch + dconv_mode=1, + dconv_depth=2, + dconv_comp=4, + dconv_attn=4, + dconv_lstm=4, + dconv_init=1e-4, + # Pre/post processing + normalize=True, + resample=True, + # Weight init + rescale=0.1, + # Metadata + samplerate=44100, + segment=4 * 10): + """ + Args: + sources (list[str]): list of source names + audio_channels (int): stereo or mono + channels (int): first convolution channels + depth (int): number of encoder/decoder layers + growth (float): multiply (resp divide) number of channels by that + for each layer of the encoder (resp decoder) + depth (int): number of layers in the encoder and in the decoder. + rewrite (bool): add 1x1 convolution to each layer. + lstm_layers (int): number of lstm layers, 0 = no lstm. Deactivated + by default, as this is now replaced by the smaller and faster small LSTMs + in the DConv branches. + kernel_size (int): kernel size for convolutions + stride (int): stride for convolutions + context (int): kernel size of the convolution in the + decoder before the transposed convolution. If > 1, + will provide some context from neighboring time steps. + gelu: use GELU activation function. + glu (bool): use glu instead of ReLU for the 1x1 rewrite conv. + norm_starts: layer at which group norm starts being used. + decoder layers are numbered in reverse order. + norm_groups: number of groups for group norm. + dconv_mode: if 1: dconv in encoder only, 2: decoder only, 3: both. + dconv_depth: depth of residual DConv branch. + dconv_comp: compression of DConv branch. + dconv_attn: adds attention layers in DConv branch starting at this layer. + dconv_lstm: adds a LSTM layer in DConv branch starting at this layer. + dconv_init: initial scale for the DConv branch LayerScale. + normalize (bool): normalizes the input audio on the fly, and scales back + the output by the same amount. + resample (bool): upsample x2 the input and downsample /2 the output. + rescale (int): rescale initial weights of convolutions + to get their standard deviation closer to `rescale`. + samplerate (int): stored as meta information for easing + future evaluations of the model. + segment (float): duration of the chunks of audio to ideally evaluate the model on. + This is used by `demucs.apply.apply_model`. 
+ """ + + super().__init__() + self.audio_channels = audio_channels + self.sources = sources + self.kernel_size = kernel_size + self.context = context + self.stride = stride + self.depth = depth + self.resample = resample + self.channels = channels + self.normalize = normalize + self.samplerate = samplerate + self.segment = segment + self.encoder = nn.ModuleList() + self.decoder = nn.ModuleList() + self.skip_scales = nn.ModuleList() + + if glu: + activation = nn.GLU(dim=1) + ch_scale = 2 + else: + activation = nn.ReLU() + ch_scale = 1 + if gelu: + act2 = nn.GELU + else: + act2 = nn.ReLU + + in_channels = audio_channels + padding = 0 + for index in range(depth): + norm_fn = lambda d: nn.Identity() # noqa + if index >= norm_starts: + norm_fn = lambda d: nn.GroupNorm(norm_groups, d) # noqa + + encode = [] + encode += [ + nn.Conv1d(in_channels, channels, kernel_size, stride), + norm_fn(channels), + act2(), + ] + attn = index >= dconv_attn + lstm = index >= dconv_lstm + if dconv_mode & 1: + encode += [DConv(channels, depth=dconv_depth, init=dconv_init, + compress=dconv_comp, attn=attn, lstm=lstm)] + if rewrite: + encode += [ + nn.Conv1d(channels, ch_scale * channels, 1), + norm_fn(ch_scale * channels), activation] + self.encoder.append(nn.Sequential(*encode)) + + decode = [] + if index > 0: + out_channels = in_channels + else: + out_channels = len(self.sources) * audio_channels + if rewrite: + decode += [ + nn.Conv1d(channels, ch_scale * channels, 2 * context + 1, padding=context), + norm_fn(ch_scale * channels), activation] + if dconv_mode & 2: + decode += [DConv(channels, depth=dconv_depth, init=dconv_init, + compress=dconv_comp, attn=attn, lstm=lstm)] + decode += [nn.ConvTranspose1d(channels, out_channels, + kernel_size, stride, padding=padding)] + if index > 0: + decode += [norm_fn(out_channels), act2()] + self.decoder.insert(0, nn.Sequential(*decode)) + in_channels = channels + channels = int(growth * channels) + + channels = in_channels + if lstm_layers: + self.lstm = BLSTM(channels, lstm_layers) + else: + self.lstm = None + + if rescale: + rescale_module(self, reference=rescale) + + def valid_length(self, length): + """ + Return the nearest valid length to use with the model so that + there is no time steps left over in a convolution, e.g. for all + layers, size of the input - kernel_size % stride = 0. + + Note that input are automatically padded if necessary to ensure that the output + has the same length as the input. 
+ """ + if self.resample: + length *= 2 + + for _ in range(self.depth): + length = math.ceil((length - self.kernel_size) / self.stride) + 1 + length = max(1, length) + + for idx in range(self.depth): + length = (length - 1) * self.stride + self.kernel_size + + if self.resample: + length = math.ceil(length / 2) + return int(length) + + def forward(self, mix): + x = mix + length = x.shape[-1] + + if self.normalize: + mono = mix.mean(dim=1, keepdim=True) + mean = mono.mean(dim=-1, keepdim=True) + std = mono.std(dim=-1, keepdim=True) + x = (x - mean) / (1e-5 + std) + else: + mean = 0 + std = 1 + + delta = self.valid_length(length) - length + x = F.pad(x, (delta // 2, delta - delta // 2)) + + if self.resample: + x = julius.resample_frac(x, 1, 2) + + saved = [] + for encode in self.encoder: + x = encode(x) + saved.append(x) + + if self.lstm: + x = self.lstm(x) + + for decode in self.decoder: + skip = saved.pop(-1) + skip = center_trim(skip, x) + x = decode(x + skip) + + if self.resample: + x = julius.resample_frac(x, 2, 1) + x = x * std + mean + x = center_trim(x, length) + x = x.view(x.size(0), len(self.sources), self.audio_channels, x.size(-1)) + return x + + def load_state_dict(self, state, strict=True): + # fix a mismatch with previous generation Demucs models. + for idx in range(self.depth): + for a in ['encoder', 'decoder']: + for b in ['bias', 'weight']: + new = f'{a}.{idx}.3.{b}' + old = f'{a}.{idx}.2.{b}' + if old in state and new not in state: + state[new] = state.pop(old) + super().load_state_dict(state, strict=strict) diff --git a/audio_separator/separator/uvr_lib_v5/demucs/filtering.py b/audio_separator/separator/uvr_lib_v5/demucs/filtering.py new file mode 100644 index 0000000..08a2c17 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/demucs/filtering.py @@ -0,0 +1,502 @@ +from typing import Optional +import torch +import torch.nn as nn +from torch import Tensor +from torch.utils.data import DataLoader + +def atan2(y, x): + r"""Element-wise arctangent function of y/x. + Returns a new tensor with signed angles in radians. + It is an alternative implementation of torch.atan2 + + Args: + y (Tensor): First input tensor + x (Tensor): Second input tensor [shape=y.shape] + + Returns: + Tensor: [shape=y.shape]. + """ + pi = 2 * torch.asin(torch.tensor(1.0)) + x += ((x == 0) & (y == 0)) * 1.0 + out = torch.atan(y / x) + out += ((y >= 0) & (x < 0)) * pi + out -= ((y < 0) & (x < 0)) * pi + out *= 1 - ((y > 0) & (x == 0)) * 1.0 + out += ((y > 0) & (x == 0)) * (pi / 2) + out *= 1 - ((y < 0) & (x == 0)) * 1.0 + out += ((y < 0) & (x == 0)) * (-pi / 2) + return out + + +# Define basic complex operations on torch.Tensor objects whose last dimension +# consists in the concatenation of the real and imaginary parts. + + +def _norm(x: torch.Tensor) -> torch.Tensor: + r"""Computes the norm value of a torch Tensor, assuming that it + comes as real and imaginary part in its last dimension. + + Args: + x (Tensor): Input Tensor of shape [shape=(..., 2)] + + Returns: + Tensor: shape as x excluding the last dimension. + """ + return torch.abs(x[..., 0]) ** 2 + torch.abs(x[..., 1]) ** 2 + + +def _mul_add(a: torch.Tensor, b: torch.Tensor, out: Optional[torch.Tensor] = None) -> torch.Tensor: + """Element-wise multiplication of two complex Tensors described + through their real and imaginary parts. 
+ The result is added to the `out` tensor""" + + # check `out` and allocate it if needed + target_shape = torch.Size([max(sa, sb) for (sa, sb) in zip(a.shape, b.shape)]) + if out is None or out.shape != target_shape: + out = torch.zeros(target_shape, dtype=a.dtype, device=a.device) + if out is a: + real_a = a[..., 0] + out[..., 0] = out[..., 0] + (real_a * b[..., 0] - a[..., 1] * b[..., 1]) + out[..., 1] = out[..., 1] + (real_a * b[..., 1] + a[..., 1] * b[..., 0]) + else: + out[..., 0] = out[..., 0] + (a[..., 0] * b[..., 0] - a[..., 1] * b[..., 1]) + out[..., 1] = out[..., 1] + (a[..., 0] * b[..., 1] + a[..., 1] * b[..., 0]) + return out + + +def _mul(a: torch.Tensor, b: torch.Tensor, out: Optional[torch.Tensor] = None) -> torch.Tensor: + """Element-wise multiplication of two complex Tensors described + through their real and imaginary parts + can work in place in case out is a only""" + target_shape = torch.Size([max(sa, sb) for (sa, sb) in zip(a.shape, b.shape)]) + if out is None or out.shape != target_shape: + out = torch.zeros(target_shape, dtype=a.dtype, device=a.device) + if out is a: + real_a = a[..., 0] + out[..., 0] = real_a * b[..., 0] - a[..., 1] * b[..., 1] + out[..., 1] = real_a * b[..., 1] + a[..., 1] * b[..., 0] + else: + out[..., 0] = a[..., 0] * b[..., 0] - a[..., 1] * b[..., 1] + out[..., 1] = a[..., 0] * b[..., 1] + a[..., 1] * b[..., 0] + return out + + +def _inv(z: torch.Tensor, out: Optional[torch.Tensor] = None) -> torch.Tensor: + """Element-wise multiplicative inverse of a Tensor with complex + entries described through their real and imaginary parts. + can work in place in case out is z""" + ez = _norm(z) + if out is None or out.shape != z.shape: + out = torch.zeros_like(z) + out[..., 0] = z[..., 0] / ez + out[..., 1] = -z[..., 1] / ez + return out + + +def _conj(z, out: Optional[torch.Tensor] = None) -> torch.Tensor: + """Element-wise complex conjugate of a Tensor with complex entries + described through their real and imaginary parts. + can work in place in case out is z""" + if out is None or out.shape != z.shape: + out = torch.zeros_like(z) + out[..., 0] = z[..., 0] + out[..., 1] = -z[..., 1] + return out + + +def _invert(M: torch.Tensor, out: Optional[torch.Tensor] = None) -> torch.Tensor: + """ + Invert 1x1 or 2x2 matrices + + Will generate errors if the matrices are singular: user must handle this + through his own regularization schemes. 
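+
+    For the 2x2 case, the analytical inverse used below is (shown here for clarity):
+
+        det  = M[0, 0] * M[1, 1] - M[0, 1] * M[1, 0]
+        invM = 1 / det * [[ M[1, 1], -M[0, 1]],
+                          [-M[1, 0],  M[0, 0]]]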
+ + Args: + M (Tensor): [shape=(..., nb_channels, nb_channels, 2)] + matrices to invert: must be square along dimensions -3 and -2 + + Returns: + invM (Tensor): [shape=M.shape] + inverses of M + """ + nb_channels = M.shape[-2] + + if out is None or out.shape != M.shape: + out = torch.empty_like(M) + + if nb_channels == 1: + # scalar case + out = _inv(M, out) + elif nb_channels == 2: + # two channels case: analytical expression + + # first compute the determinent + det = _mul(M[..., 0, 0, :], M[..., 1, 1, :]) + det = det - _mul(M[..., 0, 1, :], M[..., 1, 0, :]) + # invert it + invDet = _inv(det) + + # then fill out the matrix with the inverse + out[..., 0, 0, :] = _mul(invDet, M[..., 1, 1, :], out[..., 0, 0, :]) + out[..., 1, 0, :] = _mul(-invDet, M[..., 1, 0, :], out[..., 1, 0, :]) + out[..., 0, 1, :] = _mul(-invDet, M[..., 0, 1, :], out[..., 0, 1, :]) + out[..., 1, 1, :] = _mul(invDet, M[..., 0, 0, :], out[..., 1, 1, :]) + else: + raise Exception("Only 2 channels are supported for the torch version.") + return out + + +# Now define the signal-processing low-level functions used by the Separator + + +def expectation_maximization( + y: torch.Tensor, + x: torch.Tensor, + iterations: int = 2, + eps: float = 1e-10, + batch_size: int = 200, +): + r"""Expectation maximization algorithm, for refining source separation + estimates. + + This algorithm allows to make source separation results better by + enforcing multichannel consistency for the estimates. This usually means + a better perceptual quality in terms of spatial artifacts. + + The implementation follows the details presented in [1]_, taking + inspiration from the original EM algorithm proposed in [2]_ and its + weighted refinement proposed in [3]_, [4]_. + It works by iteratively: + + * Re-estimate source parameters (power spectral densities and spatial + covariance matrices) through :func:`get_local_gaussian_model`. + + * Separate again the mixture with the new parameters by first computing + the new modelled mixture covariance matrices with :func:`get_mix_model`, + prepare the Wiener filters through :func:`wiener_gain` and apply them + with :func:`apply_filter``. + + References + ---------- + .. [1] S. Uhlich and M. Porcu and F. Giron and M. Enenkl and T. Kemp and + N. Takahashi and Y. Mitsufuji, "Improving music source separation based + on deep neural networks through data augmentation and network + blending." 2017 IEEE International Conference on Acoustics, Speech + and Signal Processing (ICASSP). IEEE, 2017. + + .. [2] N.Q. Duong and E. Vincent and R.Gribonval. "Under-determined + reverberant audio source separation using a full-rank spatial + covariance model." IEEE Transactions on Audio, Speech, and Language + Processing 18.7 (2010): 1830-1840. + + .. [3] A. Nugraha and A. Liutkus and E. Vincent. "Multichannel audio source + separation with deep neural networks." IEEE/ACM Transactions on Audio, + Speech, and Language Processing 24.9 (2016): 1652-1664. + + .. [4] A. Nugraha and A. Liutkus and E. Vincent. "Multichannel music + separation with deep neural networks." 2016 24th European Signal + Processing Conference (EUSIPCO). IEEE, 2016. + + .. [5] A. Liutkus and R. Badeau and G. Richard "Kernel additive models for + source separation." IEEE Transactions on Signal Processing + 62.16 (2014): 4298-4310. 
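+
+    Illustrative call (editor's sketch; shapes follow the Args section below):
+
+        # y: (frames, bins, channels, 2, sources) initial source estimates,
+        # x: (frames, bins, channels, 2) complex mixture STFT as real/imag parts.
+        y, v, R = expectation_maximization(y, x, iterations=2)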
+ + Args: + y (Tensor): [shape=(nb_frames, nb_bins, nb_channels, 2, nb_sources)] + initial estimates for the sources + x (Tensor): [shape=(nb_frames, nb_bins, nb_channels, 2)] + complex STFT of the mixture signal + iterations (int): [scalar] + number of iterations for the EM algorithm. + eps (float or None): [scalar] + The epsilon value to use for regularization and filters. + + Returns: + y (Tensor): [shape=(nb_frames, nb_bins, nb_channels, 2, nb_sources)] + estimated sources after iterations + v (Tensor): [shape=(nb_frames, nb_bins, nb_sources)] + estimated power spectral densities + R (Tensor): [shape=(nb_bins, nb_channels, nb_channels, 2, nb_sources)] + estimated spatial covariance matrices + + Notes: + * You need an initial estimate for the sources to apply this + algorithm. This is precisely what the :func:`wiener` function does. + * This algorithm *is not* an implementation of the "exact" EM + proposed in [1]_. In particular, it does compute the posterior + covariance matrices the same (exact) way. Instead, it uses the + simplified approximate scheme initially proposed in [5]_ and further + refined in [3]_, [4]_, that boils down to just take the empirical + covariance of the recent source estimates, followed by a weighted + average for the update of the spatial covariance matrix. It has been + empirically demonstrated that this simplified algorithm is more + robust for music separation. + + Warning: + It is *very* important to make sure `x.dtype` is `torch.float64` + if you want double precision, because this function will **not** + do such conversion for you from `torch.complex32`, in case you want the + smaller RAM usage on purpose. + + It is usually always better in terms of quality to have double + precision, by e.g. calling :func:`expectation_maximization` + with ``x.to(torch.float64)``. + """ + # dimensions + (nb_frames, nb_bins, nb_channels) = x.shape[:-1] + nb_sources = y.shape[-1] + + regularization = torch.cat( + ( + torch.eye(nb_channels, dtype=x.dtype, device=x.device)[..., None], + torch.zeros((nb_channels, nb_channels, 1), dtype=x.dtype, device=x.device), + ), + dim=2, + ) + regularization = torch.sqrt(torch.as_tensor(eps)) * ( + regularization[None, None, ...].expand((-1, nb_bins, -1, -1, -1)) + ) + + # allocate the spatial covariance matrices + R = [ + torch.zeros((nb_bins, nb_channels, nb_channels, 2), dtype=x.dtype, device=x.device) + for j in range(nb_sources) + ] + weight: torch.Tensor = torch.zeros((nb_bins,), dtype=x.dtype, device=x.device) + + v: torch.Tensor = torch.zeros((nb_frames, nb_bins, nb_sources), dtype=x.dtype, device=x.device) + for it in range(iterations): + # constructing the mixture covariance matrix. 
Doing it with a loop + # to avoid storing anytime in RAM the whole 6D tensor + + # update the PSD as the average spectrogram over channels + v = torch.mean(torch.abs(y[..., 0, :]) ** 2 + torch.abs(y[..., 1, :]) ** 2, dim=-2) + + # update spatial covariance matrices (weighted update) + for j in range(nb_sources): + R[j] = torch.tensor(0.0, device=x.device) + weight = torch.tensor(eps, device=x.device) + pos: int = 0 + batch_size = batch_size if batch_size else nb_frames + while pos < nb_frames: + t = torch.arange(pos, min(nb_frames, pos + batch_size)) + pos = int(t[-1]) + 1 + + R[j] = R[j] + torch.sum(_covariance(y[t, ..., j]), dim=0) + weight = weight + torch.sum(v[t, ..., j], dim=0) + R[j] = R[j] / weight[..., None, None, None] + weight = torch.zeros_like(weight) + + # cloning y if we track gradient, because we're going to update it + if y.requires_grad: + y = y.clone() + + pos = 0 + while pos < nb_frames: + t = torch.arange(pos, min(nb_frames, pos + batch_size)) + pos = int(t[-1]) + 1 + + y[t, ...] = torch.tensor(0.0, device=x.device, dtype=x.dtype) + + # compute mix covariance matrix + Cxx = regularization + for j in range(nb_sources): + Cxx = Cxx + (v[t, ..., j, None, None, None] * R[j][None, ...].clone()) + + # invert it + inv_Cxx = _invert(Cxx) + + # separate the sources + for j in range(nb_sources): + + # create a wiener gain for this source + gain = torch.zeros_like(inv_Cxx) + + # computes multichannel Wiener gain as v_j R_j inv_Cxx + indices = torch.cartesian_prod( + torch.arange(nb_channels), + torch.arange(nb_channels), + torch.arange(nb_channels), + ) + for index in indices: + gain[:, :, index[0], index[1], :] = _mul_add( + R[j][None, :, index[0], index[2], :].clone(), + inv_Cxx[:, :, index[2], index[1], :], + gain[:, :, index[0], index[1], :], + ) + gain = gain * v[t, ..., None, None, None, j] + + # apply it to the mixture + for i in range(nb_channels): + y[t, ..., j] = _mul_add(gain[..., i, :], x[t, ..., i, None, :], y[t, ..., j]) + + return y, v, R + + +def wiener( + targets_spectrograms: torch.Tensor, + mix_stft: torch.Tensor, + iterations: int = 1, + softmask: bool = False, + residual: bool = False, + scale_factor: float = 10.0, + eps: float = 1e-10, +): + """Wiener-based separation for multichannel audio. + + The method uses the (possibly multichannel) spectrograms of the + sources to separate the (complex) Short Term Fourier Transform of the + mix. Separation is done in a sequential way by: + + * Getting an initial estimate. This can be done in two ways: either by + directly using the spectrograms with the mixture phase, or + by using a softmasking strategy. This initial phase is controlled + by the `softmask` flag. + + * If required, adding an additional residual target as the mix minus + all targets. + + * Refinining these initial estimates through a call to + :func:`expectation_maximization` if the number of iterations is nonzero. + + This implementation also allows to specify the epsilon value used for + regularization. It is based on [1]_, [2]_, [3]_, [4]_. + + References + ---------- + .. [1] S. Uhlich and M. Porcu and F. Giron and M. Enenkl and T. Kemp and + N. Takahashi and Y. Mitsufuji, "Improving music source separation based + on deep neural networks through data augmentation and network + blending." 2017 IEEE International Conference on Acoustics, Speech + and Signal Processing (ICASSP). IEEE, 2017. + + .. [2] A. Nugraha and A. Liutkus and E. Vincent. "Multichannel audio source + separation with deep neural networks." 
IEEE/ACM Transactions on Audio, + Speech, and Language Processing 24.9 (2016): 1652-1664. + + .. [3] A. Nugraha and A. Liutkus and E. Vincent. "Multichannel music + separation with deep neural networks." 2016 24th European Signal + Processing Conference (EUSIPCO). IEEE, 2016. + + .. [4] A. Liutkus and R. Badeau and G. Richard "Kernel additive models for + source separation." IEEE Transactions on Signal Processing + 62.16 (2014): 4298-4310. + + Args: + targets_spectrograms (Tensor): spectrograms of the sources + [shape=(nb_frames, nb_bins, nb_channels, nb_sources)]. + This is a nonnegative tensor that is + usually the output of the actual separation method of the user. The + spectrograms may be mono, but they need to be 4-dimensional in all + cases. + mix_stft (Tensor): [shape=(nb_frames, nb_bins, nb_channels, complex=2)] + STFT of the mixture signal. + iterations (int): [scalar] + number of iterations for the EM algorithm + softmask (bool): Describes how the initial estimates are obtained. + * if `False`, then the mixture phase will directly be used with the + spectrogram as initial estimates. + * if `True`, initial estimates are obtained by multiplying the + complex mix element-wise with the ratio of each target spectrogram + with the sum of them all. This strategy is better if the model are + not really good, and worse otherwise. + residual (bool): if `True`, an additional target is created, which is + equal to the mixture minus the other targets, before application of + expectation maximization + eps (float): Epsilon value to use for computing the separations. + This is used whenever division with a model energy is + performed, i.e. when softmasking and when iterating the EM. + It can be understood as the energy of the additional white noise + that is taken out when separating. + + Returns: + Tensor: shape=(nb_frames, nb_bins, nb_channels, complex=2, nb_sources) + STFT of estimated sources + + Notes: + * Be careful that you need *magnitude spectrogram estimates* for the + case `softmask==False`. + * `softmask=False` is recommended + * The epsilon value will have a huge impact on performance. If it's + large, only the parts of the signal with a significant energy will + be kept in the sources. This epsilon then directly controls the + energy of the reconstruction error. + + Warning: + As in :func:`expectation_maximization`, we recommend converting the + mixture `x` to double precision `torch.float64` *before* calling + :func:`wiener`. + """ + if softmask: + # if we use softmask, we compute the ratio mask for all targets and + # multiply by the mix stft + y = ( + mix_stft[..., None] + * ( + targets_spectrograms + / (eps + torch.sum(targets_spectrograms, dim=-1, keepdim=True).to(mix_stft.dtype)) + )[..., None, :] + ) + else: + # otherwise, we just multiply the targets spectrograms with mix phase + # we tacitly assume that we have magnitude estimates. + angle = atan2(mix_stft[..., 1], mix_stft[..., 0])[..., None] + nb_sources = targets_spectrograms.shape[-1] + y = torch.zeros( + mix_stft.shape + (nb_sources,), dtype=mix_stft.dtype, device=mix_stft.device + ) + y[..., 0, :] = targets_spectrograms * torch.cos(angle) + y[..., 1, :] = targets_spectrograms * torch.sin(angle) + + if residual: + # if required, adding an additional target as the mix minus + # available targets + y = torch.cat([y, mix_stft[..., None] - y.sum(dim=-1, keepdim=True)], dim=-1) + + if iterations == 0: + return y + + # we need to refine the estimates. 
Scales down the estimates for + # numerical stability + max_abs = torch.max( + torch.as_tensor(1.0, dtype=mix_stft.dtype, device=mix_stft.device), + torch.sqrt(_norm(mix_stft)).max() / scale_factor, + ) + + mix_stft = mix_stft / max_abs + y = y / max_abs + + # call expectation maximization + y = expectation_maximization(y, mix_stft, iterations, eps=eps)[0] + + # scale estimates up again + y = y * max_abs + return y + + +def _covariance(y_j): + """ + Compute the empirical covariance for a source. + + Args: + y_j (Tensor): complex stft of the source. + [shape=(nb_frames, nb_bins, nb_channels, 2)]. + + Returns: + Cj (Tensor): [shape=(nb_frames, nb_bins, nb_channels, nb_channels, 2)] + just y_j * conj(y_j.T): empirical covariance for each TF bin. + """ + (nb_frames, nb_bins, nb_channels) = y_j.shape[:-1] + Cj = torch.zeros( + (nb_frames, nb_bins, nb_channels, nb_channels, 2), + dtype=y_j.dtype, + device=y_j.device, + ) + indices = torch.cartesian_prod(torch.arange(nb_channels), torch.arange(nb_channels)) + for index in indices: + Cj[:, :, index[0], index[1], :] = _mul_add( + y_j[:, :, index[0], :], + _conj(y_j[:, :, index[1], :]), + Cj[:, :, index[0], index[1], :], + ) + return Cj diff --git a/audio_separator/separator/uvr_lib_v5/demucs/hdemucs.py b/audio_separator/separator/uvr_lib_v5/demucs/hdemucs.py new file mode 100644 index 0000000..ea4e652 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/demucs/hdemucs.py @@ -0,0 +1,796 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +""" +This code contains the spectrogram and Hybrid version of Demucs. +""" +from copy import deepcopy +import math +import typing as tp +import torch +from torch import nn +from torch.nn import functional as F +from .filtering import wiener +from .demucs import DConv, rescale_module +from .states import capture_init +from .spec import spectro, ispectro + +def pad1d(x: torch.Tensor, paddings: tp.Tuple[int, int], mode: str = 'constant', value: float = 0.): + """Tiny wrapper around F.pad, just to allow for reflect padding on small input. + If this is the case, we insert extra 0 padding to the right before the reflection happen.""" + x0 = x + length = x.shape[-1] + padding_left, padding_right = paddings + if mode == 'reflect': + max_pad = max(padding_left, padding_right) + if length <= max_pad: + extra_pad = max_pad - length + 1 + extra_pad_right = min(padding_right, extra_pad) + extra_pad_left = extra_pad - extra_pad_right + paddings = (padding_left - extra_pad_left, padding_right - extra_pad_right) + x = F.pad(x, (extra_pad_left, extra_pad_right)) + out = F.pad(x, paddings, mode, value) + assert out.shape[-1] == length + padding_left + padding_right + assert (out[..., padding_left: padding_left + length] == x0).all() + return out + +class ScaledEmbedding(nn.Module): + """ + Boost learning rate for embeddings (with `scale`). + Also, can make embeddings continuous with `smooth`. + """ + def __init__(self, num_embeddings: int, embedding_dim: int, + scale: float = 10., smooth=False): + super().__init__() + self.embedding = nn.Embedding(num_embeddings, embedding_dim) + if smooth: + weight = torch.cumsum(self.embedding.weight.data, dim=0) + # when summing gaussian, overscale raises as sqrt(n), so we nornalize by that. 
+ weight = weight / torch.arange(1, num_embeddings + 1).to(weight).sqrt()[:, None] + self.embedding.weight.data[:] = weight + self.embedding.weight.data /= scale + self.scale = scale + + @property + def weight(self): + return self.embedding.weight * self.scale + + def forward(self, x): + out = self.embedding(x) * self.scale + return out + + +class HEncLayer(nn.Module): + def __init__(self, chin, chout, kernel_size=8, stride=4, norm_groups=1, empty=False, + freq=True, dconv=True, norm=True, context=0, dconv_kw={}, pad=True, + rewrite=True): + """Encoder layer. This used both by the time and the frequency branch. + + Args: + chin: number of input channels. + chout: number of output channels. + norm_groups: number of groups for group norm. + empty: used to make a layer with just the first conv. this is used + before merging the time and freq. branches. + freq: this is acting on frequencies. + dconv: insert DConv residual branches. + norm: use GroupNorm. + context: context size for the 1x1 conv. + dconv_kw: list of kwargs for the DConv class. + pad: pad the input. Padding is done so that the output size is + always the input size / stride. + rewrite: add 1x1 conv at the end of the layer. + """ + super().__init__() + norm_fn = lambda d: nn.Identity() # noqa + if norm: + norm_fn = lambda d: nn.GroupNorm(norm_groups, d) # noqa + if pad: + pad = kernel_size // 4 + else: + pad = 0 + klass = nn.Conv1d + self.freq = freq + self.kernel_size = kernel_size + self.stride = stride + self.empty = empty + self.norm = norm + self.pad = pad + if freq: + kernel_size = [kernel_size, 1] + stride = [stride, 1] + pad = [pad, 0] + klass = nn.Conv2d + self.conv = klass(chin, chout, kernel_size, stride, pad) + if self.empty: + return + self.norm1 = norm_fn(chout) + self.rewrite = None + if rewrite: + self.rewrite = klass(chout, 2 * chout, 1 + 2 * context, 1, context) + self.norm2 = norm_fn(2 * chout) + + self.dconv = None + if dconv: + self.dconv = DConv(chout, **dconv_kw) + + def forward(self, x, inject=None): + """ + `inject` is used to inject the result from the time branch into the frequency branch, + when both have the same stride. + """ + if not self.freq and x.dim() == 4: + B, C, Fr, T = x.shape + x = x.view(B, -1, T) + + if not self.freq: + le = x.shape[-1] + if not le % self.stride == 0: + x = F.pad(x, (0, self.stride - (le % self.stride))) + y = self.conv(x) + if self.empty: + return y + if inject is not None: + assert inject.shape[-1] == y.shape[-1], (inject.shape, y.shape) + if inject.dim() == 3 and y.dim() == 4: + inject = inject[:, :, None] + y = y + inject + y = F.gelu(self.norm1(y)) + if self.dconv: + if self.freq: + B, C, Fr, T = y.shape + y = y.permute(0, 2, 1, 3).reshape(-1, C, T) + y = self.dconv(y) + if self.freq: + y = y.view(B, Fr, C, T).permute(0, 2, 1, 3) + if self.rewrite: + z = self.norm2(self.rewrite(y)) + z = F.glu(z, dim=1) + else: + z = y + return z + + +class MultiWrap(nn.Module): + """ + Takes one layer and replicate it N times. each replica will act + on a frequency band. All is done so that if the N replica have the same weights, + then this is exactly equivalent to applying the original module on all frequencies. + + This is a bit over-engineered to avoid edge artifacts when splitting + the frequency bands, but it is possible the naive implementation would work as well... + """ + def __init__(self, layer, split_ratios): + """ + Args: + layer: module to clone, must be either HEncLayer or HDecLayer. + split_ratios: list of float indicating which ratio to keep for each band. 
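+
+            For instance (illustrative), split_ratios=[0.5] produces two replicas
+            of the layer: one applied to the lower half of the frequency bins and
+            one to the upper half.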
+ """ + super().__init__() + self.split_ratios = split_ratios + self.layers = nn.ModuleList() + self.conv = isinstance(layer, HEncLayer) + assert not layer.norm + assert layer.freq + assert layer.pad + if not self.conv: + assert not layer.context_freq + for k in range(len(split_ratios) + 1): + lay = deepcopy(layer) + if self.conv: + lay.conv.padding = (0, 0) + else: + lay.pad = False + for m in lay.modules(): + if hasattr(m, 'reset_parameters'): + m.reset_parameters() + self.layers.append(lay) + + def forward(self, x, skip=None, length=None): + B, C, Fr, T = x.shape + + ratios = list(self.split_ratios) + [1] + start = 0 + outs = [] + for ratio, layer in zip(ratios, self.layers): + if self.conv: + pad = layer.kernel_size // 4 + if ratio == 1: + limit = Fr + frames = -1 + else: + limit = int(round(Fr * ratio)) + le = limit - start + if start == 0: + le += pad + frames = round((le - layer.kernel_size) / layer.stride + 1) + limit = start + (frames - 1) * layer.stride + layer.kernel_size + if start == 0: + limit -= pad + assert limit - start > 0, (limit, start) + assert limit <= Fr, (limit, Fr) + y = x[:, :, start:limit, :] + if start == 0: + y = F.pad(y, (0, 0, pad, 0)) + if ratio == 1: + y = F.pad(y, (0, 0, 0, pad)) + outs.append(layer(y)) + start = limit - layer.kernel_size + layer.stride + else: + if ratio == 1: + limit = Fr + else: + limit = int(round(Fr * ratio)) + last = layer.last + layer.last = True + + y = x[:, :, start:limit] + s = skip[:, :, start:limit] + out, _ = layer(y, s, None) + if outs: + outs[-1][:, :, -layer.stride:] += ( + out[:, :, :layer.stride] - layer.conv_tr.bias.view(1, -1, 1, 1)) + out = out[:, :, layer.stride:] + if ratio == 1: + out = out[:, :, :-layer.stride // 2, :] + if start == 0: + out = out[:, :, layer.stride // 2:, :] + outs.append(out) + layer.last = last + start = limit + out = torch.cat(outs, dim=2) + if not self.conv and not last: + out = F.gelu(out) + if self.conv: + return out + else: + return out, None + + +class HDecLayer(nn.Module): + def __init__(self, chin, chout, last=False, kernel_size=8, stride=4, norm_groups=1, empty=False, + freq=True, dconv=True, norm=True, context=1, dconv_kw={}, pad=True, + context_freq=True, rewrite=True): + """ + Same as HEncLayer but for decoder. See `HEncLayer` for documentation. 
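+
+        Arguments it adds on top of `HEncLayer` (summary added for clarity):
+            last: if True, the final GELU is skipped so the layer can emit the raw
+                decoder output.
+            context_freq: if True, the rewrite convolution uses context along the
+                frequency axis as well; otherwise only along time.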
+ """ + super().__init__() + norm_fn = lambda d: nn.Identity() # noqa + if norm: + norm_fn = lambda d: nn.GroupNorm(norm_groups, d) # noqa + if pad: + pad = kernel_size // 4 + else: + pad = 0 + self.pad = pad + self.last = last + self.freq = freq + self.chin = chin + self.empty = empty + self.stride = stride + self.kernel_size = kernel_size + self.norm = norm + self.context_freq = context_freq + klass = nn.Conv1d + klass_tr = nn.ConvTranspose1d + if freq: + kernel_size = [kernel_size, 1] + stride = [stride, 1] + klass = nn.Conv2d + klass_tr = nn.ConvTranspose2d + self.conv_tr = klass_tr(chin, chout, kernel_size, stride) + self.norm2 = norm_fn(chout) + if self.empty: + return + self.rewrite = None + if rewrite: + if context_freq: + self.rewrite = klass(chin, 2 * chin, 1 + 2 * context, 1, context) + else: + self.rewrite = klass(chin, 2 * chin, [1, 1 + 2 * context], 1, + [0, context]) + self.norm1 = norm_fn(2 * chin) + + self.dconv = None + if dconv: + self.dconv = DConv(chin, **dconv_kw) + + def forward(self, x, skip, length): + if self.freq and x.dim() == 3: + B, C, T = x.shape + x = x.view(B, self.chin, -1, T) + + if not self.empty: + x = x + skip + + if self.rewrite: + y = F.glu(self.norm1(self.rewrite(x)), dim=1) + else: + y = x + if self.dconv: + if self.freq: + B, C, Fr, T = y.shape + y = y.permute(0, 2, 1, 3).reshape(-1, C, T) + y = self.dconv(y) + if self.freq: + y = y.view(B, Fr, C, T).permute(0, 2, 1, 3) + else: + y = x + assert skip is None + z = self.norm2(self.conv_tr(y)) + if self.freq: + if self.pad: + z = z[..., self.pad:-self.pad, :] + else: + z = z[..., self.pad:self.pad + length] + assert z.shape[-1] == length, (z.shape[-1], length) + if not self.last: + z = F.gelu(z) + return z, y + + +class HDemucs(nn.Module): + """ + Spectrogram and hybrid Demucs model. + The spectrogram model has the same structure as Demucs, except the first few layers are over the + frequency axis, until there is only 1 frequency, and then it moves to time convolutions. + Frequency layers can still access information across time steps thanks to the DConv residual. + + Hybrid model have a parallel time branch. At some layer, the time branch has the same stride + as the frequency branch and then the two are combined. The opposite happens in the decoder. + + Models can either use naive iSTFT from masking, Wiener filtering ([Ulhih et al. 2017]), + or complex as channels (CaC) [Choi et al. 2020]. Wiener filtering is based on + Open Unmix implementation [Stoter et al. 2019]. + + The loss is always on the temporal domain, by backpropagating through the above + output methods and iSTFT. This allows to define hybrid models nicely. However, this breaks + a bit Wiener filtering, as doing more iteration at test time will change the spectrogram + contribution, without changing the one from the waveform, which will lead to worse performance. + I tried using the residual option in OpenUnmix Wiener implementation, but it didn't improve. + CaC on the other hand provides similar performance for hybrid, and works naturally with + hybrid models. + + This model also uses frequency embeddings are used to improve efficiency on convolutions + over the freq. axis, following [Isik et al. 2020] (https://arxiv.org/pdf/2008.04470.pdf). + + Unlike classic Demucs, there is no resampling here, and normalization is always applied. 
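+
+    Minimal usage sketch (added for illustration; default hyper-parameters assumed):
+
+        model = HDemucs(sources=["drums", "bass", "other", "vocals"])
+        mix = torch.randn(1, 2, 44100 * 10)   # (batch, audio_channels, samples)
+        with torch.no_grad():
+            out = model(mix)                  # (batch, n_sources, audio_channels, samples)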
+ """ + @capture_init + def __init__(self, + sources, + # Channels + audio_channels=2, + channels=48, + channels_time=None, + growth=2, + # STFT + nfft=4096, + wiener_iters=0, + end_iters=0, + wiener_residual=False, + cac=True, + # Main structure + depth=6, + rewrite=True, + hybrid=True, + hybrid_old=False, + # Frequency branch + multi_freqs=None, + multi_freqs_depth=2, + freq_emb=0.2, + emb_scale=10, + emb_smooth=True, + # Convolutions + kernel_size=8, + time_stride=2, + stride=4, + context=1, + context_enc=0, + # Normalization + norm_starts=4, + norm_groups=4, + # DConv residual branch + dconv_mode=1, + dconv_depth=2, + dconv_comp=4, + dconv_attn=4, + dconv_lstm=4, + dconv_init=1e-4, + # Weight init + rescale=0.1, + # Metadata + samplerate=44100, + segment=4 * 10): + + """ + Args: + sources (list[str]): list of source names. + audio_channels (int): input/output audio channels. + channels (int): initial number of hidden channels. + channels_time: if not None, use a different `channels` value for the time branch. + growth: increase the number of hidden channels by this factor at each layer. + nfft: number of fft bins. Note that changing this require careful computation of + various shape parameters and will not work out of the box for hybrid models. + wiener_iters: when using Wiener filtering, number of iterations at test time. + end_iters: same but at train time. For a hybrid model, must be equal to `wiener_iters`. + wiener_residual: add residual source before wiener filtering. + cac: uses complex as channels, i.e. complex numbers are 2 channels each + in input and output. no further processing is done before ISTFT. + depth (int): number of layers in the encoder and in the decoder. + rewrite (bool): add 1x1 convolution to each layer. + hybrid (bool): make a hybrid time/frequency domain, otherwise frequency only. + hybrid_old: some models trained for MDX had a padding bug. This replicates + this bug to avoid retraining them. + multi_freqs: list of frequency ratios for splitting frequency bands with `MultiWrap`. + multi_freqs_depth: how many layers to wrap with `MultiWrap`. Only the outermost + layers will be wrapped. + freq_emb: add frequency embedding after the first frequency layer if > 0, + the actual value controls the weight of the embedding. + emb_scale: equivalent to scaling the embedding learning rate + emb_smooth: initialize the embedding with a smooth one (with respect to frequencies). + kernel_size: kernel_size for encoder and decoder layers. + stride: stride for encoder and decoder layers. + time_stride: stride for the final time layer, after the merge. + context: context for 1x1 conv in the decoder. + context_enc: context for 1x1 conv in the encoder. + norm_starts: layer at which group norm starts being used. + decoder layers are numbered in reverse order. + norm_groups: number of groups for group norm. + dconv_mode: if 1: dconv in encoder only, 2: decoder only, 3: both. + dconv_depth: depth of residual DConv branch. + dconv_comp: compression of DConv branch. + dconv_attn: adds attention layers in DConv branch starting at this layer. + dconv_lstm: adds a LSTM layer in DConv branch starting at this layer. + dconv_init: initial scale for the DConv branch LayerScale. 
+ rescale: weight recaling trick + + """ + super().__init__() + + self.cac = cac + self.wiener_residual = wiener_residual + self.audio_channels = audio_channels + self.sources = sources + self.kernel_size = kernel_size + self.context = context + self.stride = stride + self.depth = depth + self.channels = channels + self.samplerate = samplerate + self.segment = segment + + self.nfft = nfft + self.hop_length = nfft // 4 + self.wiener_iters = wiener_iters + self.end_iters = end_iters + self.freq_emb = None + self.hybrid = hybrid + self.hybrid_old = hybrid_old + if hybrid_old: + assert hybrid, "hybrid_old must come with hybrid=True" + if hybrid: + assert wiener_iters == end_iters + + self.encoder = nn.ModuleList() + self.decoder = nn.ModuleList() + + if hybrid: + self.tencoder = nn.ModuleList() + self.tdecoder = nn.ModuleList() + + chin = audio_channels + chin_z = chin # number of channels for the freq branch + if self.cac: + chin_z *= 2 + chout = channels_time or channels + chout_z = channels + freqs = nfft // 2 + + for index in range(depth): + lstm = index >= dconv_lstm + attn = index >= dconv_attn + norm = index >= norm_starts + freq = freqs > 1 + stri = stride + ker = kernel_size + if not freq: + assert freqs == 1 + ker = time_stride * 2 + stri = time_stride + + pad = True + last_freq = False + if freq and freqs <= kernel_size: + ker = freqs + pad = False + last_freq = True + + kw = { + 'kernel_size': ker, + 'stride': stri, + 'freq': freq, + 'pad': pad, + 'norm': norm, + 'rewrite': rewrite, + 'norm_groups': norm_groups, + 'dconv_kw': { + 'lstm': lstm, + 'attn': attn, + 'depth': dconv_depth, + 'compress': dconv_comp, + 'init': dconv_init, + 'gelu': True, + } + } + kwt = dict(kw) + kwt['freq'] = 0 + kwt['kernel_size'] = kernel_size + kwt['stride'] = stride + kwt['pad'] = True + kw_dec = dict(kw) + multi = False + if multi_freqs and index < multi_freqs_depth: + multi = True + kw_dec['context_freq'] = False + + if last_freq: + chout_z = max(chout, chout_z) + chout = chout_z + + enc = HEncLayer(chin_z, chout_z, + dconv=dconv_mode & 1, context=context_enc, **kw) + if hybrid and freq: + tenc = HEncLayer(chin, chout, dconv=dconv_mode & 1, context=context_enc, + empty=last_freq, **kwt) + self.tencoder.append(tenc) + + if multi: + enc = MultiWrap(enc, multi_freqs) + self.encoder.append(enc) + if index == 0: + chin = self.audio_channels * len(self.sources) + chin_z = chin + if self.cac: + chin_z *= 2 + dec = HDecLayer(chout_z, chin_z, dconv=dconv_mode & 2, + last=index == 0, context=context, **kw_dec) + if multi: + dec = MultiWrap(dec, multi_freqs) + if hybrid and freq: + tdec = HDecLayer(chout, chin, dconv=dconv_mode & 2, empty=last_freq, + last=index == 0, context=context, **kwt) + self.tdecoder.insert(0, tdec) + self.decoder.insert(0, dec) + + chin = chout + chin_z = chout_z + chout = int(growth * chout) + chout_z = int(growth * chout_z) + if freq: + if freqs <= kernel_size: + freqs = 1 + else: + freqs //= stride + if index == 0 and freq_emb: + self.freq_emb = ScaledEmbedding( + freqs, chin_z, smooth=emb_smooth, scale=emb_scale) + self.freq_emb_scale = freq_emb + + if rescale: + rescale_module(self, reference=rescale) + + def _spec(self, x): + hl = self.hop_length + nfft = self.nfft + x0 = x # noqa + + if self.hybrid: + # We re-pad the signal in order to keep the property + # that the size of the output is exactly the size of the input + # divided by the stride (here hop_length), when divisible. + # This is achieved by padding by 1/4th of the kernel size (here nfft). 
+ # which is not supported by torch.stft. + # Having all convolution operations follow this convention allow to easily + # align the time and frequency branches later on. + assert hl == nfft // 4 + le = int(math.ceil(x.shape[-1] / hl)) + pad = hl // 2 * 3 + if not self.hybrid_old: + x = pad1d(x, (pad, pad + le * hl - x.shape[-1]), mode='reflect') + else: + x = pad1d(x, (pad, pad + le * hl - x.shape[-1])) + + z = spectro(x, nfft, hl)[..., :-1, :] + if self.hybrid: + assert z.shape[-1] == le + 4, (z.shape, x.shape, le) + z = z[..., 2:2+le] + return z + + def _ispec(self, z, length=None, scale=0): + hl = self.hop_length // (4 ** scale) + z = F.pad(z, (0, 0, 0, 1)) + if self.hybrid: + z = F.pad(z, (2, 2)) + pad = hl // 2 * 3 + if not self.hybrid_old: + le = hl * int(math.ceil(length / hl)) + 2 * pad + else: + le = hl * int(math.ceil(length / hl)) + x = ispectro(z, hl, length=le) + if not self.hybrid_old: + x = x[..., pad:pad + length] + else: + x = x[..., :length] + else: + x = ispectro(z, hl, length) + return x + + def _magnitude(self, z): + # return the magnitude of the spectrogram, except when cac is True, + # in which case we just move the complex dimension to the channel one. + if self.cac: + B, C, Fr, T = z.shape + m = torch.view_as_real(z).permute(0, 1, 4, 2, 3) + m = m.reshape(B, C * 2, Fr, T) + else: + m = z.abs() + return m + + def _mask(self, z, m): + # Apply masking given the mixture spectrogram `z` and the estimated mask `m`. + # If `cac` is True, `m` is actually a full spectrogram and `z` is ignored. + niters = self.wiener_iters + if self.cac: + B, S, C, Fr, T = m.shape + out = m.view(B, S, -1, 2, Fr, T).permute(0, 1, 2, 4, 5, 3) + out = torch.view_as_complex(out.contiguous()) + return out + if self.training: + niters = self.end_iters + if niters < 0: + z = z[:, None] + return z / (1e-8 + z.abs()) * m + else: + return self._wiener(m, z, niters) + + def _wiener(self, mag_out, mix_stft, niters): + # apply wiener filtering from OpenUnmix. + init = mix_stft.dtype + wiener_win_len = 300 + residual = self.wiener_residual + + B, S, C, Fq, T = mag_out.shape + mag_out = mag_out.permute(0, 4, 3, 2, 1) + mix_stft = torch.view_as_real(mix_stft.permute(0, 3, 2, 1)) + + outs = [] + for sample in range(B): + pos = 0 + out = [] + for pos in range(0, T, wiener_win_len): + frame = slice(pos, pos + wiener_win_len) + z_out = wiener( + mag_out[sample, frame], mix_stft[sample, frame], niters, + residual=residual) + out.append(z_out.transpose(-1, -2)) + outs.append(torch.cat(out, dim=0)) + out = torch.view_as_complex(torch.stack(outs, 0)) + out = out.permute(0, 4, 3, 2, 1).contiguous() + if residual: + out = out[:, :-1] + assert list(out.shape) == [B, S, C, Fq, T] + return out.to(init) + + def forward(self, mix): + x = mix + length = x.shape[-1] + + z = self._spec(mix) + mag = self._magnitude(z).to(mix.device) + x = mag + + B, C, Fq, T = x.shape + + # unlike previous Demucs, we always normalize because it is easier. + mean = x.mean(dim=(1, 2, 3), keepdim=True) + std = x.std(dim=(1, 2, 3), keepdim=True) + x = (x - mean) / (1e-5 + std) + # x will be the freq. branch input. + + if self.hybrid: + # Prepare the time branch input. + xt = mix + meant = xt.mean(dim=(1, 2), keepdim=True) + stdt = xt.std(dim=(1, 2), keepdim=True) + xt = (xt - meant) / (1e-5 + stdt) + + # okay, this is a giant mess I know... + saved = [] # skip connections, freq. + saved_t = [] # skip connections, time. + lengths = [] # saved lengths to properly remove padding, freq branch. + lengths_t = [] # saved lengths for time branch. 
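+        # Added for clarity: the loop below walks the frequency encoder layer by
+        # layer. While idx < len(self.tencoder) the matching time-branch encoder
+        # runs in parallel: its regular layers are stored as time skip connections,
+        # and the output of its final "empty" layer is instead injected into the
+        # frequency branch, which by then has the same stride.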
+ for idx, encode in enumerate(self.encoder): + lengths.append(x.shape[-1]) + inject = None + if self.hybrid and idx < len(self.tencoder): + # we have not yet merged branches. + lengths_t.append(xt.shape[-1]) + tenc = self.tencoder[idx] + xt = tenc(xt) + if not tenc.empty: + # save for skip connection + saved_t.append(xt) + else: + # tenc contains just the first conv., so that now time and freq. + # branches have the same shape and can be merged. + inject = xt + x = encode(x, inject) + if idx == 0 and self.freq_emb is not None: + # add frequency embedding to allow for non equivariant convolutions + # over the frequency axis. + frs = torch.arange(x.shape[-2], device=x.device) + emb = self.freq_emb(frs).t()[None, :, :, None].expand_as(x) + x = x + self.freq_emb_scale * emb + + saved.append(x) + + x = torch.zeros_like(x) + if self.hybrid: + xt = torch.zeros_like(x) + # initialize everything to zero (signal will go through u-net skips). + + for idx, decode in enumerate(self.decoder): + skip = saved.pop(-1) + x, pre = decode(x, skip, lengths.pop(-1)) + # `pre` contains the output just before final transposed convolution, + # which is used when the freq. and time branch separate. + + if self.hybrid: + offset = self.depth - len(self.tdecoder) + if self.hybrid and idx >= offset: + tdec = self.tdecoder[idx - offset] + length_t = lengths_t.pop(-1) + if tdec.empty: + assert pre.shape[2] == 1, pre.shape + pre = pre[:, :, 0] + xt, _ = tdec(pre, None, length_t) + else: + skip = saved_t.pop(-1) + xt, _ = tdec(xt, skip, length_t) + + # Let's make sure we used all stored skip connections. + assert len(saved) == 0 + assert len(lengths_t) == 0 + assert len(saved_t) == 0 + + S = len(self.sources) + x = x.view(B, S, -1, Fq, T) + x = x * std[:, None] + mean[:, None] + + # to cpu as non-cuda GPUs don't support complex numbers + # demucs issue #435 ##432 + # NOTE: in this case z already is on cpu + # TODO: remove this when mps supports complex numbers + + device_type = x.device.type + device_load = f"{device_type}:{x.device.index}" if not device_type == 'mps' else device_type + x_is_other_gpu = not device_type in ["cuda", "cpu"] + + if x_is_other_gpu: + x = x.cpu() + + zout = self._mask(z, x) + x = self._ispec(zout, length) + + # back to other device + if x_is_other_gpu: + x = x.to(device_load) + + if self.hybrid: + xt = xt.view(B, S, -1, length) + xt = xt * stdt[:, None] + meant[:, None] + x = xt + x + return x \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/demucs/htdemucs.py b/audio_separator/separator/uvr_lib_v5/demucs/htdemucs.py new file mode 100644 index 0000000..bad6b11 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/demucs/htdemucs.py @@ -0,0 +1,664 @@ +# Copyright (c) Meta, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# First author is Simon Rouard. +""" +This code contains the spectrogram and Hybrid version of Demucs. +""" +import math + +from .filtering import wiener +import torch +from torch import nn +from torch.nn import functional as F +from fractions import Fraction +from einops import rearrange + +from .transformer import CrossTransformerEncoder + +from .demucs import rescale_module +from .states import capture_init +from .spec import spectro, ispectro +from .hdemucs import pad1d, ScaledEmbedding, HEncLayer, MultiWrap, HDecLayer + + +class HTDemucs(nn.Module): + """ + Spectrogram and hybrid Demucs model. 
+ The spectrogram model has the same structure as Demucs, except the first few layers are over the + frequency axis, until there is only 1 frequency, and then it moves to time convolutions. + Frequency layers can still access information across time steps thanks to the DConv residual. + + Hybrid model have a parallel time branch. At some layer, the time branch has the same stride + as the frequency branch and then the two are combined. The opposite happens in the decoder. + + Models can either use naive iSTFT from masking, Wiener filtering ([Ulhih et al. 2017]), + or complex as channels (CaC) [Choi et al. 2020]. Wiener filtering is based on + Open Unmix implementation [Stoter et al. 2019]. + + The loss is always on the temporal domain, by backpropagating through the above + output methods and iSTFT. This allows to define hybrid models nicely. However, this breaks + a bit Wiener filtering, as doing more iteration at test time will change the spectrogram + contribution, without changing the one from the waveform, which will lead to worse performance. + I tried using the residual option in OpenUnmix Wiener implementation, but it didn't improve. + CaC on the other hand provides similar performance for hybrid, and works naturally with + hybrid models. + + This model also uses frequency embeddings are used to improve efficiency on convolutions + over the freq. axis, following [Isik et al. 2020] (https://arxiv.org/pdf/2008.04470.pdf). + + Unlike classic Demucs, there is no resampling here, and normalization is always applied. + """ + + @capture_init + def __init__( + self, + sources, + # Channels + audio_channels=2, + channels=48, + channels_time=None, + growth=2, + # STFT + nfft=4096, + wiener_iters=0, + end_iters=0, + wiener_residual=False, + cac=True, + # Main structure + depth=4, + rewrite=True, + # Frequency branch + multi_freqs=None, + multi_freqs_depth=3, + freq_emb=0.2, + emb_scale=10, + emb_smooth=True, + # Convolutions + kernel_size=8, + time_stride=2, + stride=4, + context=1, + context_enc=0, + # Normalization + norm_starts=4, + norm_groups=4, + # DConv residual branch + dconv_mode=1, + dconv_depth=2, + dconv_comp=8, + dconv_init=1e-3, + # Before the Transformer + bottom_channels=0, + # Transformer + t_layers=5, + t_emb="sin", + t_hidden_scale=4.0, + t_heads=8, + t_dropout=0.0, + t_max_positions=10000, + t_norm_in=True, + t_norm_in_group=False, + t_group_norm=False, + t_norm_first=True, + t_norm_out=True, + t_max_period=10000.0, + t_weight_decay=0.0, + t_lr=None, + t_layer_scale=True, + t_gelu=True, + t_weight_pos_embed=1.0, + t_sin_random_shift=0, + t_cape_mean_normalize=True, + t_cape_augment=True, + t_cape_glob_loc_scale=[5000.0, 1.0, 1.4], + t_sparse_self_attn=False, + t_sparse_cross_attn=False, + t_mask_type="diag", + t_mask_random_seed=42, + t_sparse_attn_window=500, + t_global_window=100, + t_sparsity=0.95, + t_auto_sparsity=False, + # ------ Particuliar parameters + t_cross_first=False, + # Weight init + rescale=0.1, + # Metadata + samplerate=44100, + segment=10, + use_train_segment=True, + ): + """ + Args: + sources (list[str]): list of source names. + audio_channels (int): input/output audio channels. + channels (int): initial number of hidden channels. + channels_time: if not None, use a different `channels` value for the time branch. + growth: increase the number of hidden channels by this factor at each layer. + nfft: number of fft bins. Note that changing this require careful computation of + various shape parameters and will not work out of the box for hybrid models. 
+ wiener_iters: when using Wiener filtering, number of iterations at test time. + end_iters: same but at train time. For a hybrid model, must be equal to `wiener_iters`. + wiener_residual: add residual source before wiener filtering. + cac: uses complex as channels, i.e. complex numbers are 2 channels each + in input and output. no further processing is done before ISTFT. + depth (int): number of layers in the encoder and in the decoder. + rewrite (bool): add 1x1 convolution to each layer. + multi_freqs: list of frequency ratios for splitting frequency bands with `MultiWrap`. + multi_freqs_depth: how many layers to wrap with `MultiWrap`. Only the outermost + layers will be wrapped. + freq_emb: add frequency embedding after the first frequency layer if > 0, + the actual value controls the weight of the embedding. + emb_scale: equivalent to scaling the embedding learning rate + emb_smooth: initialize the embedding with a smooth one (with respect to frequencies). + kernel_size: kernel_size for encoder and decoder layers. + stride: stride for encoder and decoder layers. + time_stride: stride for the final time layer, after the merge. + context: context for 1x1 conv in the decoder. + context_enc: context for 1x1 conv in the encoder. + norm_starts: layer at which group norm starts being used. + decoder layers are numbered in reverse order. + norm_groups: number of groups for group norm. + dconv_mode: if 1: dconv in encoder only, 2: decoder only, 3: both. + dconv_depth: depth of residual DConv branch. + dconv_comp: compression of DConv branch. + dconv_attn: adds attention layers in DConv branch starting at this layer. + dconv_lstm: adds a LSTM layer in DConv branch starting at this layer. + dconv_init: initial scale for the DConv branch LayerScale. + bottom_channels: if >0 it adds a linear layer (1x1 Conv) before and after the + transformer in order to change the number of channels + t_layers: number of layers in each branch (waveform and spec) of the transformer + t_emb: "sin", "cape" or "scaled" + t_hidden_scale: the hidden scale of the Feedforward parts of the transformer + for instance if C = 384 (the number of channels in the transformer) and + t_hidden_scale = 4.0 then the intermediate layer of the FFN has dimension + 384 * 4 = 1536 + t_heads: number of heads for the transformer + t_dropout: dropout in the transformer + t_max_positions: max_positions for the "scaled" positional embedding, only + useful if t_emb="scaled" + t_norm_in: (bool) norm before addinf positional embedding and getting into the + transformer layers + t_norm_in_group: (bool) if True while t_norm_in=True, the norm is on all the + timesteps (GroupNorm with group=1) + t_group_norm: (bool) if True, the norms of the Encoder Layers are on all the + timesteps (GroupNorm with group=1) + t_norm_first: (bool) if True the norm is before the attention and before the FFN + t_norm_out: (bool) if True, there is a GroupNorm (group=1) at the end of each layer + t_max_period: (float) denominator in the sinusoidal embedding expression + t_weight_decay: (float) weight decay for the transformer + t_lr: (float) specific learning rate for the transformer + t_layer_scale: (bool) Layer Scale for the transformer + t_gelu: (bool) activations of the transformer are GeLU if True, ReLU else + t_weight_pos_embed: (float) weighting of the positional embedding + t_cape_mean_normalize: (bool) if t_emb="cape", normalisation of positional embeddings + see: https://arxiv.org/abs/2106.03143 + t_cape_augment: (bool) if t_emb="cape", must be True during 
training and False + during the inference, see: https://arxiv.org/abs/2106.03143 + t_cape_glob_loc_scale: (list of 3 floats) if t_emb="cape", CAPE parameters + see: https://arxiv.org/abs/2106.03143 + t_sparse_self_attn: (bool) if True, the self attentions are sparse + t_sparse_cross_attn: (bool) if True, the cross-attentions are sparse (don't use it + unless you designed really specific masks) + t_mask_type: (str) can be "diag", "jmask", "random", "global" or any combination + with '_' between: i.e. "diag_jmask_random" (note that this is permutation + invariant i.e. "diag_jmask_random" is equivalent to "jmask_random_diag") + t_mask_random_seed: (int) if "random" is in t_mask_type, controls the seed + that generated the random part of the mask + t_sparse_attn_window: (int) if "diag" is in t_mask_type, for a query (i), and + a key (j), the mask is True id |i-j|<=t_sparse_attn_window + t_global_window: (int) if "global" is in t_mask_type, mask[:t_global_window, :] + and mask[:, :t_global_window] will be True + t_sparsity: (float) if "random" is in t_mask_type, t_sparsity is the sparsity + level of the random part of the mask. + t_cross_first: (bool) if True cross attention is the first layer of the + transformer (False seems to be better) + rescale: weight rescaling trick + use_train_segment: (bool) if True, the actual size that is used during the + training is used during inference. + """ + super().__init__() + self.cac = cac + self.wiener_residual = wiener_residual + self.audio_channels = audio_channels + self.sources = sources + self.kernel_size = kernel_size + self.context = context + self.stride = stride + self.depth = depth + self.bottom_channels = bottom_channels + self.channels = channels + self.samplerate = samplerate + self.segment = segment + self.use_train_segment = use_train_segment + self.nfft = nfft + self.hop_length = nfft // 4 + self.wiener_iters = wiener_iters + self.end_iters = end_iters + self.freq_emb = None + assert wiener_iters == end_iters + + self.encoder = nn.ModuleList() + self.decoder = nn.ModuleList() + + self.tencoder = nn.ModuleList() + self.tdecoder = nn.ModuleList() + + chin = audio_channels + chin_z = chin # number of channels for the freq branch + if self.cac: + chin_z *= 2 + chout = channels_time or channels + chout_z = channels + freqs = nfft // 2 + + for index in range(depth): + norm = index >= norm_starts + freq = freqs > 1 + stri = stride + ker = kernel_size + if not freq: + assert freqs == 1 + ker = time_stride * 2 + stri = time_stride + + pad = True + last_freq = False + if freq and freqs <= kernel_size: + ker = freqs + pad = False + last_freq = True + + kw = { + "kernel_size": ker, + "stride": stri, + "freq": freq, + "pad": pad, + "norm": norm, + "rewrite": rewrite, + "norm_groups": norm_groups, + "dconv_kw": { + "depth": dconv_depth, + "compress": dconv_comp, + "init": dconv_init, + "gelu": True, + }, + } + kwt = dict(kw) + kwt["freq"] = 0 + kwt["kernel_size"] = kernel_size + kwt["stride"] = stride + kwt["pad"] = True + kw_dec = dict(kw) + multi = False + if multi_freqs and index < multi_freqs_depth: + multi = True + kw_dec["context_freq"] = False + + if last_freq: + chout_z = max(chout, chout_z) + chout = chout_z + + enc = HEncLayer( + chin_z, chout_z, dconv=dconv_mode & 1, context=context_enc, **kw + ) + if freq: + tenc = HEncLayer( + chin, + chout, + dconv=dconv_mode & 1, + context=context_enc, + empty=last_freq, + **kwt + ) + self.tencoder.append(tenc) + + if multi: + enc = MultiWrap(enc, multi_freqs) + self.encoder.append(enc) + if index == 0: 
+ chin = self.audio_channels * len(self.sources) + chin_z = chin + if self.cac: + chin_z *= 2 + dec = HDecLayer( + chout_z, + chin_z, + dconv=dconv_mode & 2, + last=index == 0, + context=context, + **kw_dec + ) + if multi: + dec = MultiWrap(dec, multi_freqs) + if freq: + tdec = HDecLayer( + chout, + chin, + dconv=dconv_mode & 2, + empty=last_freq, + last=index == 0, + context=context, + **kwt + ) + self.tdecoder.insert(0, tdec) + self.decoder.insert(0, dec) + + chin = chout + chin_z = chout_z + chout = int(growth * chout) + chout_z = int(growth * chout_z) + if freq: + if freqs <= kernel_size: + freqs = 1 + else: + freqs //= stride + if index == 0 and freq_emb: + self.freq_emb = ScaledEmbedding( + freqs, chin_z, smooth=emb_smooth, scale=emb_scale + ) + self.freq_emb_scale = freq_emb + + if rescale: + rescale_module(self, reference=rescale) + + transformer_channels = channels * growth ** (depth - 1) + if bottom_channels: + self.channel_upsampler = nn.Conv1d(transformer_channels, bottom_channels, 1) + self.channel_downsampler = nn.Conv1d( + bottom_channels, transformer_channels, 1 + ) + self.channel_upsampler_t = nn.Conv1d( + transformer_channels, bottom_channels, 1 + ) + self.channel_downsampler_t = nn.Conv1d( + bottom_channels, transformer_channels, 1 + ) + + transformer_channels = bottom_channels + + if t_layers > 0: + self.crosstransformer = CrossTransformerEncoder( + dim=transformer_channels, + emb=t_emb, + hidden_scale=t_hidden_scale, + num_heads=t_heads, + num_layers=t_layers, + cross_first=t_cross_first, + dropout=t_dropout, + max_positions=t_max_positions, + norm_in=t_norm_in, + norm_in_group=t_norm_in_group, + group_norm=t_group_norm, + norm_first=t_norm_first, + norm_out=t_norm_out, + max_period=t_max_period, + weight_decay=t_weight_decay, + lr=t_lr, + layer_scale=t_layer_scale, + gelu=t_gelu, + sin_random_shift=t_sin_random_shift, + weight_pos_embed=t_weight_pos_embed, + cape_mean_normalize=t_cape_mean_normalize, + cape_augment=t_cape_augment, + cape_glob_loc_scale=t_cape_glob_loc_scale, + sparse_self_attn=t_sparse_self_attn, + sparse_cross_attn=t_sparse_cross_attn, + mask_type=t_mask_type, + mask_random_seed=t_mask_random_seed, + sparse_attn_window=t_sparse_attn_window, + global_window=t_global_window, + sparsity=t_sparsity, + auto_sparsity=t_auto_sparsity, + ) + else: + self.crosstransformer = None + + def _spec(self, x): + hl = self.hop_length + nfft = self.nfft + x0 = x # noqa + + # We re-pad the signal in order to keep the property + # that the size of the output is exactly the size of the input + # divided by the stride (here hop_length), when divisible. + # This is achieved by padding by 1/4th of the kernel size (here nfft). + # which is not supported by torch.stft. + # Having all convolution operations follow this convention allow to easily + # align the time and frequency branches later on. 
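+        # Added for clarity, with illustrative numbers for the defaults nfft=4096,
+        # hl=1024: the input is reflect-padded by pad = 3 * hl // 2 on the left and
+        # by at least as much on the right (rounding the length up to a multiple of
+        # hl), so the STFT yields ceil(length / hl) + 4 frames; the two extra frames
+        # on each side are trimmed right after the assertion below.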
+ assert hl == nfft // 4 + le = int(math.ceil(x.shape[-1] / hl)) + pad = hl // 2 * 3 + x = pad1d(x, (pad, pad + le * hl - x.shape[-1]), mode="reflect") + + z = spectro(x, nfft, hl)[..., :-1, :] + assert z.shape[-1] == le + 4, (z.shape, x.shape, le) + z = z[..., 2: 2 + le] + return z + + def _ispec(self, z, length=None, scale=0): + hl = self.hop_length // (4**scale) + z = F.pad(z, (0, 0, 0, 1)) + z = F.pad(z, (2, 2)) + pad = hl // 2 * 3 + le = hl * int(math.ceil(length / hl)) + 2 * pad + x = ispectro(z, hl, length=le) + x = x[..., pad: pad + length] + return x + + def _magnitude(self, z): + # return the magnitude of the spectrogram, except when cac is True, + # in which case we just move the complex dimension to the channel one. + if self.cac: + B, C, Fr, T = z.shape + m = torch.view_as_real(z).permute(0, 1, 4, 2, 3) + m = m.reshape(B, C * 2, Fr, T) + else: + m = z.abs() + return m + + def _mask(self, z, m): + # Apply masking given the mixture spectrogram `z` and the estimated mask `m`. + # If `cac` is True, `m` is actually a full spectrogram and `z` is ignored. + niters = self.wiener_iters + if self.cac: + B, S, C, Fr, T = m.shape + out = m.view(B, S, -1, 2, Fr, T).permute(0, 1, 2, 4, 5, 3) + out = torch.view_as_complex(out.contiguous()) + return out + if self.training: + niters = self.end_iters + if niters < 0: + z = z[:, None] + return z / (1e-8 + z.abs()) * m + else: + return self._wiener(m, z, niters) + + def _wiener(self, mag_out, mix_stft, niters): + # apply wiener filtering from OpenUnmix. + init = mix_stft.dtype + wiener_win_len = 300 + residual = self.wiener_residual + + B, S, C, Fq, T = mag_out.shape + mag_out = mag_out.permute(0, 4, 3, 2, 1) + mix_stft = torch.view_as_real(mix_stft.permute(0, 3, 2, 1)) + + outs = [] + for sample in range(B): + pos = 0 + out = [] + for pos in range(0, T, wiener_win_len): + frame = slice(pos, pos + wiener_win_len) + z_out = wiener( + mag_out[sample, frame], + mix_stft[sample, frame], + niters, + residual=residual, + ) + out.append(z_out.transpose(-1, -2)) + outs.append(torch.cat(out, dim=0)) + out = torch.view_as_complex(torch.stack(outs, 0)) + out = out.permute(0, 4, 3, 2, 1).contiguous() + if residual: + out = out[:, :-1] + assert list(out.shape) == [B, S, C, Fq, T] + return out.to(init) + + def valid_length(self, length: int): + """ + Return a length that is appropriate for evaluation. + In our case, always return the training length, unless + it is smaller than the given length, in which case this + raises an error. + """ + if not self.use_train_segment: + return length + training_length = int(self.segment * self.samplerate) + if training_length < length: + raise ValueError( + f"Given length {length} is longer than " + f"training length {training_length}") + return training_length + + def forward(self, mix): + length = mix.shape[-1] + length_pre_pad = None + if self.use_train_segment: + if self.training: + self.segment = Fraction(mix.shape[-1], self.samplerate) + else: + training_length = int(self.segment * self.samplerate) + if mix.shape[-1] < training_length: + length_pre_pad = mix.shape[-1] + mix = F.pad(mix, (0, training_length - length_pre_pad)) + z = self._spec(mix) + mag = self._magnitude(z).to(mix.device) + x = mag + + B, C, Fq, T = x.shape + + # unlike previous Demucs, we always normalize because it is easier. + mean = x.mean(dim=(1, 2, 3), keepdim=True) + std = x.std(dim=(1, 2, 3), keepdim=True) + x = (x - mean) / (1e-5 + std) + # x will be the freq. branch input. + + # Prepare the time branch input. 
+ xt = mix + meant = xt.mean(dim=(1, 2), keepdim=True) + stdt = xt.std(dim=(1, 2), keepdim=True) + xt = (xt - meant) / (1e-5 + stdt) + + # okay, this is a giant mess I know... + saved = [] # skip connections, freq. + saved_t = [] # skip connections, time. + lengths = [] # saved lengths to properly remove padding, freq branch. + lengths_t = [] # saved lengths for time branch. + for idx, encode in enumerate(self.encoder): + lengths.append(x.shape[-1]) + inject = None + if idx < len(self.tencoder): + # we have not yet merged branches. + lengths_t.append(xt.shape[-1]) + tenc = self.tencoder[idx] + xt = tenc(xt) + if not tenc.empty: + # save for skip connection + saved_t.append(xt) + else: + # tenc contains just the first conv., so that now time and freq. + # branches have the same shape and can be merged. + inject = xt + x = encode(x, inject) + if idx == 0 and self.freq_emb is not None: + # add frequency embedding to allow for non equivariant convolutions + # over the frequency axis. + frs = torch.arange(x.shape[-2], device=x.device) + emb = self.freq_emb(frs).t()[None, :, :, None].expand_as(x) + x = x + self.freq_emb_scale * emb + + saved.append(x) + if self.crosstransformer: + if self.bottom_channels: + b, c, f, t = x.shape + x = rearrange(x, "b c f t-> b c (f t)") + x = self.channel_upsampler(x) + x = rearrange(x, "b c (f t)-> b c f t", f=f) + xt = self.channel_upsampler_t(xt) + + x, xt = self.crosstransformer(x, xt) + + if self.bottom_channels: + x = rearrange(x, "b c f t-> b c (f t)") + x = self.channel_downsampler(x) + x = rearrange(x, "b c (f t)-> b c f t", f=f) + xt = self.channel_downsampler_t(xt) + + for idx, decode in enumerate(self.decoder): + skip = saved.pop(-1) + x, pre = decode(x, skip, lengths.pop(-1)) + # `pre` contains the output just before final transposed convolution, + # which is used when the freq. and time branch separate. + + offset = self.depth - len(self.tdecoder) + if idx >= offset: + tdec = self.tdecoder[idx - offset] + length_t = lengths_t.pop(-1) + if tdec.empty: + assert pre.shape[2] == 1, pre.shape + pre = pre[:, :, 0] + xt, _ = tdec(pre, None, length_t) + else: + skip = saved_t.pop(-1) + xt, _ = tdec(xt, skip, length_t) + + # Let's make sure we used all stored skip connections. 
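The `bottom_channels` branch above flattens the (B, C, F, T) frequency-branch tensor so a plain 1x1 Conv1d can change the channel count before and after the cross-transformer. A small shape walk-through with made-up sizes (not the real model dimensions):

import torch
from torch import nn
from einops import rearrange

B, C, Fq, T, bottom = 2, 48, 8, 100, 64            # toy sizes
x = torch.randn(B, C, Fq, T)
upsampler = nn.Conv1d(C, bottom, 1)                # plays the role of channel_upsampler
downsampler = nn.Conv1d(bottom, C, 1)              # plays the role of channel_downsampler

flat = rearrange(x, "b c f t -> b c (f t)")
flat = upsampler(flat)
y = rearrange(flat, "b c (f t) -> b c f t", f=Fq)
assert y.shape == (B, bottom, Fq, T)               # wider channels for the transformer

flat = rearrange(y, "b c f t -> b c (f t)")
flat = downsampler(flat)
y = rearrange(flat, "b c (f t) -> b c f t", f=Fq)
assert y.shape == x.shape                          # back to the decoder's channel count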
+ assert len(saved) == 0 + assert len(lengths_t) == 0 + assert len(saved_t) == 0 + + S = len(self.sources) + x = x.view(B, S, -1, Fq, T) + x = x * std[:, None] + mean[:, None] + + # to cpu as non-cuda GPUs don't support complex numbers + # demucs issue #435 ##432 + # NOTE: in this case z already is on cpu + # TODO: remove this when mps supports complex numbers + + device_type = x.device.type + device_load = f"{device_type}:{x.device.index}" if not device_type == 'mps' else device_type + x_is_other_gpu = not device_type in ["cuda", "cpu"] + + if x_is_other_gpu: + x = x.cpu() + + zout = self._mask(z, x) + if self.use_train_segment: + if self.training: + x = self._ispec(zout, length) + else: + x = self._ispec(zout, training_length) + else: + x = self._ispec(zout, length) + + # back to other device + if x_is_other_gpu: + x = x.to(device_load) + + if self.use_train_segment: + if self.training: + xt = xt.view(B, S, -1, length) + else: + xt = xt.view(B, S, -1, training_length) + else: + xt = xt.view(B, S, -1, length) + xt = xt * stdt[:, None] + meant[:, None] + x = xt + x + if length_pre_pad: + x = x[..., :length_pre_pad] + return x diff --git a/audio_separator/separator/uvr_lib_v5/demucs/model.py b/audio_separator/separator/uvr_lib_v5/demucs/model.py new file mode 100644 index 0000000..e2745b8 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/demucs/model.py @@ -0,0 +1,218 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import math + +import torch as th +from torch import nn + +from .utils import capture_init, center_trim + + +class BLSTM(nn.Module): + def __init__(self, dim, layers=1): + super().__init__() + self.lstm = nn.LSTM(bidirectional=True, num_layers=layers, hidden_size=dim, input_size=dim) + self.linear = nn.Linear(2 * dim, dim) + + def forward(self, x): + x = x.permute(2, 0, 1) + x = self.lstm(x)[0] + x = self.linear(x) + x = x.permute(1, 2, 0) + return x + + +def rescale_conv(conv, reference): + std = conv.weight.std().detach() + scale = (std / reference)**0.5 + conv.weight.data /= scale + if conv.bias is not None: + conv.bias.data /= scale + + +def rescale_module(module, reference): + for sub in module.modules(): + if isinstance(sub, (nn.Conv1d, nn.ConvTranspose1d)): + rescale_conv(sub, reference) + + +def upsample(x, stride): + """ + Linear upsampling, the output will be `stride` times longer. + """ + batch, channels, time = x.size() + weight = th.arange(stride, device=x.device, dtype=th.float) / stride + x = x.view(batch, channels, time, 1) + out = x[..., :-1, :] * (1 - weight) + x[..., 1:, :] * weight + return out.reshape(batch, channels, -1) + + +def downsample(x, stride): + """ + Downsample x by decimation. + """ + return x[:, :, ::stride] + + +class Demucs(nn.Module): + @capture_init + def __init__(self, + sources=4, + audio_channels=2, + channels=64, + depth=6, + rewrite=True, + glu=True, + upsample=False, + rescale=0.1, + kernel_size=8, + stride=4, + growth=2., + lstm_layers=2, + context=3, + samplerate=44100): + """ + Args: + sources (int): number of sources to separate + audio_channels (int): stereo or mono + channels (int): first convolution channels + depth (int): number of encoder/decoder layers + rewrite (bool): add 1x1 convolution to each encoder layer + and a convolution to each decoder layer. + For the decoder layer, `context` gives the kernel size. 
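`rescale_conv` in the new model.py divides each weight tensor by sqrt(std / reference), which lands its standard deviation at the geometric mean of the initial value and `reference`, i.e. pulled towards `reference`. A quick standalone check with a toy layer (not taken from the patch):

import torch
from torch import nn

torch.manual_seed(0)
conv = nn.Conv1d(16, 16, 8)
reference = 0.1

std = conv.weight.std().detach()
conv.weight.data /= (std / reference) ** 0.5       # same operation as rescale_conv
# the new std equals sqrt(std * reference)
print(float(std), float(conv.weight.std()), float((std * reference) ** 0.5))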
+ glu (bool): use glu instead of ReLU + upsample (bool): use linear upsampling with convolutions + Wave-U-Net style, instead of transposed convolutions + rescale (int): rescale initial weights of convolutions + to get their standard deviation closer to `rescale` + kernel_size (int): kernel size for convolutions + stride (int): stride for convolutions + growth (float): multiply (resp divide) number of channels by that + for each layer of the encoder (resp decoder) + lstm_layers (int): number of lstm layers, 0 = no lstm + context (int): kernel size of the convolution in the + decoder before the transposed convolution. If > 1, + will provide some context from neighboring time + steps. + """ + + super().__init__() + self.audio_channels = audio_channels + self.sources = sources + self.kernel_size = kernel_size + self.context = context + self.stride = stride + self.depth = depth + self.upsample = upsample + self.channels = channels + self.samplerate = samplerate + + self.encoder = nn.ModuleList() + self.decoder = nn.ModuleList() + + self.final = None + if upsample: + self.final = nn.Conv1d(channels + audio_channels, sources * audio_channels, 1) + stride = 1 + + if glu: + activation = nn.GLU(dim=1) + ch_scale = 2 + else: + activation = nn.ReLU() + ch_scale = 1 + in_channels = audio_channels + for index in range(depth): + encode = [] + encode += [nn.Conv1d(in_channels, channels, kernel_size, stride), nn.ReLU()] + if rewrite: + encode += [nn.Conv1d(channels, ch_scale * channels, 1), activation] + self.encoder.append(nn.Sequential(*encode)) + + decode = [] + if index > 0: + out_channels = in_channels + else: + if upsample: + out_channels = channels + else: + out_channels = sources * audio_channels + if rewrite: + decode += [nn.Conv1d(channels, ch_scale * channels, context), activation] + if upsample: + decode += [ + nn.Conv1d(channels, out_channels, kernel_size, stride=1), + ] + else: + decode += [nn.ConvTranspose1d(channels, out_channels, kernel_size, stride)] + if index > 0: + decode.append(nn.ReLU()) + self.decoder.insert(0, nn.Sequential(*decode)) + in_channels = channels + channels = int(growth * channels) + + channels = in_channels + + if lstm_layers: + self.lstm = BLSTM(channels, lstm_layers) + else: + self.lstm = None + + if rescale: + rescale_module(self, reference=rescale) + + def valid_length(self, length): + """ + Return the nearest valid length to use with the model so that + there is no time steps left over in a convolutions, e.g. for all + layers, size of the input - kernel_size % stride = 0. + + If the mixture has a valid length, the estimated sources + will have exactly the same length when context = 1. If context > 1, + the two signals can be center trimmed to match. + + For training, extracts should have a valid length.For evaluation + on full tracks we recommend passing `pad = True` to :method:`forward`. 
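The `ch_scale = 2` above exists because nn.GLU(dim=1) halves the channel count: it splits its input in two along the channel axis and gates one half with the other, so the 1x1 rewrite convolution must emit twice as many channels for the block to keep `channels` outputs. A minimal illustration with toy sizes:

import torch
from torch import nn

channels, time = 64, 100
rewrite = nn.Conv1d(channels, 2 * channels, 1)     # why ch_scale == 2 when glu is used
glu = nn.GLU(dim=1)                                # splits channels in half and gates

x = torch.randn(1, channels, time)
y = glu(rewrite(x))
print(y.shape)                                     # torch.Size([1, 64, 100])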
+ """ + for _ in range(self.depth): + if self.upsample: + length = math.ceil(length / self.stride) + self.kernel_size - 1 + else: + length = math.ceil((length - self.kernel_size) / self.stride) + 1 + length = max(1, length) + length += self.context - 1 + for _ in range(self.depth): + if self.upsample: + length = length * self.stride + self.kernel_size - 1 + else: + length = (length - 1) * self.stride + self.kernel_size + + return int(length) + + def forward(self, mix): + x = mix + saved = [x] + for encode in self.encoder: + x = encode(x) + saved.append(x) + if self.upsample: + x = downsample(x, self.stride) + if self.lstm: + x = self.lstm(x) + for decode in self.decoder: + if self.upsample: + x = upsample(x, stride=self.stride) + skip = center_trim(saved.pop(-1), x) + x = x + skip + x = decode(x) + if self.final: + skip = center_trim(saved.pop(-1), x) + x = th.cat([x, skip], dim=1) + x = self.final(x) + + x = x.view(x.size(0), self.sources, self.audio_channels, x.size(-1)) + return x diff --git a/audio_separator/separator/uvr_lib_v5/demucs/model_v2.py b/audio_separator/separator/uvr_lib_v5/demucs/model_v2.py new file mode 100644 index 0000000..db43fc5 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/demucs/model_v2.py @@ -0,0 +1,218 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import math + +import julius +from torch import nn +from .tasnet_v2 import ConvTasNet + +from .utils import capture_init, center_trim + + +class BLSTM(nn.Module): + def __init__(self, dim, layers=1): + super().__init__() + self.lstm = nn.LSTM(bidirectional=True, num_layers=layers, hidden_size=dim, input_size=dim) + self.linear = nn.Linear(2 * dim, dim) + + def forward(self, x): + x = x.permute(2, 0, 1) + x = self.lstm(x)[0] + x = self.linear(x) + x = x.permute(1, 2, 0) + return x + + +def rescale_conv(conv, reference): + std = conv.weight.std().detach() + scale = (std / reference)**0.5 + conv.weight.data /= scale + if conv.bias is not None: + conv.bias.data /= scale + + +def rescale_module(module, reference): + for sub in module.modules(): + if isinstance(sub, (nn.Conv1d, nn.ConvTranspose1d)): + rescale_conv(sub, reference) + +def auto_load_demucs_model_v2(sources, demucs_model_name): + + if '48' in demucs_model_name: + channels=48 + elif 'unittest' in demucs_model_name: + channels=4 + else: + channels=64 + + if 'tasnet' in demucs_model_name: + init_demucs_model = ConvTasNet(sources, X=10) + else: + init_demucs_model = Demucs(sources, channels=channels) + + return init_demucs_model + +class Demucs(nn.Module): + @capture_init + def __init__(self, + sources, + audio_channels=2, + channels=64, + depth=6, + rewrite=True, + glu=True, + rescale=0.1, + resample=True, + kernel_size=8, + stride=4, + growth=2., + lstm_layers=2, + context=3, + normalize=False, + samplerate=44100, + segment_length=4 * 10 * 44100): + """ + Args: + sources (list[str]): list of source names + audio_channels (int): stereo or mono + channels (int): first convolution channels + depth (int): number of encoder/decoder layers + rewrite (bool): add 1x1 convolution to each encoder layer + and a convolution to each decoder layer. + For the decoder layer, `context` gives the kernel size. + glu (bool): use glu instead of ReLU + resample_input (bool): upsample x2 the input and downsample /2 the output. 
+ rescale (int): rescale initial weights of convolutions + to get their standard deviation closer to `rescale` + kernel_size (int): kernel size for convolutions + stride (int): stride for convolutions + growth (float): multiply (resp divide) number of channels by that + for each layer of the encoder (resp decoder) + lstm_layers (int): number of lstm layers, 0 = no lstm + context (int): kernel size of the convolution in the + decoder before the transposed convolution. If > 1, + will provide some context from neighboring time + steps. + samplerate (int): stored as meta information for easing + future evaluations of the model. + segment_length (int): stored as meta information for easing + future evaluations of the model. Length of the segments on which + the model was trained. + """ + + super().__init__() + self.audio_channels = audio_channels + self.sources = sources + self.kernel_size = kernel_size + self.context = context + self.stride = stride + self.depth = depth + self.resample = resample + self.channels = channels + self.normalize = normalize + self.samplerate = samplerate + self.segment_length = segment_length + + self.encoder = nn.ModuleList() + self.decoder = nn.ModuleList() + + if glu: + activation = nn.GLU(dim=1) + ch_scale = 2 + else: + activation = nn.ReLU() + ch_scale = 1 + in_channels = audio_channels + for index in range(depth): + encode = [] + encode += [nn.Conv1d(in_channels, channels, kernel_size, stride), nn.ReLU()] + if rewrite: + encode += [nn.Conv1d(channels, ch_scale * channels, 1), activation] + self.encoder.append(nn.Sequential(*encode)) + + decode = [] + if index > 0: + out_channels = in_channels + else: + out_channels = len(self.sources) * audio_channels + if rewrite: + decode += [nn.Conv1d(channels, ch_scale * channels, context), activation] + decode += [nn.ConvTranspose1d(channels, out_channels, kernel_size, stride)] + if index > 0: + decode.append(nn.ReLU()) + self.decoder.insert(0, nn.Sequential(*decode)) + in_channels = channels + channels = int(growth * channels) + + channels = in_channels + + if lstm_layers: + self.lstm = BLSTM(channels, lstm_layers) + else: + self.lstm = None + + if rescale: + rescale_module(self, reference=rescale) + + def valid_length(self, length): + """ + Return the nearest valid length to use with the model so that + there is no time steps left over in a convolutions, e.g. for all + layers, size of the input - kernel_size % stride = 0. + + If the mixture has a valid length, the estimated sources + will have exactly the same length when context = 1. If context > 1, + the two signals can be center trimmed to match. + + For training, extracts should have a valid length.For evaluation + on full tracks we recommend passing `pad = True` to :method:`forward`. 
+ """ + if self.resample: + length *= 2 + for _ in range(self.depth): + length = math.ceil((length - self.kernel_size) / self.stride) + 1 + length = max(1, length) + length += self.context - 1 + for _ in range(self.depth): + length = (length - 1) * self.stride + self.kernel_size + + if self.resample: + length = math.ceil(length / 2) + return int(length) + + def forward(self, mix): + x = mix + + if self.normalize: + mono = mix.mean(dim=1, keepdim=True) + mean = mono.mean(dim=-1, keepdim=True) + std = mono.std(dim=-1, keepdim=True) + else: + mean = 0 + std = 1 + + x = (x - mean) / (1e-5 + std) + + if self.resample: + x = julius.resample_frac(x, 1, 2) + + saved = [] + for encode in self.encoder: + x = encode(x) + saved.append(x) + if self.lstm: + x = self.lstm(x) + for decode in self.decoder: + skip = center_trim(saved.pop(-1), x) + x = x + skip + x = decode(x) + + if self.resample: + x = julius.resample_frac(x, 2, 1) + x = x * std + mean + x = x.view(x.size(0), len(self.sources), self.audio_channels, x.size(-1)) + return x diff --git a/audio_separator/separator/uvr_lib_v5/demucs/pretrained.py b/audio_separator/separator/uvr_lib_v5/demucs/pretrained.py new file mode 100644 index 0000000..25aa685 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/demucs/pretrained.py @@ -0,0 +1,180 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +"""Loading pretrained models. +""" + +import logging +from pathlib import Path +import typing as tp + +#from dora.log import fatal + +import logging + +from diffq import DiffQuantizer +import torch.hub + +from .model import Demucs +from .tasnet_v2 import ConvTasNet +from .utils import set_state + +from .hdemucs import HDemucs +from .repo import RemoteRepo, LocalRepo, ModelOnlyRepo, BagOnlyRepo, AnyModelRepo, ModelLoadingError # noqa + +logger = logging.getLogger(__name__) +ROOT_URL = "https://dl.fbaipublicfiles.com/demucs/mdx_final/" +REMOTE_ROOT = Path(__file__).parent / 'remote' + +SOURCES = ["drums", "bass", "other", "vocals"] + + +def demucs_unittest(): + model = HDemucs(channels=4, sources=SOURCES) + return model + + +def add_model_flags(parser): + group = parser.add_mutually_exclusive_group(required=False) + group.add_argument("-s", "--sig", help="Locally trained XP signature.") + group.add_argument("-n", "--name", default="mdx_extra_q", + help="Pretrained model name or signature. Default is mdx_extra_q.") + parser.add_argument("--repo", type=Path, + help="Folder containing all pre-trained models for use with -n.") + + +def _parse_remote_files(remote_file_list) -> tp.Dict[str, str]: + root: str = '' + models: tp.Dict[str, str] = {} + for line in remote_file_list.read_text().split('\n'): + line = line.strip() + if line.startswith('#'): + continue + elif line.startswith('root:'): + root = line.split(':', 1)[1].strip() + else: + sig = line.split('-', 1)[0] + assert sig not in models + models[sig] = ROOT_URL + root + line + return models + +def get_model(name: str, + repo: tp.Optional[Path] = None): + """`name` must be a bag of models name or a pretrained signature + from the remote AWS model repo or the specified local repo if `repo` is not None. 
+ """ + if name == 'demucs_unittest': + return demucs_unittest() + model_repo: ModelOnlyRepo + if repo is None: + models = _parse_remote_files(REMOTE_ROOT / 'files.txt') + model_repo = RemoteRepo(models) + bag_repo = BagOnlyRepo(REMOTE_ROOT, model_repo) + else: + if not repo.is_dir(): + fatal(f"{repo} must exist and be a directory.") + model_repo = LocalRepo(repo) + bag_repo = BagOnlyRepo(repo, model_repo) + any_repo = AnyModelRepo(model_repo, bag_repo) + model = any_repo.get_model(name) + model.eval() + return model + +def get_model_from_args(args): + """ + Load local model package or pre-trained model. + """ + return get_model(name=args.name, repo=args.repo) + +logger = logging.getLogger(__name__) +ROOT = "https://dl.fbaipublicfiles.com/demucs/v3.0/" + +PRETRAINED_MODELS = { + 'demucs': 'e07c671f', + 'demucs48_hq': '28a1282c', + 'demucs_extra': '3646af93', + 'demucs_quantized': '07afea75', + 'tasnet': 'beb46fac', + 'tasnet_extra': 'df3777b2', + 'demucs_unittest': '09ebc15f', +} + +SOURCES = ["drums", "bass", "other", "vocals"] + + +def get_url(name): + sig = PRETRAINED_MODELS[name] + return ROOT + name + "-" + sig[:8] + ".th" + +def is_pretrained(name): + return name in PRETRAINED_MODELS + + +def load_pretrained(name): + if name == "demucs": + return demucs(pretrained=True) + elif name == "demucs48_hq": + return demucs(pretrained=True, hq=True, channels=48) + elif name == "demucs_extra": + return demucs(pretrained=True, extra=True) + elif name == "demucs_quantized": + return demucs(pretrained=True, quantized=True) + elif name == "demucs_unittest": + return demucs_unittest(pretrained=True) + elif name == "tasnet": + return tasnet(pretrained=True) + elif name == "tasnet_extra": + return tasnet(pretrained=True, extra=True) + else: + raise ValueError(f"Invalid pretrained name {name}") + + +def _load_state(name, model, quantizer=None): + url = get_url(name) + state = torch.hub.load_state_dict_from_url(url, map_location='cpu', check_hash=True) + set_state(model, quantizer, state) + if quantizer: + quantizer.detach() + + +def demucs_unittest(pretrained=True): + model = Demucs(channels=4, sources=SOURCES) + if pretrained: + _load_state('demucs_unittest', model) + return model + + +def demucs(pretrained=True, extra=False, quantized=False, hq=False, channels=64): + if not pretrained and (extra or quantized or hq): + raise ValueError("if extra or quantized is True, pretrained must be True.") + model = Demucs(sources=SOURCES, channels=channels) + if pretrained: + name = 'demucs' + if channels != 64: + name += str(channels) + quantizer = None + if sum([extra, quantized, hq]) > 1: + raise ValueError("Only one of extra, quantized, hq, can be True.") + if quantized: + quantizer = DiffQuantizer(model, group_size=8, min_size=1) + name += '_quantized' + if extra: + name += '_extra' + if hq: + name += '_hq' + _load_state(name, model, quantizer) + return model + + +def tasnet(pretrained=True, extra=False): + if not pretrained and extra: + raise ValueError("if extra is True, pretrained must be True.") + model = ConvTasNet(X=10, sources=SOURCES) + if pretrained: + name = 'tasnet' + if extra: + name = 'tasnet_extra' + _load_state(name, model) + return model \ No newline at end of file diff --git a/audio_separator/separator/uvr_lib_v5/demucs/repo.py b/audio_separator/separator/uvr_lib_v5/demucs/repo.py new file mode 100644 index 0000000..65ff6b3 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/demucs/repo.py @@ -0,0 +1,148 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. 
+# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +"""Represents a model repository, including pre-trained models and bags of models. +A repo can either be the main remote repository stored in AWS, or a local repository +with your own models. +""" + +from hashlib import sha256 +from pathlib import Path +import typing as tp + +import torch +import yaml + +from .apply import BagOfModels, Model +from .states import load_model + + +AnyModel = tp.Union[Model, BagOfModels] + + +class ModelLoadingError(RuntimeError): + pass + + +def check_checksum(path: Path, checksum: str): + sha = sha256() + with open(path, 'rb') as file: + while True: + buf = file.read(2**20) + if not buf: + break + sha.update(buf) + actual_checksum = sha.hexdigest()[:len(checksum)] + if actual_checksum != checksum: + raise ModelLoadingError(f'Invalid checksum for file {path}, ' + f'expected {checksum} but got {actual_checksum}') + +class ModelOnlyRepo: + """Base class for all model only repos. + """ + def has_model(self, sig: str) -> bool: + raise NotImplementedError() + + def get_model(self, sig: str) -> Model: + raise NotImplementedError() + + +class RemoteRepo(ModelOnlyRepo): + def __init__(self, models: tp.Dict[str, str]): + self._models = models + + def has_model(self, sig: str) -> bool: + return sig in self._models + + def get_model(self, sig: str) -> Model: + try: + url = self._models[sig] + except KeyError: + raise ModelLoadingError(f'Could not find a pre-trained model with signature {sig}.') + pkg = torch.hub.load_state_dict_from_url(url, map_location='cpu', check_hash=True) + return load_model(pkg) + + +class LocalRepo(ModelOnlyRepo): + def __init__(self, root: Path): + self.root = root + self.scan() + + def scan(self): + self._models = {} + self._checksums = {} + for file in self.root.iterdir(): + if file.suffix == '.th': + if '-' in file.stem: + xp_sig, checksum = file.stem.split('-') + self._checksums[xp_sig] = checksum + else: + xp_sig = file.stem + if xp_sig in self._models: + print('Whats xp? ', xp_sig) + raise ModelLoadingError( + f'Duplicate pre-trained model exist for signature {xp_sig}. ' + 'Please delete all but one.') + self._models[xp_sig] = file + + def has_model(self, sig: str) -> bool: + return sig in self._models + + def get_model(self, sig: str) -> Model: + try: + file = self._models[sig] + except KeyError: + raise ModelLoadingError(f'Could not find pre-trained model with signature {sig}.') + if sig in self._checksums: + check_checksum(file, self._checksums[sig]) + return load_model(file) + + +class BagOnlyRepo: + """Handles only YAML files containing bag of models, leaving the actual + model loading to some Repo. 
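`LocalRepo` above encodes a truncated sha256 in the file name (`<signature>-<checksum>.th`) and `check_checksum` only compares that prefix against the hash of the file contents. A tiny deterministic illustration; bytes, signature and name are made up:

from hashlib import sha256
from pathlib import Path

data = b"pretend model bytes"                       # stands in for the checkpoint contents
checksum = sha256(data).hexdigest()[:8]
path = Path(f"aaaaaaaa-{checksum}.th")              # placeholder signature

xp_sig, embedded = path.stem.split("-")             # how LocalRepo.scan splits the name
assert sha256(data).hexdigest()[:len(embedded)] == embedded   # check_checksum's test
print(xp_sig, embedded)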
+ """ + def __init__(self, root: Path, model_repo: ModelOnlyRepo): + self.root = root + self.model_repo = model_repo + self.scan() + + def scan(self): + self._bags = {} + for file in self.root.iterdir(): + if file.suffix == '.yaml': + self._bags[file.stem] = file + + def has_model(self, name: str) -> bool: + return name in self._bags + + def get_model(self, name: str) -> BagOfModels: + try: + yaml_file = self._bags[name] + except KeyError: + raise ModelLoadingError(f'{name} is neither a single pre-trained model or ' + 'a bag of models.') + bag = yaml.safe_load(open(yaml_file)) + signatures = bag['models'] + models = [self.model_repo.get_model(sig) for sig in signatures] + weights = bag.get('weights') + segment = bag.get('segment') + return BagOfModels(models, weights, segment) + + +class AnyModelRepo: + def __init__(self, model_repo: ModelOnlyRepo, bag_repo: BagOnlyRepo): + self.model_repo = model_repo + self.bag_repo = bag_repo + + def has_model(self, name_or_sig: str) -> bool: + return self.model_repo.has_model(name_or_sig) or self.bag_repo.has_model(name_or_sig) + + def get_model(self, name_or_sig: str) -> AnyModel: + print('name_or_sig: ', name_or_sig) + if self.model_repo.has_model(name_or_sig): + return self.model_repo.get_model(name_or_sig) + else: + return self.bag_repo.get_model(name_or_sig) diff --git a/audio_separator/separator/uvr_lib_v5/demucs/spec.py b/audio_separator/separator/uvr_lib_v5/demucs/spec.py new file mode 100644 index 0000000..9c79cb6 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/demucs/spec.py @@ -0,0 +1,53 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +"""Conveniance wrapper to perform STFT and iSTFT""" + +import torch as th + + +def spectro(x, n_fft=512, hop_length=None, pad=0): + *other, length = x.shape + x = x.reshape(-1, length) + + device_type = x.device.type + is_other_gpu = not device_type in ["cuda", "cpu"] + + if is_other_gpu: + x = x.cpu() + z = th.stft(x, + n_fft * (1 + pad), + hop_length or n_fft // 4, + window=th.hann_window(n_fft).to(x), + win_length=n_fft, + normalized=True, + center=True, + return_complex=True, + pad_mode='reflect') + _, freqs, frame = z.shape + return z.view(*other, freqs, frame) + + +def ispectro(z, hop_length=None, length=None, pad=0): + *other, freqs, frames = z.shape + n_fft = 2 * freqs - 2 + z = z.view(-1, freqs, frames) + win_length = n_fft // (1 + pad) + + device_type = z.device.type + is_other_gpu = not device_type in ["cuda", "cpu"] + + if is_other_gpu: + z = z.cpu() + x = th.istft(z, + n_fft, + hop_length, + window=th.hann_window(win_length).to(z.real), + win_length=win_length, + normalized=True, + length=length, + center=True) + _, length = x.shape + return x.view(*other, length) diff --git a/audio_separator/separator/uvr_lib_v5/demucs/states.py b/audio_separator/separator/uvr_lib_v5/demucs/states.py new file mode 100644 index 0000000..db17a18 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/demucs/states.py @@ -0,0 +1,148 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +""" +Utilities to save and load models. 
+""" +from contextlib import contextmanager + +import functools +import hashlib +import inspect +import io +from pathlib import Path +import warnings + +from omegaconf import OmegaConf +from diffq import DiffQuantizer, UniformQuantizer, restore_quantized_state +import torch + + +def get_quantizer(model, args, optimizer=None): + """Return the quantizer given the XP quantization args.""" + quantizer = None + if args.diffq: + quantizer = DiffQuantizer( + model, min_size=args.min_size, group_size=args.group_size) + if optimizer is not None: + quantizer.setup_optimizer(optimizer) + elif args.qat: + quantizer = UniformQuantizer( + model, bits=args.qat, min_size=args.min_size) + return quantizer + + +def load_model(path_or_package, strict=False): + """Load a model from the given serialized model, either given as a dict (already loaded) + or a path to a file on disk.""" + if isinstance(path_or_package, dict): + package = path_or_package + elif isinstance(path_or_package, (str, Path)): + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + path = path_or_package + package = torch.load(path, 'cpu') + else: + raise ValueError(f"Invalid type for {path_or_package}.") + + klass = package["klass"] + args = package["args"] + kwargs = package["kwargs"] + + if strict: + model = klass(*args, **kwargs) + else: + sig = inspect.signature(klass) + for key in list(kwargs): + if key not in sig.parameters: + warnings.warn("Dropping inexistant parameter " + key) + del kwargs[key] + model = klass(*args, **kwargs) + + state = package["state"] + + set_state(model, state) + return model + + +def get_state(model, quantizer, half=False): + """Get the state from a model, potentially with quantization applied. + If `half` is True, model are stored as half precision, which shouldn't impact performance + but half the state size.""" + if quantizer is None: + dtype = torch.half if half else None + state = {k: p.data.to(device='cpu', dtype=dtype) for k, p in model.state_dict().items()} + else: + state = quantizer.get_quantized_state() + state['__quantized'] = True + return state + + +def set_state(model, state, quantizer=None): + """Set the state on a given model.""" + if state.get('__quantized'): + if quantizer is not None: + quantizer.restore_quantized_state(model, state['quantized']) + else: + restore_quantized_state(model, state) + else: + model.load_state_dict(state) + return state + + +def save_with_checksum(content, path): + """Save the given value on disk, along with a sha256 hash. 
+ Should be used with the output of either `serialize_model` or `get_state`.""" + buf = io.BytesIO() + torch.save(content, buf) + sig = hashlib.sha256(buf.getvalue()).hexdigest()[:8] + + path = path.parent / (path.stem + "-" + sig + path.suffix) + path.write_bytes(buf.getvalue()) + + +def serialize_model(model, training_args, quantizer=None, half=True): + args, kwargs = model._init_args_kwargs + klass = model.__class__ + + state = get_state(model, quantizer, half) + return { + 'klass': klass, + 'args': args, + 'kwargs': kwargs, + 'state': state, + 'training_args': OmegaConf.to_container(training_args, resolve=True), + } + + +def copy_state(state): + return {k: v.cpu().clone() for k, v in state.items()} + + +@contextmanager +def swap_state(model, state): + """ + Context manager that swaps the state of a model, e.g: + + # model is in old state + with swap_state(model, new_state): + # model in new state + # model back to old state + """ + old_state = copy_state(model.state_dict()) + model.load_state_dict(state, strict=False) + try: + yield + finally: + model.load_state_dict(old_state) + + +def capture_init(init): + @functools.wraps(init) + def __init__(self, *args, **kwargs): + self._init_args_kwargs = (args, kwargs) + init(self, *args, **kwargs) + + return __init__ diff --git a/audio_separator/separator/uvr_lib_v5/demucs/tasnet.py b/audio_separator/separator/uvr_lib_v5/demucs/tasnet.py new file mode 100644 index 0000000..9cb7a95 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/demucs/tasnet.py @@ -0,0 +1,447 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# +# Created on 2018/12 +# Author: Kaituo XU +# Modified on 2019/11 by Alexandre Defossez, added support for multiple output channels +# Here is the original license: +# The MIT License (MIT) +# +# Copyright (c) 2018 Kaituo XU +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
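`capture_init` at the end of states.py is what makes the package format above possible: it stashes every constructor call's arguments on the instance so `serialize_model` can rebuild the object later. A quick standalone demonstration:

import functools

def capture_init(init):                              # same decorator as above
    @functools.wraps(init)
    def __init__(self, *args, **kwargs):
        self._init_args_kwargs = (args, kwargs)
        init(self, *args, **kwargs)
    return __init__

class Dummy:
    @capture_init
    def __init__(self, a, b=2):
        self.a, self.b = a, b

d = Dummy(1, b=3)
print(d._init_args_kwargs)                           # ((1,), {'b': 3})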
+ +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .utils import capture_init + +EPS = 1e-8 + + +def overlap_and_add(signal, frame_step): + outer_dimensions = signal.size()[:-2] + frames, frame_length = signal.size()[-2:] + + subframe_length = math.gcd(frame_length, frame_step) # gcd=Greatest Common Divisor + subframe_step = frame_step // subframe_length + subframes_per_frame = frame_length // subframe_length + output_size = frame_step * (frames - 1) + frame_length + output_subframes = output_size // subframe_length + + subframe_signal = signal.view(*outer_dimensions, -1, subframe_length) + + frame = torch.arange(0, output_subframes, + device=signal.device).unfold(0, subframes_per_frame, subframe_step) + frame = frame.long() # signal may in GPU or CPU + frame = frame.contiguous().view(-1) + + result = signal.new_zeros(*outer_dimensions, output_subframes, subframe_length) + result.index_add_(-2, frame, subframe_signal) + result = result.view(*outer_dimensions, -1) + return result + + +class ConvTasNet(nn.Module): + @capture_init + def __init__(self, + N=256, + L=20, + B=256, + H=512, + P=3, + X=8, + R=4, + C=4, + audio_channels=1, + samplerate=44100, + norm_type="gLN", + causal=False, + mask_nonlinear='relu'): + """ + Args: + N: Number of filters in autoencoder + L: Length of the filters (in samples) + B: Number of channels in bottleneck 1 × 1-conv block + H: Number of channels in convolutional blocks + P: Kernel size in convolutional blocks + X: Number of convolutional blocks in each repeat + R: Number of repeats + C: Number of speakers + norm_type: BN, gLN, cLN + causal: causal or non-causal + mask_nonlinear: use which non-linear function to generate mask + """ + super(ConvTasNet, self).__init__() + # Hyper-parameter + self.N, self.L, self.B, self.H, self.P, self.X, self.R, self.C = N, L, B, H, P, X, R, C + self.norm_type = norm_type + self.causal = causal + self.mask_nonlinear = mask_nonlinear + self.audio_channels = audio_channels + self.samplerate = samplerate + # Components + self.encoder = Encoder(L, N, audio_channels) + self.separator = TemporalConvNet(N, B, H, P, X, R, C, norm_type, causal, mask_nonlinear) + self.decoder = Decoder(N, L, audio_channels) + # init + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_normal_(p) + + def valid_length(self, length): + return length + + def forward(self, mixture): + """ + Args: + mixture: [M, T], M is batch size, T is #samples + Returns: + est_source: [M, C, T] + """ + mixture_w = self.encoder(mixture) + est_mask = self.separator(mixture_w) + est_source = self.decoder(mixture_w, est_mask) + + # T changed after conv1d in encoder, fix it here + T_origin = mixture.size(-1) + T_conv = est_source.size(-1) + est_source = F.pad(est_source, (0, T_origin - T_conv)) + return est_source + + +class Encoder(nn.Module): + """Estimation of the nonnegative mixture weight by a 1-D conv layer. 
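`overlap_and_add` above reconstructs a signal from overlapping frames with an index_add_ trick. A naive loop makes a handy reference to compare the vectorized version against (toy frame sizes, not from the patch):

import torch

def naive_overlap_and_add(signal, frame_step):
    # signal: (..., frames, frame_length); same contract as overlap_and_add above
    frames, frame_length = signal.shape[-2:]
    out = signal.new_zeros(*signal.shape[:-2], frame_step * (frames - 1) + frame_length)
    for i in range(frames):
        out[..., i * frame_step:i * frame_step + frame_length] += signal[..., i, :]
    return out

signal = torch.randn(2, 5, 8)                        # (batch, frames, frame_length)
print(naive_overlap_and_add(signal, 4).shape)        # torch.Size([2, 24])
# torch.allclose(overlap_and_add(signal, 4), naive_overlap_and_add(signal, 4)) should hold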
+ """ + def __init__(self, L, N, audio_channels): + super(Encoder, self).__init__() + # Hyper-parameter + self.L, self.N = L, N + # Components + # 50% overlap + self.conv1d_U = nn.Conv1d(audio_channels, N, kernel_size=L, stride=L // 2, bias=False) + + def forward(self, mixture): + """ + Args: + mixture: [M, T], M is batch size, T is #samples + Returns: + mixture_w: [M, N, K], where K = (T-L)/(L/2)+1 = 2T/L-1 + """ + mixture_w = F.relu(self.conv1d_U(mixture)) # [M, N, K] + return mixture_w + + +class Decoder(nn.Module): + def __init__(self, N, L, audio_channels): + super(Decoder, self).__init__() + # Hyper-parameter + self.N, self.L = N, L + self.audio_channels = audio_channels + # Components + self.basis_signals = nn.Linear(N, audio_channels * L, bias=False) + + def forward(self, mixture_w, est_mask): + """ + Args: + mixture_w: [M, N, K] + est_mask: [M, C, N, K] + Returns: + est_source: [M, C, T] + """ + # D = W * M + source_w = torch.unsqueeze(mixture_w, 1) * est_mask # [M, C, N, K] + source_w = torch.transpose(source_w, 2, 3) # [M, C, K, N] + # S = DV + est_source = self.basis_signals(source_w) # [M, C, K, ac * L] + m, c, k, _ = est_source.size() + est_source = est_source.view(m, c, k, self.audio_channels, -1).transpose(2, 3).contiguous() + est_source = overlap_and_add(est_source, self.L // 2) # M x C x ac x T + return est_source + + +class TemporalConvNet(nn.Module): + def __init__(self, N, B, H, P, X, R, C, norm_type="gLN", causal=False, mask_nonlinear='relu'): + """ + Args: + N: Number of filters in autoencoder + B: Number of channels in bottleneck 1 × 1-conv block + H: Number of channels in convolutional blocks + P: Kernel size in convolutional blocks + X: Number of convolutional blocks in each repeat + R: Number of repeats + C: Number of speakers + norm_type: BN, gLN, cLN + causal: causal or non-causal + mask_nonlinear: use which non-linear function to generate mask + """ + super(TemporalConvNet, self).__init__() + # Hyper-parameter + self.C = C + self.mask_nonlinear = mask_nonlinear + # Components + # [M, N, K] -> [M, N, K] + layer_norm = ChannelwiseLayerNorm(N) + # [M, N, K] -> [M, B, K] + bottleneck_conv1x1 = nn.Conv1d(N, B, 1, bias=False) + # [M, B, K] -> [M, B, K] + repeats = [] + for r in range(R): + blocks = [] + for x in range(X): + dilation = 2**x + padding = (P - 1) * dilation if causal else (P - 1) * dilation // 2 + blocks += [ + TemporalBlock(B, + H, + P, + stride=1, + padding=padding, + dilation=dilation, + norm_type=norm_type, + causal=causal) + ] + repeats += [nn.Sequential(*blocks)] + temporal_conv_net = nn.Sequential(*repeats) + # [M, B, K] -> [M, C*N, K] + mask_conv1x1 = nn.Conv1d(B, C * N, 1, bias=False) + # Put together + self.network = nn.Sequential(layer_norm, bottleneck_conv1x1, temporal_conv_net, + mask_conv1x1) + + def forward(self, mixture_w): + """ + Keep this API same with TasNet + Args: + mixture_w: [M, N, K], M is batch size + returns: + est_mask: [M, C, N, K] + """ + M, N, K = mixture_w.size() + score = self.network(mixture_w) # [M, N, K] -> [M, C*N, K] + score = score.view(M, self.C, N, K) # [M, C*N, K] -> [M, C, N, K] + if self.mask_nonlinear == 'softmax': + est_mask = F.softmax(score, dim=1) + elif self.mask_nonlinear == 'relu': + est_mask = F.relu(score) + else: + raise ValueError("Unsupported mask non-linear function") + return est_mask + + +class TemporalBlock(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + norm_type="gLN", + causal=False): + super(TemporalBlock, self).__init__() + 
# [M, B, K] -> [M, H, K] + conv1x1 = nn.Conv1d(in_channels, out_channels, 1, bias=False) + prelu = nn.PReLU() + norm = chose_norm(norm_type, out_channels) + # [M, H, K] -> [M, B, K] + dsconv = DepthwiseSeparableConv(out_channels, in_channels, kernel_size, stride, padding, + dilation, norm_type, causal) + # Put together + self.net = nn.Sequential(conv1x1, prelu, norm, dsconv) + + def forward(self, x): + """ + Args: + x: [M, B, K] + Returns: + [M, B, K] + """ + residual = x + out = self.net(x) + # TODO: when P = 3 here works fine, but when P = 2 maybe need to pad? + return out + residual # look like w/o F.relu is better than w/ F.relu + # return F.relu(out + residual) + + +class DepthwiseSeparableConv(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + norm_type="gLN", + causal=False): + super(DepthwiseSeparableConv, self).__init__() + # Use `groups` option to implement depthwise convolution + # [M, H, K] -> [M, H, K] + depthwise_conv = nn.Conv1d(in_channels, + in_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=in_channels, + bias=False) + if causal: + chomp = Chomp1d(padding) + prelu = nn.PReLU() + norm = chose_norm(norm_type, in_channels) + # [M, H, K] -> [M, B, K] + pointwise_conv = nn.Conv1d(in_channels, out_channels, 1, bias=False) + # Put together + if causal: + self.net = nn.Sequential(depthwise_conv, chomp, prelu, norm, pointwise_conv) + else: + self.net = nn.Sequential(depthwise_conv, prelu, norm, pointwise_conv) + + def forward(self, x): + """ + Args: + x: [M, H, K] + Returns: + result: [M, B, K] + """ + return self.net(x) + + +class Chomp1d(nn.Module): + """To ensure the output length is the same as the input. + """ + def __init__(self, chomp_size): + super(Chomp1d, self).__init__() + self.chomp_size = chomp_size + + def forward(self, x): + """ + Args: + x: [M, H, Kpad] + Returns: + [M, H, K] + """ + return x[:, :, :-self.chomp_size].contiguous() + + +def chose_norm(norm_type, channel_size): + """The input of normlization will be (M, C, K), where M is batch size, + C is channel size and K is sequence length. + """ + if norm_type == "gLN": + return GlobalLayerNorm(channel_size) + elif norm_type == "cLN": + return ChannelwiseLayerNorm(channel_size) + elif norm_type == "id": + return nn.Identity() + else: # norm_type == "BN": + # Given input (M, C, K), nn.BatchNorm1d(C) will accumulate statics + # along M and K, so this BN usage is right. 
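In the causal branch above, the depthwise convolution pads by (P - 1) * dilation and `Chomp1d` then trims the same amount from the right, so the output keeps the input length while each frame depends only on current and past samples. A minimal check with toy sizes:

import torch
from torch import nn

P, dilation, channels, time = 3, 4, 8, 100           # toy sizes
padding = (P - 1) * dilation
depthwise = nn.Conv1d(channels, channels, P, padding=padding,
                      dilation=dilation, groups=channels, bias=False)
x = torch.randn(1, channels, time)
y = depthwise(x)[:, :, :-padding]                    # what Chomp1d removes
print(y.shape)                                       # torch.Size([1, 8, 100])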
+ return nn.BatchNorm1d(channel_size) + + +# TODO: Use nn.LayerNorm to impl cLN to speed up +class ChannelwiseLayerNorm(nn.Module): + """Channel-wise Layer Normalization (cLN)""" + def __init__(self, channel_size): + super(ChannelwiseLayerNorm, self).__init__() + self.gamma = nn.Parameter(torch.Tensor(1, channel_size, 1)) # [1, N, 1] + self.beta = nn.Parameter(torch.Tensor(1, channel_size, 1)) # [1, N, 1] + self.reset_parameters() + + def reset_parameters(self): + self.gamma.data.fill_(1) + self.beta.data.zero_() + + def forward(self, y): + """ + Args: + y: [M, N, K], M is batch size, N is channel size, K is length + Returns: + cLN_y: [M, N, K] + """ + mean = torch.mean(y, dim=1, keepdim=True) # [M, 1, K] + var = torch.var(y, dim=1, keepdim=True, unbiased=False) # [M, 1, K] + cLN_y = self.gamma * (y - mean) / torch.pow(var + EPS, 0.5) + self.beta + return cLN_y + + +class GlobalLayerNorm(nn.Module): + """Global Layer Normalization (gLN)""" + def __init__(self, channel_size): + super(GlobalLayerNorm, self).__init__() + self.gamma = nn.Parameter(torch.Tensor(1, channel_size, 1)) # [1, N, 1] + self.beta = nn.Parameter(torch.Tensor(1, channel_size, 1)) # [1, N, 1] + self.reset_parameters() + + def reset_parameters(self): + self.gamma.data.fill_(1) + self.beta.data.zero_() + + def forward(self, y): + """ + Args: + y: [M, N, K], M is batch size, N is channel size, K is length + Returns: + gLN_y: [M, N, K] + """ + # TODO: in torch 1.0, torch.mean() support dim list + mean = y.mean(dim=1, keepdim=True).mean(dim=2, keepdim=True) # [M, 1, 1] + var = (torch.pow(y - mean, 2)).mean(dim=1, keepdim=True).mean(dim=2, keepdim=True) + gLN_y = self.gamma * (y - mean) / torch.pow(var + EPS, 0.5) + self.beta + return gLN_y + + +if __name__ == "__main__": + torch.manual_seed(123) + M, N, L, T = 2, 3, 4, 12 + K = 2 * T // L - 1 + B, H, P, X, R, C, norm_type, causal = 2, 3, 3, 3, 2, 2, "gLN", False + mixture = torch.randint(3, (M, T)) + # test Encoder + encoder = Encoder(L, N) + encoder.conv1d_U.weight.data = torch.randint(2, encoder.conv1d_U.weight.size()) + mixture_w = encoder(mixture) + print('mixture', mixture) + print('U', encoder.conv1d_U.weight) + print('mixture_w', mixture_w) + print('mixture_w size', mixture_w.size()) + + # test TemporalConvNet + separator = TemporalConvNet(N, B, H, P, X, R, C, norm_type=norm_type, causal=causal) + est_mask = separator(mixture_w) + print('est_mask', est_mask) + + # test Decoder + decoder = Decoder(N, L) + est_mask = torch.randint(2, (B, K, C, N)) + est_source = decoder(mixture_w, est_mask) + print('est_source', est_source) + + # test Conv-TasNet + conv_tasnet = ConvTasNet(N, L, B, H, P, X, R, C, norm_type=norm_type) + est_source = conv_tasnet(mixture) + print('est_source', est_source) + print('est_source size', est_source.size()) diff --git a/audio_separator/separator/uvr_lib_v5/demucs/tasnet_v2.py b/audio_separator/separator/uvr_lib_v5/demucs/tasnet_v2.py new file mode 100644 index 0000000..ecc1257 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/demucs/tasnet_v2.py @@ -0,0 +1,452 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+# +# Created on 2018/12 +# Author: Kaituo XU +# Modified on 2019/11 by Alexandre Defossez, added support for multiple output channels +# Here is the original license: +# The MIT License (MIT) +# +# Copyright (c) 2018 Kaituo XU +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .utils import capture_init + +EPS = 1e-8 + + +def overlap_and_add(signal, frame_step): + outer_dimensions = signal.size()[:-2] + frames, frame_length = signal.size()[-2:] + + subframe_length = math.gcd(frame_length, frame_step) # gcd=Greatest Common Divisor + subframe_step = frame_step // subframe_length + subframes_per_frame = frame_length // subframe_length + output_size = frame_step * (frames - 1) + frame_length + output_subframes = output_size // subframe_length + + subframe_signal = signal.view(*outer_dimensions, -1, subframe_length) + + frame = torch.arange(0, output_subframes, + device=signal.device).unfold(0, subframes_per_frame, subframe_step) + frame = frame.long() # signal may in GPU or CPU + frame = frame.contiguous().view(-1) + + result = signal.new_zeros(*outer_dimensions, output_subframes, subframe_length) + result.index_add_(-2, frame, subframe_signal) + result = result.view(*outer_dimensions, -1) + return result + + +class ConvTasNet(nn.Module): + @capture_init + def __init__(self, + sources, + N=256, + L=20, + B=256, + H=512, + P=3, + X=8, + R=4, + audio_channels=2, + norm_type="gLN", + causal=False, + mask_nonlinear='relu', + samplerate=44100, + segment_length=44100 * 2 * 4): + """ + Args: + sources: list of sources + N: Number of filters in autoencoder + L: Length of the filters (in samples) + B: Number of channels in bottleneck 1 × 1-conv block + H: Number of channels in convolutional blocks + P: Kernel size in convolutional blocks + X: Number of convolutional blocks in each repeat + R: Number of repeats + norm_type: BN, gLN, cLN + causal: causal or non-causal + mask_nonlinear: use which non-linear function to generate mask + """ + super(ConvTasNet, self).__init__() + # Hyper-parameter + self.sources = sources + self.C = len(sources) + self.N, self.L, self.B, self.H, self.P, self.X, self.R = N, L, B, H, P, X, R + self.norm_type = norm_type + self.causal = causal + self.mask_nonlinear = mask_nonlinear + self.audio_channels = audio_channels + self.samplerate = samplerate + self.segment_length = segment_length + # Components + self.encoder = Encoder(L, N, audio_channels) + self.separator = 
TemporalConvNet( + N, B, H, P, X, R, self.C, norm_type, causal, mask_nonlinear) + self.decoder = Decoder(N, L, audio_channels) + # init + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_normal_(p) + + def valid_length(self, length): + return length + + def forward(self, mixture): + """ + Args: + mixture: [M, T], M is batch size, T is #samples + Returns: + est_source: [M, C, T] + """ + mixture_w = self.encoder(mixture) + est_mask = self.separator(mixture_w) + est_source = self.decoder(mixture_w, est_mask) + + # T changed after conv1d in encoder, fix it here + T_origin = mixture.size(-1) + T_conv = est_source.size(-1) + est_source = F.pad(est_source, (0, T_origin - T_conv)) + return est_source + + +class Encoder(nn.Module): + """Estimation of the nonnegative mixture weight by a 1-D conv layer. + """ + def __init__(self, L, N, audio_channels): + super(Encoder, self).__init__() + # Hyper-parameter + self.L, self.N = L, N + # Components + # 50% overlap + self.conv1d_U = nn.Conv1d(audio_channels, N, kernel_size=L, stride=L // 2, bias=False) + + def forward(self, mixture): + """ + Args: + mixture: [M, T], M is batch size, T is #samples + Returns: + mixture_w: [M, N, K], where K = (T-L)/(L/2)+1 = 2T/L-1 + """ + mixture_w = F.relu(self.conv1d_U(mixture)) # [M, N, K] + return mixture_w + + +class Decoder(nn.Module): + def __init__(self, N, L, audio_channels): + super(Decoder, self).__init__() + # Hyper-parameter + self.N, self.L = N, L + self.audio_channels = audio_channels + # Components + self.basis_signals = nn.Linear(N, audio_channels * L, bias=False) + + def forward(self, mixture_w, est_mask): + """ + Args: + mixture_w: [M, N, K] + est_mask: [M, C, N, K] + Returns: + est_source: [M, C, T] + """ + # D = W * M + source_w = torch.unsqueeze(mixture_w, 1) * est_mask # [M, C, N, K] + source_w = torch.transpose(source_w, 2, 3) # [M, C, K, N] + # S = DV + est_source = self.basis_signals(source_w) # [M, C, K, ac * L] + m, c, k, _ = est_source.size() + est_source = est_source.view(m, c, k, self.audio_channels, -1).transpose(2, 3).contiguous() + est_source = overlap_and_add(est_source, self.L // 2) # M x C x ac x T + return est_source + + +class TemporalConvNet(nn.Module): + def __init__(self, N, B, H, P, X, R, C, norm_type="gLN", causal=False, mask_nonlinear='relu'): + """ + Args: + N: Number of filters in autoencoder + B: Number of channels in bottleneck 1 × 1-conv block + H: Number of channels in convolutional blocks + P: Kernel size in convolutional blocks + X: Number of convolutional blocks in each repeat + R: Number of repeats + C: Number of speakers + norm_type: BN, gLN, cLN + causal: causal or non-causal + mask_nonlinear: use which non-linear function to generate mask + """ + super(TemporalConvNet, self).__init__() + # Hyper-parameter + self.C = C + self.mask_nonlinear = mask_nonlinear + # Components + # [M, N, K] -> [M, N, K] + layer_norm = ChannelwiseLayerNorm(N) + # [M, N, K] -> [M, B, K] + bottleneck_conv1x1 = nn.Conv1d(N, B, 1, bias=False) + # [M, B, K] -> [M, B, K] + repeats = [] + for r in range(R): + blocks = [] + for x in range(X): + dilation = 2**x + padding = (P - 1) * dilation if causal else (P - 1) * dilation // 2 + blocks += [ + TemporalBlock(B, + H, + P, + stride=1, + padding=padding, + dilation=dilation, + norm_type=norm_type, + causal=causal) + ] + repeats += [nn.Sequential(*blocks)] + temporal_conv_net = nn.Sequential(*repeats) + # [M, B, K] -> [M, C*N, K] + mask_conv1x1 = nn.Conv1d(B, C * N, 1, bias=False) + # Put together + self.network = 
nn.Sequential(layer_norm, bottleneck_conv1x1, temporal_conv_net, + mask_conv1x1) + + def forward(self, mixture_w): + """ + Keep this API same with TasNet + Args: + mixture_w: [M, N, K], M is batch size + returns: + est_mask: [M, C, N, K] + """ + M, N, K = mixture_w.size() + score = self.network(mixture_w) # [M, N, K] -> [M, C*N, K] + score = score.view(M, self.C, N, K) # [M, C*N, K] -> [M, C, N, K] + if self.mask_nonlinear == 'softmax': + est_mask = F.softmax(score, dim=1) + elif self.mask_nonlinear == 'relu': + est_mask = F.relu(score) + else: + raise ValueError("Unsupported mask non-linear function") + return est_mask + + +class TemporalBlock(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + norm_type="gLN", + causal=False): + super(TemporalBlock, self).__init__() + # [M, B, K] -> [M, H, K] + conv1x1 = nn.Conv1d(in_channels, out_channels, 1, bias=False) + prelu = nn.PReLU() + norm = chose_norm(norm_type, out_channels) + # [M, H, K] -> [M, B, K] + dsconv = DepthwiseSeparableConv(out_channels, in_channels, kernel_size, stride, padding, + dilation, norm_type, causal) + # Put together + self.net = nn.Sequential(conv1x1, prelu, norm, dsconv) + + def forward(self, x): + """ + Args: + x: [M, B, K] + Returns: + [M, B, K] + """ + residual = x + out = self.net(x) + # TODO: when P = 3 here works fine, but when P = 2 maybe need to pad? + return out + residual # look like w/o F.relu is better than w/ F.relu + # return F.relu(out + residual) + + +class DepthwiseSeparableConv(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + norm_type="gLN", + causal=False): + super(DepthwiseSeparableConv, self).__init__() + # Use `groups` option to implement depthwise convolution + # [M, H, K] -> [M, H, K] + depthwise_conv = nn.Conv1d(in_channels, + in_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=in_channels, + bias=False) + if causal: + chomp = Chomp1d(padding) + prelu = nn.PReLU() + norm = chose_norm(norm_type, in_channels) + # [M, H, K] -> [M, B, K] + pointwise_conv = nn.Conv1d(in_channels, out_channels, 1, bias=False) + # Put together + if causal: + self.net = nn.Sequential(depthwise_conv, chomp, prelu, norm, pointwise_conv) + else: + self.net = nn.Sequential(depthwise_conv, prelu, norm, pointwise_conv) + + def forward(self, x): + """ + Args: + x: [M, H, K] + Returns: + result: [M, B, K] + """ + return self.net(x) + + +class Chomp1d(nn.Module): + """To ensure the output length is the same as the input. + """ + def __init__(self, chomp_size): + super(Chomp1d, self).__init__() + self.chomp_size = chomp_size + + def forward(self, x): + """ + Args: + x: [M, H, Kpad] + Returns: + [M, H, K] + """ + return x[:, :, :-self.chomp_size].contiguous() + + +def chose_norm(norm_type, channel_size): + """The input of normlization will be (M, C, K), where M is batch size, + C is channel size and K is sequence length. + """ + if norm_type == "gLN": + return GlobalLayerNorm(channel_size) + elif norm_type == "cLN": + return ChannelwiseLayerNorm(channel_size) + elif norm_type == "id": + return nn.Identity() + else: # norm_type == "BN": + # Given input (M, C, K), nn.BatchNorm1d(C) will accumulate statics + # along M and K, so this BN usage is right. 
+ return nn.BatchNorm1d(channel_size) + + +# TODO: Use nn.LayerNorm to impl cLN to speed up +class ChannelwiseLayerNorm(nn.Module): + """Channel-wise Layer Normalization (cLN)""" + def __init__(self, channel_size): + super(ChannelwiseLayerNorm, self).__init__() + self.gamma = nn.Parameter(torch.Tensor(1, channel_size, 1)) # [1, N, 1] + self.beta = nn.Parameter(torch.Tensor(1, channel_size, 1)) # [1, N, 1] + self.reset_parameters() + + def reset_parameters(self): + self.gamma.data.fill_(1) + self.beta.data.zero_() + + def forward(self, y): + """ + Args: + y: [M, N, K], M is batch size, N is channel size, K is length + Returns: + cLN_y: [M, N, K] + """ + mean = torch.mean(y, dim=1, keepdim=True) # [M, 1, K] + var = torch.var(y, dim=1, keepdim=True, unbiased=False) # [M, 1, K] + cLN_y = self.gamma * (y - mean) / torch.pow(var + EPS, 0.5) + self.beta + return cLN_y + + +class GlobalLayerNorm(nn.Module): + """Global Layer Normalization (gLN)""" + def __init__(self, channel_size): + super(GlobalLayerNorm, self).__init__() + self.gamma = nn.Parameter(torch.Tensor(1, channel_size, 1)) # [1, N, 1] + self.beta = nn.Parameter(torch.Tensor(1, channel_size, 1)) # [1, N, 1] + self.reset_parameters() + + def reset_parameters(self): + self.gamma.data.fill_(1) + self.beta.data.zero_() + + def forward(self, y): + """ + Args: + y: [M, N, K], M is batch size, N is channel size, K is length + Returns: + gLN_y: [M, N, K] + """ + # TODO: in torch 1.0, torch.mean() support dim list + mean = y.mean(dim=1, keepdim=True).mean(dim=2, keepdim=True) # [M, 1, 1] + var = (torch.pow(y - mean, 2)).mean(dim=1, keepdim=True).mean(dim=2, keepdim=True) + gLN_y = self.gamma * (y - mean) / torch.pow(var + EPS, 0.5) + self.beta + return gLN_y + + +if __name__ == "__main__": + torch.manual_seed(123) + M, N, L, T = 2, 3, 4, 12 + K = 2 * T // L - 1 + B, H, P, X, R, C, norm_type, causal = 2, 3, 3, 3, 2, 2, "gLN", False + mixture = torch.randint(3, (M, T)) + # test Encoder + encoder = Encoder(L, N) + encoder.conv1d_U.weight.data = torch.randint(2, encoder.conv1d_U.weight.size()) + mixture_w = encoder(mixture) + print('mixture', mixture) + print('U', encoder.conv1d_U.weight) + print('mixture_w', mixture_w) + print('mixture_w size', mixture_w.size()) + + # test TemporalConvNet + separator = TemporalConvNet(N, B, H, P, X, R, C, norm_type=norm_type, causal=causal) + est_mask = separator(mixture_w) + print('est_mask', est_mask) + + # test Decoder + decoder = Decoder(N, L) + est_mask = torch.randint(2, (B, K, C, N)) + est_source = decoder(mixture_w, est_mask) + print('est_source', est_source) + + # test Conv-TasNet + conv_tasnet = ConvTasNet(N, L, B, H, P, X, R, C, norm_type=norm_type) + est_source = conv_tasnet(mixture) + print('est_source', est_source) + print('est_source size', est_source.size()) diff --git a/audio_separator/separator/uvr_lib_v5/demucs/transformer.py b/audio_separator/separator/uvr_lib_v5/demucs/transformer.py new file mode 100644 index 0000000..56a465b --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/demucs/transformer.py @@ -0,0 +1,839 @@ +# Copyright (c) 2019-present, Meta, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# First author is Simon Rouard. 
+ +import random +import typing as tp + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +import math +from einops import rearrange + + +def create_sin_embedding( + length: int, dim: int, shift: int = 0, device="cpu", max_period=10000 +): + # We aim for TBC format + assert dim % 2 == 0 + pos = shift + torch.arange(length, device=device).view(-1, 1, 1) + half_dim = dim // 2 + adim = torch.arange(dim // 2, device=device).view(1, 1, -1) + phase = pos / (max_period ** (adim / (half_dim - 1))) + return torch.cat( + [ + torch.cos(phase), + torch.sin(phase), + ], + dim=-1, + ) + + +def create_2d_sin_embedding(d_model, height, width, device="cpu", max_period=10000): + """ + :param d_model: dimension of the model + :param height: height of the positions + :param width: width of the positions + :return: d_model*height*width position matrix + """ + if d_model % 4 != 0: + raise ValueError( + "Cannot use sin/cos positional encoding with " + "odd dimension (got dim={:d})".format(d_model) + ) + pe = torch.zeros(d_model, height, width) + # Each dimension use half of d_model + d_model = int(d_model / 2) + div_term = torch.exp( + torch.arange(0.0, d_model, 2) * -(math.log(max_period) / d_model) + ) + pos_w = torch.arange(0.0, width).unsqueeze(1) + pos_h = torch.arange(0.0, height).unsqueeze(1) + pe[0:d_model:2, :, :] = ( + torch.sin(pos_w * div_term).transpose(0, 1).unsqueeze(1).repeat(1, height, 1) + ) + pe[1:d_model:2, :, :] = ( + torch.cos(pos_w * div_term).transpose(0, 1).unsqueeze(1).repeat(1, height, 1) + ) + pe[d_model::2, :, :] = ( + torch.sin(pos_h * div_term).transpose(0, 1).unsqueeze(2).repeat(1, 1, width) + ) + pe[d_model + 1:: 2, :, :] = ( + torch.cos(pos_h * div_term).transpose(0, 1).unsqueeze(2).repeat(1, 1, width) + ) + + return pe[None, :].to(device) + + +def create_sin_embedding_cape( + length: int, + dim: int, + batch_size: int, + mean_normalize: bool, + augment: bool, # True during training + max_global_shift: float = 0.0, # delta max + max_local_shift: float = 0.0, # epsilon max + max_scale: float = 1.0, + device: str = "cpu", + max_period: float = 10000.0, +): + # We aim for TBC format + assert dim % 2 == 0 + pos = 1.0 * torch.arange(length).view(-1, 1, 1) # (length, 1, 1) + pos = pos.repeat(1, batch_size, 1) # (length, batch_size, 1) + if mean_normalize: + pos -= torch.nanmean(pos, dim=0, keepdim=True) + + if augment: + delta = np.random.uniform( + -max_global_shift, +max_global_shift, size=[1, batch_size, 1] + ) + delta_local = np.random.uniform( + -max_local_shift, +max_local_shift, size=[length, batch_size, 1] + ) + log_lambdas = np.random.uniform( + -np.log(max_scale), +np.log(max_scale), size=[1, batch_size, 1] + ) + pos = (pos + delta + delta_local) * np.exp(log_lambdas) + + pos = pos.to(device) + + half_dim = dim // 2 + adim = torch.arange(dim // 2, device=device).view(1, 1, -1) + phase = pos / (max_period ** (adim / (half_dim - 1))) + return torch.cat( + [ + torch.cos(phase), + torch.sin(phase), + ], + dim=-1, + ).float() + + +def get_causal_mask(length): + pos = torch.arange(length) + return pos > pos[:, None] + + +def get_elementary_mask( + T1, + T2, + mask_type, + sparse_attn_window, + global_window, + mask_random_seed, + sparsity, + device, +): + """ + When the input of the Decoder has length T1 and the output T2 + The mask matrix has shape (T2, T1) + """ + assert mask_type in ["diag", "jmask", "random", "global"] + + if mask_type == "global": + mask = torch.zeros(T2, T1, dtype=torch.bool) + mask[:, :global_window] = True + 
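+ # the first output rows (global_window rescaled by T2 / T1) also attend to every input position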
line_window = int(global_window * T2 / T1) + mask[:line_window, :] = True + + if mask_type == "diag": + + mask = torch.zeros(T2, T1, dtype=torch.bool) + rows = torch.arange(T2)[:, None] + cols = ( + (T1 / T2 * rows + torch.arange(-sparse_attn_window, sparse_attn_window + 1)) + .long() + .clamp(0, T1 - 1) + ) + mask.scatter_(1, cols, torch.ones(1, dtype=torch.bool).expand_as(cols)) + + elif mask_type == "jmask": + mask = torch.zeros(T2 + 2, T1 + 2, dtype=torch.bool) + rows = torch.arange(T2 + 2)[:, None] + t = torch.arange(0, int((2 * T1) ** 0.5 + 1)) + t = (t * (t + 1) / 2).int() + t = torch.cat([-t.flip(0)[:-1], t]) + cols = (T1 / T2 * rows + t).long().clamp(0, T1 + 1) + mask.scatter_(1, cols, torch.ones(1, dtype=torch.bool).expand_as(cols)) + mask = mask[1:-1, 1:-1] + + elif mask_type == "random": + gene = torch.Generator(device=device) + gene.manual_seed(mask_random_seed) + mask = ( + torch.rand(T1 * T2, generator=gene, device=device).reshape(T2, T1) + > sparsity + ) + + mask = mask.to(device) + return mask + + +def get_mask( + T1, + T2, + mask_type, + sparse_attn_window, + global_window, + mask_random_seed, + sparsity, + device, +): + """ + Return a SparseCSRTensor mask that is a combination of elementary masks + mask_type can be a combination of multiple masks: for instance "diag_jmask_random" + """ + from xformers.sparse import SparseCSRTensor + # create a list + mask_types = mask_type.split("_") + + all_masks = [ + get_elementary_mask( + T1, + T2, + mask, + sparse_attn_window, + global_window, + mask_random_seed, + sparsity, + device, + ) + for mask in mask_types + ] + + final_mask = torch.stack(all_masks).sum(axis=0) > 0 + + return SparseCSRTensor.from_dense(final_mask[None]) + + +class ScaledEmbedding(nn.Module): + def __init__( + self, + num_embeddings: int, + embedding_dim: int, + scale: float = 1.0, + boost: float = 3.0, + ): + super().__init__() + self.embedding = nn.Embedding(num_embeddings, embedding_dim) + self.embedding.weight.data *= scale / boost + self.boost = boost + + @property + def weight(self): + return self.embedding.weight * self.boost + + def forward(self, x): + return self.embedding(x) * self.boost + + +class LayerScale(nn.Module): + """Layer scale from [Touvron et al 2021] (https://arxiv.org/pdf/2103.17239.pdf). + This rescales diagonaly residual outputs close to 0 initially, then learnt. 
+ """ + + def __init__(self, channels: int, init: float = 0, channel_last=False): + """ + channel_last = False corresponds to (B, C, T) tensors + channel_last = True corresponds to (T, B, C) tensors + """ + super().__init__() + self.channel_last = channel_last + self.scale = nn.Parameter(torch.zeros(channels, requires_grad=True)) + self.scale.data[:] = init + + def forward(self, x): + if self.channel_last: + return self.scale * x + else: + return self.scale[:, None] * x + + +class MyGroupNorm(nn.GroupNorm): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def forward(self, x): + """ + x: (B, T, C) + if num_groups=1: Normalisation on all T and C together for each B + """ + x = x.transpose(1, 2) + return super().forward(x).transpose(1, 2) + + +class MyTransformerEncoderLayer(nn.TransformerEncoderLayer): + def __init__( + self, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation=F.relu, + group_norm=0, + norm_first=False, + norm_out=False, + layer_norm_eps=1e-5, + layer_scale=False, + init_values=1e-4, + device=None, + dtype=None, + sparse=False, + mask_type="diag", + mask_random_seed=42, + sparse_attn_window=500, + global_window=50, + auto_sparsity=False, + sparsity=0.95, + batch_first=False, + ): + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__( + d_model=d_model, + nhead=nhead, + dim_feedforward=dim_feedforward, + dropout=dropout, + activation=activation, + layer_norm_eps=layer_norm_eps, + batch_first=batch_first, + norm_first=norm_first, + device=device, + dtype=dtype, + ) + self.sparse = sparse + self.auto_sparsity = auto_sparsity + if sparse: + if not auto_sparsity: + self.mask_type = mask_type + self.sparse_attn_window = sparse_attn_window + self.global_window = global_window + self.sparsity = sparsity + if group_norm: + self.norm1 = MyGroupNorm(int(group_norm), d_model, eps=layer_norm_eps, **factory_kwargs) + self.norm2 = MyGroupNorm(int(group_norm), d_model, eps=layer_norm_eps, **factory_kwargs) + + self.norm_out = None + if self.norm_first & norm_out: + self.norm_out = MyGroupNorm(num_groups=int(norm_out), num_channels=d_model) + self.gamma_1 = ( + LayerScale(d_model, init_values, True) if layer_scale else nn.Identity() + ) + self.gamma_2 = ( + LayerScale(d_model, init_values, True) if layer_scale else nn.Identity() + ) + + if sparse: + self.self_attn = MultiheadAttention( + d_model, nhead, dropout=dropout, batch_first=batch_first, + auto_sparsity=sparsity if auto_sparsity else 0, + ) + self.__setattr__("src_mask", torch.zeros(1, 1)) + self.mask_random_seed = mask_random_seed + + def forward(self, src, src_mask=None, src_key_padding_mask=None): + """ + if batch_first = False, src shape is (T, B, C) + the case where batch_first=True is not covered + """ + device = src.device + x = src + T, B, C = x.shape + if self.sparse and not self.auto_sparsity: + assert src_mask is None + src_mask = self.src_mask + if src_mask.shape[-1] != T: + src_mask = get_mask( + T, + T, + self.mask_type, + self.sparse_attn_window, + self.global_window, + self.mask_random_seed, + self.sparsity, + device, + ) + self.__setattr__("src_mask", src_mask) + + if self.norm_first: + x = x + self.gamma_1( + self._sa_block(self.norm1(x), src_mask, src_key_padding_mask) + ) + x = x + self.gamma_2(self._ff_block(self.norm2(x))) + + if self.norm_out: + x = self.norm_out(x) + else: + x = self.norm1( + x + self.gamma_1(self._sa_block(x, src_mask, src_key_padding_mask)) + ) + x = self.norm2(x + self.gamma_2(self._ff_block(x))) + + return x + + +class 
CrossTransformerEncoderLayer(nn.Module): + def __init__( + self, + d_model: int, + nhead: int, + dim_feedforward: int = 2048, + dropout: float = 0.1, + activation=F.relu, + layer_norm_eps: float = 1e-5, + layer_scale: bool = False, + init_values: float = 1e-4, + norm_first: bool = False, + group_norm: bool = False, + norm_out: bool = False, + sparse=False, + mask_type="diag", + mask_random_seed=42, + sparse_attn_window=500, + global_window=50, + sparsity=0.95, + auto_sparsity=None, + device=None, + dtype=None, + batch_first=False, + ): + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + + self.sparse = sparse + self.auto_sparsity = auto_sparsity + if sparse: + if not auto_sparsity: + self.mask_type = mask_type + self.sparse_attn_window = sparse_attn_window + self.global_window = global_window + self.sparsity = sparsity + + self.cross_attn: nn.Module + self.cross_attn = nn.MultiheadAttention( + d_model, nhead, dropout=dropout, batch_first=batch_first) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward, **factory_kwargs) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model, **factory_kwargs) + + self.norm_first = norm_first + self.norm1: nn.Module + self.norm2: nn.Module + self.norm3: nn.Module + if group_norm: + self.norm1 = MyGroupNorm(int(group_norm), d_model, eps=layer_norm_eps, **factory_kwargs) + self.norm2 = MyGroupNorm(int(group_norm), d_model, eps=layer_norm_eps, **factory_kwargs) + self.norm3 = MyGroupNorm(int(group_norm), d_model, eps=layer_norm_eps, **factory_kwargs) + else: + self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs) + self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs) + self.norm3 = nn.LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs) + + self.norm_out = None + if self.norm_first & norm_out: + self.norm_out = MyGroupNorm(num_groups=int(norm_out), num_channels=d_model) + + self.gamma_1 = ( + LayerScale(d_model, init_values, True) if layer_scale else nn.Identity() + ) + self.gamma_2 = ( + LayerScale(d_model, init_values, True) if layer_scale else nn.Identity() + ) + + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + # Legacy string support for activation function. 
+ if isinstance(activation, str): + self.activation = self._get_activation_fn(activation) + else: + self.activation = activation + + if sparse: + self.cross_attn = MultiheadAttention( + d_model, nhead, dropout=dropout, batch_first=batch_first, + auto_sparsity=sparsity if auto_sparsity else 0) + if not auto_sparsity: + self.__setattr__("mask", torch.zeros(1, 1)) + self.mask_random_seed = mask_random_seed + + def forward(self, q, k, mask=None): + """ + Args: + q: tensor of shape (T, B, C) + k: tensor of shape (S, B, C) + mask: tensor of shape (T, S) + + """ + device = q.device + T, B, C = q.shape + S, B, C = k.shape + if self.sparse and not self.auto_sparsity: + assert mask is None + mask = self.mask + if mask.shape[-1] != S or mask.shape[-2] != T: + mask = get_mask( + S, + T, + self.mask_type, + self.sparse_attn_window, + self.global_window, + self.mask_random_seed, + self.sparsity, + device, + ) + self.__setattr__("mask", mask) + + if self.norm_first: + x = q + self.gamma_1(self._ca_block(self.norm1(q), self.norm2(k), mask)) + x = x + self.gamma_2(self._ff_block(self.norm3(x))) + if self.norm_out: + x = self.norm_out(x) + else: + x = self.norm1(q + self.gamma_1(self._ca_block(q, k, mask))) + x = self.norm2(x + self.gamma_2(self._ff_block(x))) + + return x + + # self-attention block + def _ca_block(self, q, k, attn_mask=None): + x = self.cross_attn(q, k, k, attn_mask=attn_mask, need_weights=False)[0] + return self.dropout1(x) + + # feed forward block + def _ff_block(self, x): + x = self.linear2(self.dropout(self.activation(self.linear1(x)))) + return self.dropout2(x) + + def _get_activation_fn(self, activation): + if activation == "relu": + return F.relu + elif activation == "gelu": + return F.gelu + + raise RuntimeError("activation should be relu/gelu, not {}".format(activation)) + + +# ----------------- MULTI-BLOCKS MODELS: ----------------------- + + +class CrossTransformerEncoder(nn.Module): + def __init__( + self, + dim: int, + emb: str = "sin", + hidden_scale: float = 4.0, + num_heads: int = 8, + num_layers: int = 6, + cross_first: bool = False, + dropout: float = 0.0, + max_positions: int = 1000, + norm_in: bool = True, + norm_in_group: bool = False, + group_norm: int = False, + norm_first: bool = False, + norm_out: bool = False, + max_period: float = 10000.0, + weight_decay: float = 0.0, + lr: tp.Optional[float] = None, + layer_scale: bool = False, + gelu: bool = True, + sin_random_shift: int = 0, + weight_pos_embed: float = 1.0, + cape_mean_normalize: bool = True, + cape_augment: bool = True, + cape_glob_loc_scale: list = [5000.0, 1.0, 1.4], + sparse_self_attn: bool = False, + sparse_cross_attn: bool = False, + mask_type: str = "diag", + mask_random_seed: int = 42, + sparse_attn_window: int = 500, + global_window: int = 50, + auto_sparsity: bool = False, + sparsity: float = 0.95, + ): + super().__init__() + """ + """ + assert dim % num_heads == 0 + + hidden_dim = int(dim * hidden_scale) + + self.num_layers = num_layers + # classic parity = 1 means that if idx%2 == 1 there is a + # classical encoder else there is a cross encoder + self.classic_parity = 1 if cross_first else 0 + self.emb = emb + self.max_period = max_period + self.weight_decay = weight_decay + self.weight_pos_embed = weight_pos_embed + self.sin_random_shift = sin_random_shift + if emb == "cape": + self.cape_mean_normalize = cape_mean_normalize + self.cape_augment = cape_augment + self.cape_glob_loc_scale = cape_glob_loc_scale + if emb == "scaled": + self.position_embeddings = ScaledEmbedding(max_positions, dim, 
scale=0.2) + + self.lr = lr + + activation: tp.Any = F.gelu if gelu else F.relu + + self.norm_in: nn.Module + self.norm_in_t: nn.Module + if norm_in: + self.norm_in = nn.LayerNorm(dim) + self.norm_in_t = nn.LayerNorm(dim) + elif norm_in_group: + self.norm_in = MyGroupNorm(int(norm_in_group), dim) + self.norm_in_t = MyGroupNorm(int(norm_in_group), dim) + else: + self.norm_in = nn.Identity() + self.norm_in_t = nn.Identity() + + # spectrogram layers + self.layers = nn.ModuleList() + # temporal layers + self.layers_t = nn.ModuleList() + + kwargs_common = { + "d_model": dim, + "nhead": num_heads, + "dim_feedforward": hidden_dim, + "dropout": dropout, + "activation": activation, + "group_norm": group_norm, + "norm_first": norm_first, + "norm_out": norm_out, + "layer_scale": layer_scale, + "mask_type": mask_type, + "mask_random_seed": mask_random_seed, + "sparse_attn_window": sparse_attn_window, + "global_window": global_window, + "sparsity": sparsity, + "auto_sparsity": auto_sparsity, + "batch_first": True, + } + + kwargs_classic_encoder = dict(kwargs_common) + kwargs_classic_encoder.update({ + "sparse": sparse_self_attn, + }) + kwargs_cross_encoder = dict(kwargs_common) + kwargs_cross_encoder.update({ + "sparse": sparse_cross_attn, + }) + + for idx in range(num_layers): + if idx % 2 == self.classic_parity: + + self.layers.append(MyTransformerEncoderLayer(**kwargs_classic_encoder)) + self.layers_t.append( + MyTransformerEncoderLayer(**kwargs_classic_encoder) + ) + + else: + self.layers.append(CrossTransformerEncoderLayer(**kwargs_cross_encoder)) + + self.layers_t.append( + CrossTransformerEncoderLayer(**kwargs_cross_encoder) + ) + + def forward(self, x, xt): + B, C, Fr, T1 = x.shape + pos_emb_2d = create_2d_sin_embedding( + C, Fr, T1, x.device, self.max_period + ) # (1, C, Fr, T1) + pos_emb_2d = rearrange(pos_emb_2d, "b c fr t1 -> b (t1 fr) c") + x = rearrange(x, "b c fr t1 -> b (t1 fr) c") + x = self.norm_in(x) + x = x + self.weight_pos_embed * pos_emb_2d + + B, C, T2 = xt.shape + xt = rearrange(xt, "b c t2 -> b t2 c") # now T2, B, C + pos_emb = self._get_pos_embedding(T2, B, C, x.device) + pos_emb = rearrange(pos_emb, "t2 b c -> b t2 c") + xt = self.norm_in_t(xt) + xt = xt + self.weight_pos_embed * pos_emb + + for idx in range(self.num_layers): + if idx % 2 == self.classic_parity: + x = self.layers[idx](x) + xt = self.layers_t[idx](xt) + else: + old_x = x + x = self.layers[idx](x, xt) + xt = self.layers_t[idx](xt, old_x) + + x = rearrange(x, "b (t1 fr) c -> b c fr t1", t1=T1) + xt = rearrange(xt, "b t2 c -> b c t2") + return x, xt + + def _get_pos_embedding(self, T, B, C, device): + if self.emb == "sin": + shift = random.randrange(self.sin_random_shift + 1) + pos_emb = create_sin_embedding( + T, C, shift=shift, device=device, max_period=self.max_period + ) + elif self.emb == "cape": + if self.training: + pos_emb = create_sin_embedding_cape( + T, + C, + B, + device=device, + max_period=self.max_period, + mean_normalize=self.cape_mean_normalize, + augment=self.cape_augment, + max_global_shift=self.cape_glob_loc_scale[0], + max_local_shift=self.cape_glob_loc_scale[1], + max_scale=self.cape_glob_loc_scale[2], + ) + else: + pos_emb = create_sin_embedding_cape( + T, + C, + B, + device=device, + max_period=self.max_period, + mean_normalize=self.cape_mean_normalize, + augment=False, + ) + + elif self.emb == "scaled": + pos = torch.arange(T, device=device) + pos_emb = self.position_embeddings(pos)[:, None] + + return pos_emb + + def make_optim_group(self): + group = {"params": list(self.parameters()), 
"weight_decay": self.weight_decay} + if self.lr is not None: + group["lr"] = self.lr + return group + + +# Attention Modules + + +class MultiheadAttention(nn.Module): + def __init__( + self, + embed_dim, + num_heads, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + kdim=None, + vdim=None, + batch_first=False, + auto_sparsity=None, + ): + super().__init__() + assert auto_sparsity is not None, "sanity check" + self.num_heads = num_heads + self.q = torch.nn.Linear(embed_dim, embed_dim, bias=bias) + self.k = torch.nn.Linear(embed_dim, embed_dim, bias=bias) + self.v = torch.nn.Linear(embed_dim, embed_dim, bias=bias) + self.attn_drop = torch.nn.Dropout(dropout) + self.proj = torch.nn.Linear(embed_dim, embed_dim, bias) + self.proj_drop = torch.nn.Dropout(dropout) + self.batch_first = batch_first + self.auto_sparsity = auto_sparsity + + def forward( + self, + query, + key, + value, + key_padding_mask=None, + need_weights=True, + attn_mask=None, + average_attn_weights=True, + ): + + if not self.batch_first: # N, B, C + query = query.permute(1, 0, 2) # B, N_q, C + key = key.permute(1, 0, 2) # B, N_k, C + value = value.permute(1, 0, 2) # B, N_k, C + B, N_q, C = query.shape + B, N_k, C = key.shape + + q = ( + self.q(query) + .reshape(B, N_q, self.num_heads, C // self.num_heads) + .permute(0, 2, 1, 3) + ) + q = q.flatten(0, 1) + k = ( + self.k(key) + .reshape(B, N_k, self.num_heads, C // self.num_heads) + .permute(0, 2, 1, 3) + ) + k = k.flatten(0, 1) + v = ( + self.v(value) + .reshape(B, N_k, self.num_heads, C // self.num_heads) + .permute(0, 2, 1, 3) + ) + v = v.flatten(0, 1) + + if self.auto_sparsity: + assert attn_mask is None + x = dynamic_sparse_attention(q, k, v, sparsity=self.auto_sparsity) + else: + x = scaled_dot_product_attention(q, k, v, attn_mask, dropout=self.attn_drop) + x = x.reshape(B, self.num_heads, N_q, C // self.num_heads) + + x = x.transpose(1, 2).reshape(B, N_q, C) + x = self.proj(x) + x = self.proj_drop(x) + if not self.batch_first: + x = x.permute(1, 0, 2) + return x, None + + +def scaled_query_key_softmax(q, k, att_mask): + from xformers.ops import masked_matmul + q = q / (k.size(-1)) ** 0.5 + att = masked_matmul(q, k.transpose(-2, -1), att_mask) + att = torch.nn.functional.softmax(att, -1) + return att + + +def scaled_dot_product_attention(q, k, v, att_mask, dropout): + att = scaled_query_key_softmax(q, k, att_mask=att_mask) + att = dropout(att) + y = att @ v + return y + + +def _compute_buckets(x, R): + qq = torch.einsum('btf,bfhi->bhti', x, R) + qq = torch.cat([qq, -qq], dim=-1) + buckets = qq.argmax(dim=-1) + + return buckets.permute(0, 2, 1).byte().contiguous() + + +def dynamic_sparse_attention(query, key, value, sparsity, infer_sparsity=True, attn_bias=None): + # assert False, "The code for the custom sparse kernel is not ready for release yet." 
+ from xformers.ops import find_locations, sparse_memory_efficient_attention + n_hashes = 32 + proj_size = 4 + query, key, value = [x.contiguous() for x in [query, key, value]] + with torch.no_grad(): + R = torch.randn(1, query.shape[-1], n_hashes, proj_size // 2, device=query.device) + bucket_query = _compute_buckets(query, R) + bucket_key = _compute_buckets(key, R) + row_offsets, column_indices = find_locations( + bucket_query, bucket_key, sparsity, infer_sparsity) + return sparse_memory_efficient_attention( + query, key, value, row_offsets, column_indices, attn_bias) diff --git a/audio_separator/separator/uvr_lib_v5/demucs/utils.py b/audio_separator/separator/uvr_lib_v5/demucs/utils.py new file mode 100644 index 0000000..94bd323 --- /dev/null +++ b/audio_separator/separator/uvr_lib_v5/demucs/utils.py @@ -0,0 +1,502 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from collections import defaultdict +from contextlib import contextmanager +import math +import os +import tempfile +import typing as tp + +import errno +import functools +import hashlib +import inspect +import io +import os +import random +import socket +import tempfile +import warnings +import zlib +import tkinter as tk + +from diffq import UniformQuantizer, DiffQuantizer +import torch as th +import tqdm +from torch import distributed +from torch.nn import functional as F + +import torch + +def unfold(a, kernel_size, stride): + """Given input of size [*OT, T], output Tensor of size [*OT, F, K] + with K the kernel size, by extracting frames with the given stride. + + This will pad the input so that `F = ceil(T / K)`. + + see https://github.com/pytorch/pytorch/issues/60466 + """ + *shape, length = a.shape + n_frames = math.ceil(length / stride) + tgt_length = (n_frames - 1) * stride + kernel_size + a = F.pad(a, (0, tgt_length - length)) + strides = list(a.stride()) + assert strides[-1] == 1, 'data should be contiguous' + strides = strides[:-1] + [stride, 1] + return a.as_strided([*shape, n_frames, kernel_size], strides) + + +def center_trim(tensor: torch.Tensor, reference: tp.Union[torch.Tensor, int]): + """ + Center trim `tensor` with respect to `reference`, along the last dimension. + `reference` can also be a number, representing the length to trim to. + If the size difference != 0 mod 2, the extra sample is removed on the right side. + """ + ref_size: int + if isinstance(reference, torch.Tensor): + ref_size = reference.size(-1) + else: + ref_size = reference + delta = tensor.size(-1) - ref_size + if delta < 0: + raise ValueError("tensor must be larger than reference. " f"Delta is {delta}.") + if delta: + tensor = tensor[..., delta // 2:-(delta - delta // 2)] + return tensor + + +def pull_metric(history: tp.List[dict], name: str): + out = [] + for metrics in history: + metric = metrics + for part in name.split("."): + metric = metric[part] + out.append(metric) + return out + + +def EMA(beta: float = 1): + """ + Exponential Moving Average callback. + Returns a single function that can be called to repeatidly update the EMA + with a dict of metrics. The callback will return + the new averaged dict of metrics. + + Note that for `beta=1`, this is just plain averaging. 
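+ For instance, with `beta=0.9` and a metric fed the values 1.0 then 2.0 (weight 1), the callback returns 1.0 after the first update and (0.9 * 1.0 + 2.0) / (0.9 * 1.0 + 1.0), i.e. about 1.53, after the second.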
+ """ + fix: tp.Dict[str, float] = defaultdict(float) + total: tp.Dict[str, float] = defaultdict(float) + + def _update(metrics: dict, weight: float = 1) -> dict: + nonlocal total, fix + for key, value in metrics.items(): + total[key] = total[key] * beta + weight * float(value) + fix[key] = fix[key] * beta + weight + return {key: tot / fix[key] for key, tot in total.items()} + return _update + + +def sizeof_fmt(num: float, suffix: str = 'B'): + """ + Given `num` bytes, return human readable size. + Taken from https://stackoverflow.com/a/1094933 + """ + for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']: + if abs(num) < 1024.0: + return "%3.1f%s%s" % (num, unit, suffix) + num /= 1024.0 + return "%.1f%s%s" % (num, 'Yi', suffix) + + +@contextmanager +def temp_filenames(count: int, delete=True): + names = [] + try: + for _ in range(count): + names.append(tempfile.NamedTemporaryFile(delete=False).name) + yield names + finally: + if delete: + for name in names: + os.unlink(name) + +def average_metric(metric, count=1.): + """ + Average `metric` which should be a float across all hosts. `count` should be + the weight for this particular host (i.e. number of examples). + """ + metric = th.tensor([count, count * metric], dtype=th.float32, device='cuda') + distributed.all_reduce(metric, op=distributed.ReduceOp.SUM) + return metric[1].item() / metric[0].item() + + +def free_port(host='', low=20000, high=40000): + """ + Return a port number that is most likely free. + This could suffer from a race condition although + it should be quite rare. + """ + sock = socket.socket() + while True: + port = random.randint(low, high) + try: + sock.bind((host, port)) + except OSError as error: + if error.errno == errno.EADDRINUSE: + continue + raise + return port + + +def sizeof_fmt(num, suffix='B'): + """ + Given `num` bytes, return human readable size. + Taken from https://stackoverflow.com/a/1094933 + """ + for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']: + if abs(num) < 1024.0: + return "%3.1f%s%s" % (num, unit, suffix) + num /= 1024.0 + return "%.1f%s%s" % (num, 'Yi', suffix) + + +def human_seconds(seconds, display='.2f'): + """ + Given `seconds` seconds, return human readable duration. 
+ """ + value = seconds * 1e6 + ratios = [1e3, 1e3, 60, 60, 24] + names = ['us', 'ms', 's', 'min', 'hrs', 'days'] + last = names.pop(0) + for name, ratio in zip(names, ratios): + if value / ratio < 0.3: + break + value /= ratio + last = name + return f"{format(value, display)} {last}" + + +class TensorChunk: + def __init__(self, tensor, offset=0, length=None): + total_length = tensor.shape[-1] + assert offset >= 0 + assert offset < total_length + + if length is None: + length = total_length - offset + else: + length = min(total_length - offset, length) + + self.tensor = tensor + self.offset = offset + self.length = length + self.device = tensor.device + + @property + def shape(self): + shape = list(self.tensor.shape) + shape[-1] = self.length + return shape + + def padded(self, target_length): + delta = target_length - self.length + total_length = self.tensor.shape[-1] + assert delta >= 0 + + start = self.offset - delta // 2 + end = start + target_length + + correct_start = max(0, start) + correct_end = min(total_length, end) + + pad_left = correct_start - start + pad_right = end - correct_end + + out = F.pad(self.tensor[..., correct_start:correct_end], (pad_left, pad_right)) + assert out.shape[-1] == target_length + return out + + +def tensor_chunk(tensor_or_chunk): + if isinstance(tensor_or_chunk, TensorChunk): + return tensor_or_chunk + else: + assert isinstance(tensor_or_chunk, th.Tensor) + return TensorChunk(tensor_or_chunk) + + +def apply_model_v1(model, mix, shifts=None, split=False, progress=False, set_progress_bar=None): + """ + Apply model to a given mixture. + + Args: + shifts (int): if > 0, will shift in time `mix` by a random amount between 0 and 0.5 sec + and apply the oppositve shift to the output. This is repeated `shifts` time and + all predictions are averaged. This effectively makes the model time equivariant + and improves SDR by up to 0.2 points. + split (bool): if True, the input will be broken down in 8 seconds extracts + and predictions will be performed individually on each and concatenated. + Useful for model with large memory footprint like Tasnet. 
+ progress (bool): if True, show a progress bar (requires split=True) + """ + + channels, length = mix.size() + device = mix.device + progress_value = 0 + + if split: + out = th.zeros(4, channels, length, device=device) + shift = model.samplerate * 10 + offsets = range(0, length, shift) + scale = 10 + if progress: + offsets = tqdm.tqdm(offsets, unit_scale=scale, ncols=120, unit='seconds') + for offset in offsets: + chunk = mix[..., offset:offset + shift] + if set_progress_bar: + progress_value += 1 + set_progress_bar(0.1, (0.8/len(offsets)*progress_value)) + chunk_out = apply_model_v1(model, chunk, shifts=shifts, set_progress_bar=set_progress_bar) + else: + chunk_out = apply_model_v1(model, chunk, shifts=shifts) + out[..., offset:offset + shift] = chunk_out + offset += shift + return out + elif shifts: + max_shift = int(model.samplerate / 2) + mix = F.pad(mix, (max_shift, max_shift)) + offsets = list(range(max_shift)) + random.shuffle(offsets) + out = 0 + for offset in offsets[:shifts]: + shifted = mix[..., offset:offset + length + max_shift] + if set_progress_bar: + shifted_out = apply_model_v1(model, shifted, set_progress_bar=set_progress_bar) + else: + shifted_out = apply_model_v1(model, shifted) + out += shifted_out[..., max_shift - offset:max_shift - offset + length] + out /= shifts + return out + else: + valid_length = model.valid_length(length) + delta = valid_length - length + padded = F.pad(mix, (delta // 2, delta - delta // 2)) + with th.no_grad(): + out = model(padded.unsqueeze(0))[0] + return center_trim(out, mix) + +def apply_model_v2(model, mix, shifts=None, split=False, + overlap=0.25, transition_power=1., progress=False, set_progress_bar=None): + """ + Apply model to a given mixture. + + Args: + shifts (int): if > 0, will shift in time `mix` by a random amount between 0 and 0.5 sec + and apply the oppositve shift to the output. This is repeated `shifts` time and + all predictions are averaged. This effectively makes the model time equivariant + and improves SDR by up to 0.2 points. + split (bool): if True, the input will be broken down in 8 seconds extracts + and predictions will be performed individually on each and concatenated. + Useful for model with large memory footprint like Tasnet. + progress (bool): if True, show a progress bar (requires split=True) + """ + + assert transition_power >= 1, "transition_power < 1 leads to weird behavior." + device = mix.device + channels, length = mix.shape + progress_value = 0 + + if split: + out = th.zeros(len(model.sources), channels, length, device=device) + sum_weight = th.zeros(length, device=device) + segment = model.segment_length + stride = int((1 - overlap) * segment) + offsets = range(0, length, stride) + scale = stride / model.samplerate + if progress: + offsets = tqdm.tqdm(offsets, unit_scale=scale, ncols=120, unit='seconds') + # We start from a triangle shaped weight, with maximal weight in the middle + # of the segment. Then we normalize and take to the power `transition_power`. + # Large values of transition power will lead to sharper transitions. + weight = th.cat([th.arange(1, segment // 2 + 1), + th.arange(segment - segment // 2, 0, -1)]).to(device) + assert len(weight) == segment + # If the overlap < 50%, this will translate to linear transition when + # transition_power is 1. 
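+ # The accumulated chunks are divided by sum_weight below, so every sample ends up as a weighted average of all chunks covering it.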
+ weight = (weight / weight.max())**transition_power + for offset in offsets: + chunk = TensorChunk(mix, offset, segment) + if set_progress_bar: + progress_value += 1 + set_progress_bar(0.1, (0.8/len(offsets)*progress_value)) + chunk_out = apply_model_v2(model, chunk, shifts=shifts, set_progress_bar=set_progress_bar) + else: + chunk_out = apply_model_v2(model, chunk, shifts=shifts) + chunk_length = chunk_out.shape[-1] + out[..., offset:offset + segment] += weight[:chunk_length] * chunk_out + sum_weight[offset:offset + segment] += weight[:chunk_length] + offset += segment + assert sum_weight.min() > 0 + out /= sum_weight + return out + elif shifts: + max_shift = int(0.5 * model.samplerate) + mix = tensor_chunk(mix) + padded_mix = mix.padded(length + 2 * max_shift) + out = 0 + for _ in range(shifts): + offset = random.randint(0, max_shift) + shifted = TensorChunk(padded_mix, offset, length + max_shift - offset) + + if set_progress_bar: + progress_value += 1 + shifted_out = apply_model_v2(model, shifted, set_progress_bar=set_progress_bar) + else: + shifted_out = apply_model_v2(model, shifted) + out += shifted_out[..., max_shift - offset:] + out /= shifts + return out + else: + valid_length = model.valid_length(length) + mix = tensor_chunk(mix) + padded_mix = mix.padded(valid_length) + with th.no_grad(): + out = model(padded_mix.unsqueeze(0))[0] + return center_trim(out, length) + + +@contextmanager +def temp_filenames(count, delete=True): + names = [] + try: + for _ in range(count): + names.append(tempfile.NamedTemporaryFile(delete=False).name) + yield names + finally: + if delete: + for name in names: + os.unlink(name) + + +def get_quantizer(model, args, optimizer=None): + quantizer = None + if args.diffq: + quantizer = DiffQuantizer( + model, min_size=args.q_min_size, group_size=8) + if optimizer is not None: + quantizer.setup_optimizer(optimizer) + elif args.qat: + quantizer = UniformQuantizer( + model, bits=args.qat, min_size=args.q_min_size) + return quantizer + + +def load_model(path, strict=False): + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + load_from = path + package = th.load(load_from, 'cpu') + + klass = package["klass"] + args = package["args"] + kwargs = package["kwargs"] + + if strict: + model = klass(*args, **kwargs) + else: + sig = inspect.signature(klass) + for key in list(kwargs): + if key not in sig.parameters: + warnings.warn("Dropping inexistant parameter " + key) + del kwargs[key] + model = klass(*args, **kwargs) + + state = package["state"] + training_args = package["training_args"] + quantizer = get_quantizer(model, training_args) + + set_state(model, quantizer, state) + return model + + +def get_state(model, quantizer): + if quantizer is None: + state = {k: p.data.to('cpu') for k, p in model.state_dict().items()} + else: + state = quantizer.get_quantized_state() + buf = io.BytesIO() + th.save(state, buf) + state = {'compressed': zlib.compress(buf.getvalue())} + return state + + +def set_state(model, quantizer, state): + if quantizer is None: + model.load_state_dict(state) + else: + buf = io.BytesIO(zlib.decompress(state["compressed"])) + state = th.load(buf, "cpu") + quantizer.restore_quantized_state(state) + + return state + + +def save_state(state, path): + buf = io.BytesIO() + th.save(state, buf) + sig = hashlib.sha256(buf.getvalue()).hexdigest()[:8] + + path = path.parent / (path.stem + "-" + sig + path.suffix) + path.write_bytes(buf.getvalue()) + + +def save_model(model, quantizer, training_args, path): + args, kwargs = 
model._init_args_kwargs + klass = model.__class__ + + state = get_state(model, quantizer) + + save_to = path + package = { + 'klass': klass, + 'args': args, + 'kwargs': kwargs, + 'state': state, + 'training_args': training_args, + } + th.save(package, save_to) + + +def capture_init(init): + @functools.wraps(init) + def __init__(self, *args, **kwargs): + self._init_args_kwargs = (args, kwargs) + init(self, *args, **kwargs) + + return __init__ + +class DummyPoolExecutor: + class DummyResult: + def __init__(self, func, *args, **kwargs): + self.func = func + self.args = args + self.kwargs = kwargs + + def result(self): + return self.func(*self.args, **self.kwargs) + + def __init__(self, workers=0): + pass + + def submit(self, func, *args, **kwargs): + return DummyPoolExecutor.DummyResult(func, *args, **kwargs) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, exc_tb): + return diff --git a/poetry.lock b/poetry.lock index cab3ae6..14641d8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,5 +1,15 @@ # This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +[[package]] +name = "antlr4-python3-runtime" +version = "4.9.3" +description = "ANTLR 4.9.3 runtime for Python 3.7" +optional = false +python-versions = "*" +files = [ + {file = "antlr4-python3-runtime-4.9.3.tar.gz", hash = "sha256:f224469b4168294902bb1efa80a8bf7855f24c99aef99cbefc1bcd3cce77881b"}, +] + [[package]] name = "audioread" version = "3.0.1" @@ -343,6 +353,73 @@ tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.1 [package.extras] toml = ["tomli"] +[[package]] +name = "cython" +version = "3.0.8" +description = "The Cython compiler for writing C extensions in the Python language." +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "Cython-3.0.8-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a846e0a38e2b24e9a5c5dc74b0e54c6e29420d88d1dafabc99e0fc0f3e338636"}, + {file = "Cython-3.0.8-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45523fdc2b78d79b32834cc1cc12dc2ca8967af87e22a3ee1bff20e77c7f5520"}, + {file = "Cython-3.0.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:baa0b7f3f841fe087410cab66778e2d3fb20ae2d2078a2be3dffe66c6574be39"}, + {file = "Cython-3.0.8-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e87294e33e40c289c77a135f491cd721bd089f193f956f7b8ed5aa2d0b8c558f"}, + {file = "Cython-3.0.8-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:a1df7a129344b1215c20096d33c00193437df1a8fcca25b71f17c23b1a44f782"}, + {file = "Cython-3.0.8-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:13c2a5e57a0358da467d97667297bf820b62a1a87ae47c5f87938b9bb593acbd"}, + {file = "Cython-3.0.8-cp310-cp310-win32.whl", hash = "sha256:96b028f044f5880e3cb18ecdcfc6c8d3ce9d0af28418d5ab464509f26d8adf12"}, + {file = "Cython-3.0.8-cp310-cp310-win_amd64.whl", hash = "sha256:8140597a8b5cc4f119a1190f5a2228a84f5ca6d8d9ec386cfce24663f48b2539"}, + {file = "Cython-3.0.8-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:aae26f9663e50caf9657148403d9874eea41770ecdd6caf381d177c2b1bb82ba"}, + {file = "Cython-3.0.8-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:547eb3cdb2f8c6f48e6865d5a741d9dd051c25b3ce076fbca571727977b28ac3"}, + {file = "Cython-3.0.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:5a567d4b9ba70b26db89d75b243529de9e649a2f56384287533cf91512705bee"}, + {file = "Cython-3.0.8-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:51d1426263b0e82fb22bda8ea60dc77a428581cc19e97741011b938445d383f1"}, + {file = "Cython-3.0.8-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c26daaeccda072459b48d211415fd1e5507c06bcd976fa0d5b8b9f1063467d7b"}, + {file = "Cython-3.0.8-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:289ce7838208211cd166e975865fd73b0649bf118170b6cebaedfbdaf4a37795"}, + {file = "Cython-3.0.8-cp311-cp311-win32.whl", hash = "sha256:c8aa05f5e17f8042a3be052c24f2edc013fb8af874b0bf76907d16c51b4e7871"}, + {file = "Cython-3.0.8-cp311-cp311-win_amd64.whl", hash = "sha256:000dc9e135d0eec6ecb2b40a5b02d0868a2f8d2e027a41b0fe16a908a9e6de02"}, + {file = "Cython-3.0.8-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:90d3fe31db55685d8cb97d43b0ec39ef614fcf660f83c77ed06aa670cb0e164f"}, + {file = "Cython-3.0.8-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e24791ddae2324e88e3c902a765595c738f19ae34ee66bfb1a6dac54b1833419"}, + {file = "Cython-3.0.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2f020fa1c0552052e0660790b8153b79e3fc9a15dbd8f1d0b841fe5d204a6ae6"}, + {file = "Cython-3.0.8-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:18bfa387d7a7f77d7b2526af69a65dbd0b731b8d941aaff5becff8e21f6d7717"}, + {file = "Cython-3.0.8-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:fe81b339cffd87c0069c6049b4d33e28bdd1874625ee515785bf42c9fdff3658"}, + {file = "Cython-3.0.8-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:80fd94c076e1e1b1ee40a309be03080b75f413e8997cddcf401a118879863388"}, + {file = "Cython-3.0.8-cp312-cp312-win32.whl", hash = "sha256:85077915a93e359a9b920280d214dc0cf8a62773e1f3d7d30fab8ea4daed670c"}, + {file = "Cython-3.0.8-cp312-cp312-win_amd64.whl", hash = "sha256:0cb2dcc565c7851f75d496f724a384a790fab12d1b82461b663e66605bec429a"}, + {file = "Cython-3.0.8-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:870d2a0a7e3cbd5efa65aecdb38d715ea337a904ea7bb22324036e78fb7068e7"}, + {file = "Cython-3.0.8-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7e8f2454128974905258d86534f4fd4f91d2f1343605657ecab779d80c9d6d5e"}, + {file = "Cython-3.0.8-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1949d6aa7bc792554bee2b67a9fe41008acbfe22f4f8df7b6ec7b799613a4b3"}, + {file = "Cython-3.0.8-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c9f2c6e1b8f3bcd6cb230bac1843f85114780bb8be8614855b1628b36bb510e0"}, + {file = "Cython-3.0.8-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:05d7eddc668ae7993643f32c7661f25544e791edb745758672ea5b1a82ecffa6"}, + {file = "Cython-3.0.8-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:bfabe115deef4ada5d23c87bddb11289123336dcc14347011832c07db616dd93"}, + {file = "Cython-3.0.8-cp36-cp36m-win32.whl", hash = "sha256:0c38c9f0bcce2df0c3347285863621be904ac6b64c5792d871130569d893efd7"}, + {file = "Cython-3.0.8-cp36-cp36m-win_amd64.whl", hash = "sha256:6c46939c3983217d140999de7c238c3141f56b1ea349e47ca49cae899969aa2c"}, + {file = "Cython-3.0.8-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:115f0a50f752da6c99941b103b5cb090da63eb206abbc7c2ad33856ffc73f064"}, + {file = "Cython-3.0.8-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:c9c0f29246734561c90f36e70ed0506b61aa3d044e4cc4cba559065a2a741fae"}, + {file = "Cython-3.0.8-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ab75242869ff71e5665fe5c96f3378e79e792fa3c11762641b6c5afbbbbe026"}, + {file = "Cython-3.0.8-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6717c06e9cfc6c1df18543cd31a21f5d8e378a40f70c851fa2d34f0597037abc"}, + {file = "Cython-3.0.8-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:9d3f74388db378a3c6fd06e79a809ed98df3f56484d317b81ee762dbf3c263e0"}, + {file = "Cython-3.0.8-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ae7ac561fd8253a9ae96311e91d12af5f701383564edc11d6338a7b60b285a6f"}, + {file = "Cython-3.0.8-cp37-cp37m-win32.whl", hash = "sha256:97b2a45845b993304f1799664fa88da676ee19442b15fdcaa31f9da7e1acc434"}, + {file = "Cython-3.0.8-cp37-cp37m-win_amd64.whl", hash = "sha256:9e2be2b340fea46fb849d378f9b80d3c08ff2e81e2bfbcdb656e2e3cd8c6b2dc"}, + {file = "Cython-3.0.8-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2cde23c555470db3f149ede78b518e8274853745289c956a0e06ad8d982e4db9"}, + {file = "Cython-3.0.8-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7990ca127e1f1beedaf8fc8bf66541d066ef4723ad7d8d47a7cbf842e0f47580"}, + {file = "Cython-3.0.8-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b983c8e6803f016146c26854d9150ddad5662960c804ea7f0c752c9266752f0"}, + {file = "Cython-3.0.8-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a973268d7ca1a2bdf78575e459a94a78e1a0a9bb62a7db0c50041949a73b02ff"}, + {file = "Cython-3.0.8-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:61a237bc9dd23c7faef0fcfce88c11c65d0c9bb73c74ccfa408b3a012073c20e"}, + {file = "Cython-3.0.8-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:3a3d67f079598af49e90ff9655bf85bd358f093d727eb21ca2708f467c489cae"}, + {file = "Cython-3.0.8-cp38-cp38-win32.whl", hash = "sha256:17a642bb01a693e34c914106566f59844b4461665066613913463a719e0dd15d"}, + {file = "Cython-3.0.8-cp38-cp38-win_amd64.whl", hash = "sha256:2cdfc32252f3b6dc7c94032ab744dcedb45286733443c294d8f909a4854e7f83"}, + {file = "Cython-3.0.8-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fa97893d99385386925d00074654aeae3a98867f298d1e12ceaf38a9054a9bae"}, + {file = "Cython-3.0.8-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f05c0bf9d085c031df8f583f0d506aa3be1692023de18c45d0aaf78685bbb944"}, + {file = "Cython-3.0.8-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de892422582f5758bd8de187e98ac829330ec1007bc42c661f687792999988a7"}, + {file = "Cython-3.0.8-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:314f2355a1f1d06e3c431eaad4708cf10037b5e91e4b231d89c913989d0bdafd"}, + {file = "Cython-3.0.8-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:78825a3774211e7d5089730f00cdf7f473042acc9ceb8b9eeebe13ed3a5541de"}, + {file = "Cython-3.0.8-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:df8093deabc55f37028190cf5e575c26aad23fc673f34b85d5f45076bc37ce39"}, + {file = "Cython-3.0.8-cp39-cp39-win32.whl", hash = "sha256:1aca1b97e0095b3a9a6c33eada3f661a4ed0d499067d121239b193e5ba3bb4f0"}, + {file = "Cython-3.0.8-cp39-cp39-win_amd64.whl", hash = "sha256:16873d78be63bd38ffb759da7ab82814b36f56c769ee02b1d5859560e4c3ac3c"}, + {file = "Cython-3.0.8-py2.py3-none-any.whl", hash = 
"sha256:171b27051253d3f9108e9759e504ba59ff06e7f7ba944457f94deaf9c21bf0b6"}, + {file = "Cython-3.0.8.tar.gz", hash = "sha256:8333423d8fd5765e7cceea3a9985dd1e0a5dfeb2734629e1a2ed2d6233d39de6"}, +] + [[package]] name = "decorator" version = "5.1.1" @@ -354,6 +431,61 @@ files = [ {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, ] +[[package]] +name = "diffq" +version = "0.2.4" +description = "Differentiable quantization framework for PyTorch." +optional = false +python-versions = ">=3.7.0" +files = [ + {file = "diffq-0.2.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3d8e6d6b882dd93568b41a7da9ff9657845ec08c82e71460544d0d04ed112320"}, + {file = "diffq-0.2.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:15d5055ebfc629914689d66fcfa36f6d751fd45b4b2331ba0d3390604e2b40fa"}, + {file = "diffq-0.2.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cf990911fc2c932e505df9958ce50417f182fe9178b2dbe329173b29e6521727"}, + {file = "diffq-0.2.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:d62ffd89498611dbf32cde417f5fc8d222f450085b06afb4e8764307906ab2ca"}, + {file = "diffq-0.2.4-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:71a374573ec064227665208a5892d88032cb18736f68560d5522e0c48138ced1"}, + {file = "diffq-0.2.4-cp310-cp310-win32.whl", hash = "sha256:1480ea49785dfdcc793a3f20043aba430f510d51487063e3617020f5a2d2a753"}, + {file = "diffq-0.2.4-cp310-cp310-win_amd64.whl", hash = "sha256:3467622841b15ece3d953fa42ad65f41703afe30e777bb910b96c89125174c8e"}, + {file = "diffq-0.2.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:3aef094383d39e12508ddf61c45a377986b2d4bac26ee553b6504fee10e2ff9d"}, + {file = "diffq-0.2.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:0729231949ec74641709ad9b713ba127898735ba20ec8f44677d984d2ce1c3b9"}, + {file = "diffq-0.2.4-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:d6779a81fcb8045d006a5f309c52a34e48fefb73db99232b4b1452a8829c083c"}, + {file = "diffq-0.2.4-cp37-cp37m-win32.whl", hash = "sha256:36f568bb1bbf75ac5601115e6253828c8c7b21a0501d7fcdc3b9545f80dc74f7"}, + {file = "diffq-0.2.4-cp37-cp37m-win_amd64.whl", hash = "sha256:ee7f31d56f5131c2577dfa7bdb7d7284c5cce031fca0e30ac10d248b3e0e6841"}, + {file = "diffq-0.2.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:de84917882a3def0d71548e5366813f7e25a7b110d2085fe0b0fa4c9877f5098"}, + {file = "diffq-0.2.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8937f15e45464fd899e9b3a6b1b57700c977367caf50a626f872dbb7883e3c1c"}, + {file = "diffq-0.2.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:8ff6ff92f3978770b0d2d355a5e553a554ea22732236cda9171c0683e5da5577"}, + {file = "diffq-0.2.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:0b0cf50749d1a80fb74ee9e5135e08f06b519ab57f5b725bb374b2866412b725"}, + {file = "diffq-0.2.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:0381dedef3b88b4910eedfd894610ecac9affd689c2191c96694b70b3366dd27"}, + {file = "diffq-0.2.4-cp38-cp38-win32.whl", hash = "sha256:e793fed11642fdb4909efc0f87f2dbb52808dc5fa9865d7fcadb93ec5fd3aacd"}, + {file = "diffq-0.2.4-cp38-cp38-win_amd64.whl", hash = 
"sha256:7566f823bda2f3f786dfea1ca2cfaa9663b854ccafcb1b185a4370690b628cf5"}, + {file = "diffq-0.2.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:967a30e3a9da922d8705c3dbf44bb82d5b76a3ad49ecaa4c9450d97479fe8a31"}, + {file = "diffq-0.2.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:78124e86f1e208830bfebe744c2321d92c65ba5d7e125d260ed418b4a2fa93df"}, + {file = "diffq-0.2.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ce966eb21bbb983e5bf5957c5832cad57d0968b7c5602da4065c3d1603ef8a95"}, + {file = "diffq-0.2.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:02268472f1646cafe4fb3feaad9ac519f7e65d617871a58b56e71bc552fc8fb3"}, + {file = "diffq-0.2.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:a5fc5cf4967d7cea065e75d2044824137ad08f1ccf7571d871cbf03bcf8809bd"}, + {file = "diffq-0.2.4-cp39-cp39-win32.whl", hash = "sha256:71575ca7202628ee1380a993aebee7e15c23ee12a96bc1a4dd1bff023aafdee2"}, + {file = "diffq-0.2.4-cp39-cp39-win_amd64.whl", hash = "sha256:9729121832c9abbbf4c443effe4e2c2952c48170d8c8255d79012d236c8dcd6d"}, + {file = "diffq-0.2.4.tar.gz", hash = "sha256:049064861e974ebf00d0badab8b324c775037371419eda3150985b9d477b5bd2"}, +] + +[package.dependencies] +Cython = "*" +numpy = "*" +torch = "*" + +[package.extras] +dev = ["coverage", "flake8", "pdoc3", "torchvision"] + +[[package]] +name = "einops" +version = "0.7.0" +description = "A new flavour of deep learning operations" +optional = false +python-versions = ">=3.8" +files = [ + {file = "einops-0.7.0-py3-none-any.whl", hash = "sha256:0f3096f26b914f465f6ff3c66f5478f9a5e380bb367ffc6493a68143fbbf1fd1"}, + {file = "einops-0.7.0.tar.gz", hash = "sha256:b2b04ad6081a3b227080c9bf5e3ace7160357ff03043cd66cc5b2319eb7031d1"}, +] + [[package]] name = "exceptiongroup" version = "1.2.0" @@ -494,6 +626,22 @@ files = [ {file = "joblib-1.3.2.tar.gz", hash = "sha256:92f865e621e17784e7955080b6d042489e3b8e294949cc44c6eac304f59772b1"}, ] +[[package]] +name = "julius" +version = "0.2.7" +description = "Nice DSP sweets: resampling, FFT Convolutions. All with PyTorch, differentiable and with CUDA support." 
+optional = false +python-versions = ">=3.6.0" +files = [ + {file = "julius-0.2.7.tar.gz", hash = "sha256:3c0f5f5306d7d6016fcc95196b274cae6f07e2c9596eed314e4e7641554fbb08"}, +] + +[package.dependencies] +torch = ">=1.7.0" + +[package.extras] +dev = ["coverage", "flake8", "mypy", "onnxruntime", "pdoc3", "resampy (==0.2.2)"] + [[package]] name = "lazy-loader" version = "0.3" @@ -970,6 +1118,21 @@ files = [ {file = "nvidia_nvtx_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:65f4d98982b31b60026e0e6de73fbdfc09d08a96f4656dd3665ca616a11e1e82"}, ] +[[package]] +name = "omegaconf" +version = "2.3.0" +description = "A flexible configuration library" +optional = false +python-versions = ">=3.6" +files = [ + {file = "omegaconf-2.3.0-py3-none-any.whl", hash = "sha256:7b4df175cdb08ba400f45cae3bdcae7ba8365db4d165fc65fd04b050ab63b46b"}, + {file = "omegaconf-2.3.0.tar.gz", hash = "sha256:d5d4b6d29955cc50ad50c46dc269bcd92c6e00f5f90d23ab5fee7bfca4ba4cc7"}, +] + +[package.dependencies] +antlr4-python3-runtime = "==4.9.*" +PyYAML = ">=5.1.0" + [[package]] name = "onnx" version = "1.15.0" @@ -1351,6 +1514,66 @@ pytest = ">=4.6" [package.extras] testing = ["fields", "hunter", "process-tests", "pytest-xdist", "six", "virtualenv"] +[[package]] +name = "pyyaml" +version = "6.0.1" +description = "YAML parser and emitter for Python" +optional = false +python-versions = ">=3.6" +files = [ + {file = "PyYAML-6.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a"}, + {file = "PyYAML-6.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f"}, + {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, + {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, + {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, + {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, + {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, + {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, + {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, + {file = "PyYAML-6.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab"}, + {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, + {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, + {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, + {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, + {file = 
"PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, + {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, + {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, + {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, + {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, + {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, + {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, + {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, + {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd"}, + {file = "PyYAML-6.0.1-cp36-cp36m-win32.whl", hash = "sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585"}, + {file = "PyYAML-6.0.1-cp36-cp36m-win_amd64.whl", hash = "sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa"}, + {file = "PyYAML-6.0.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3"}, + {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27"}, + {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3"}, + {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c"}, + {file = "PyYAML-6.0.1-cp37-cp37m-win32.whl", hash = "sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba"}, + {file = "PyYAML-6.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867"}, + {file = "PyYAML-6.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595"}, + {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, + {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, + {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, + {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, + {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, + {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, + {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, + {file = "PyYAML-6.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859"}, + {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, + {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, + {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, + {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, + {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, + {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, + {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, +] + [[package]] name = "requests" version = "2.31.0" @@ -1766,4 +1989,4 @@ gpu = ["onnxruntime-gpu"] [metadata] lock-version = "2.0" python-versions = ">=3.9" -content-hash = "1461d00cd955dbeaf3eb7b72eaf816c3eabe8314742f426ce5a89e23687f6950" +content-hash = "3d76f9bb1f9f50cdc5738661b42f71b699b07d6c349db1a9dd566bbc3e9e0082" diff --git a/pyproject.toml b/pyproject.toml index c1485c6..4666dcb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,6 +41,10 @@ onnx = ">=1.14" onnx2torch = ">=1.5" onnxruntime = { version = ">=1.17", optional = true } onnxruntime-gpu = { version = ">=1.17", optional = true } +julius = "^0.2.7" +omegaconf = "^2.3.0" +diffq = "^0.2.4" +einops = "^0.7.0" [tool.poetry.extras] cpu = ["onnxruntime"]