diff --git a/README.md b/README.md
index f7028c2..17e98c3 100644
--- a/README.md
+++ b/README.md
@@ -12,35 +12,27 @@
 The repository also includes code for directly converting silent speech to text.

 The EMG and audio data can be downloaded from .
 The scripts expect the data to be located in a `emg_data` subdirectory by default, but the location can be overridden with flags (see the top of `read_emg.py`).

-Force-aligned phonemes from the Montreal Forced Aligner has been included in the git submodule.
-By default, this data is expected to be in a subdirectory `text_alignments`.
+Force-aligned phonemes from the Montreal Forced Aligner have been included as a git submodule, which must be updated using the process described in "Environment Setup" below.
 Note that there will not be an exception if the directory is not found, but logged phoneme prediction accuracies reporting 100% is a sign that the directory has not been loaded correctly.

 ## Environment Setup

-This code requires Python 3.8 or later.
-We strongly recommend running in a new Anaconda environment.
-
-First we will do some conda installs. Your environment must use CUDA 11.8 or later to support RTX 4090.
+We strongly recommend running in Anaconda.
+To create a new environment with all required dependencies, run:
 ```
-conda install pytorch==2.0 torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia
-conda install libsndfile -c conda-forge
+conda env create -f environment.yml
+conda activate silent_speech
 ```
+This will install PyTorch with CUDA 11.8 support.

-HiFi-GAN has been included in the git submodule. (this replaces the WaveNet vocoder that was used in earlier versions).
-
-The rest of the required packages can be installed with pip or conda.
+You will also need to pull the git submodules for HiFi-GAN and the phoneme alignment data, using the following commands:
 ```
-conda install absl-py numpy=1.23 librosa=0.8.1 pysoundfile matplotlib scipy numba unidecode
-pip install jiwer==2.2.1 deepspeech==0.9.3 praat-textgrids noisereduce==1.1.0
+git submodule init
+git submodule update
+tar -xvzf text_alignments/text_alignments.tar.gz
 ```
-librosa 0.9.0 or later will not support for positional arguments, which will break the related function call. This version (0.8.1) is not compatible with numpy later than 1.24.
-
-jiwer 2.3.0 or later will not allow empty strings (see method **compute_measures**)
-
-Download pre-trained DeepSpeech model files. It is important that you use DeepSpeech version 0.7.0 model files to maintain consistency of evaluation. Note that the DeepSpeech pip package we recommend is version 0.9.3 (which uses a more up-to-date CUDA), but this is compatible with version 0.7.x model files.
+Use the following commands to download pre-trained DeepSpeech model files for evaluation. It is important to use the version 0.7.0 model files so that evaluation numbers are consistent with the original papers. Note that more recent DeepSpeech packages such as version 0.9.3 can be used, as long as they are compatible with the version 0.7.x model files.
 ```
 curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.7.0/deepspeech-0.7.0-models.pbmm
 curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.7.0/deepspeech-0.7.0-models.scorer
diff --git a/align.py b/align.py
index bfd9979..a28857e 100644
--- a/align.py
+++ b/align.py
@@ -2,7 +2,7 @@
 import matplotlib.pyplot as plt
 from numba import jit

-@jit
+@jit(nopython=True)
 def time_warp(costs):
     dtw = np.zeros_like(costs)
     dtw[0,1:] = np.inf
diff --git a/asr_evaluation.py b/asr_evaluation.py
index 7c3083e..e777864 100644
--- a/asr_evaluation.py
+++ b/asr_evaluation.py
@@ -7,16 +7,17 @@
 import numpy as np
 from unidecode import unidecode
 import librosa
+import tqdm

 def evaluate(testset, audio_directory):
     model = deepspeech.Model('deepspeech-0.7.0-models.pbmm')
     model.enableExternalScorer('deepspeech-0.7.0-models.scorer')
     predictions = []
     targets = []
-    for i, datapoint in enumerate(testset):
+    for i, datapoint in enumerate(tqdm.tqdm(testset, 'Evaluate outputs', disable=None)):
         audio, rate = sf.read(os.path.join(audio_directory,f'example_output_{i}.wav'))
         if rate != 16000:
-            audio = librosa.resample(audio, rate, 16000)
+            audio = librosa.resample(audio, orig_sr=rate, target_sr=16000)
         assert model.sampleRate() == 16000, 'wrong sample rate'
         audio_int16 = (audio*(2**15)).astype(np.int16)
         text = model.stt(audio_int16)
diff --git a/data_collection/clean_audio.py b/data_collection/clean_audio.py
index f21af9a..218ab6e 100644
--- a/data_collection/clean_audio.py
+++ b/data_collection/clean_audio.py
@@ -5,6 +5,7 @@
 import noisereduce as nr
 import soundfile as sf
 import librosa
+import tqdm

 def clean_directory(directory):
     silence, rate = sf.read(os.path.join(directory, '0_audio.flac'))
@@ -23,9 +24,9 @@
     assert len(audio_file_names) == len(all_audio_file_names), 'error discovering audio files'

     all_rmses = []
-    for fname in audio_file_names:
+    for fname in tqdm.tqdm(audio_file_names, 'Read for calibration', disable=None):
         data, rate = sf.read(fname)
-        rms = librosa.feature.rms(data)[0]
+        rms = librosa.feature.rms(y=data)[0]
         all_rmses.append(rms)

     silent_cutoff = 0.02
@@ -46,12 +47,12 @@
     if is_silent:
         print('long run of quiet audio, skipping volume normalization')

-    for i, fname in enumerate(audio_file_names):
+    for i, fname in enumerate(tqdm.tqdm(audio_file_names, 'Clean data', disable=None)):
         data, rate = sf.read(fname)
-        clean = nr.reduce_noise(audio_clip=data, noise_clip=silence)
+        clean = nr.reduce_noise(y=data, sr=rate, y_noise=silence, stationary=True)
         if rate != 22050:
-            clean = librosa.resample(clean, rate, 22050)
+            clean = librosa.resample(clean, orig_sr=rate, target_sr=22050)
             rate = 22050
         if not is_silent:
             clean *= target_rms / smoothed_maxes[i]
diff --git a/data_utils.py b/data_utils.py
index 6479e23..163edb5 100644
--- a/data_utils.py
+++ b/data_utils.py
@@ -17,7 +17,7 @@
 phoneme_inventory = ['aa','ae','ah','ao','aw','ax','axr','ay','b','ch','d','dh','dx','eh','el','em','en','er','ey','f','g','hh','hv','ih','iy','jh','k','l','m','n','nx','ng','ow','oy','p','r','s','sh','t','th','uh','uw','v','w','y','z','zh','sil']

 def normalize_volume(audio):
-    rms = librosa.feature.rms(audio)
+    rms = librosa.feature.rms(y=audio)
     max_rms = rms.max() + 0.01
     target_rms = 0.2
     audio = audio * (target_rms/max_rms)
@@ -44,7 +44,7 @@ def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin,
     global mel_basis, hann_window
     if fmax not in mel_basis:
-        mel = librosa.filters.mel(sampling_rate, n_fft, num_mels, fmin, fmax)
+        mel = librosa.filters.mel(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
         mel_basis[str(fmax)+'_'+str(y.device)] = torch.from_numpy(mel).float().to(y.device)
         hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)
@@ -72,7 +72,7 @@ def load_audio(filename, start=None, end=None, max_frames=None, renormalize_volu
     if renormalize_volume:
         audio = normalize_volume(audio)
     if r == 16000:
-        audio = librosa.resample(audio, 16000, 22050)
+        audio = librosa.resample(audio, orig_sr=16000, target_sr=22050)
     else:
         assert r == 22050
     audio = np.clip(audio, -1, 1) # because resampling sometimes pushes things out of range
@@ -99,9 +99,9 @@ def get_emg_features(emg_data, debug=False):
         r = np.abs(p)

         w_h = librosa.util.frame(w, frame_length=16, hop_length=6).mean(axis=0)
-        p_w = librosa.feature.rms(w, frame_length=16, hop_length=6, center=False)
+        p_w = librosa.feature.rms(y=w, frame_length=16, hop_length=6, center=False)
         p_w = np.squeeze(p_w, 0)
-        p_r = librosa.feature.rms(r, frame_length=16, hop_length=6, center=False)
+        p_r = librosa.feature.rms(y=r, frame_length=16, hop_length=6, center=False)
         p_r = np.squeeze(p_r, 0)
         z_p = librosa.feature.zero_crossing_rate(p, frame_length=16, hop_length=6, center=False)
         z_p = np.squeeze(z_p, 0)
diff --git a/environment.yml b/environment.yml
index 427a85d..1d54660 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,4 +1,4 @@
-name: dgaddy
+name: silent_speech
 channels:
   - conda-forge
   - pytorch
@@ -13,16 +13,17 @@ dependencies:
   - pytorch-cuda=11.8
   - libsndfile
   - absl-py
-  - numpy=1.23
-  - librosa=0.8.1
+  - numpy
+  - librosa
   - pysoundfile
   - matplotlib
   - scipy
   - numba
   - unidecode
+  - tqdm
   - pip
   - pip:
     - jiwer==2.2.1
     - deepspeech==0.9.3
     - praat-textgrids
-    - noisereduce==1.1.0
+    - noisereduce
diff --git a/evaluate.py b/evaluate.py
index e1fbb8f..4427c37 100644
--- a/evaluate.py
+++ b/evaluate.py
@@ -2,6 +2,8 @@
 import os
 import logging

+import tqdm
+
 import torch
 from torch import nn
@@ -56,7 +58,7 @@ def main():
     vocoder = Vocoder()

-    for i, datapoint in enumerate(testset):
+    for i, datapoint in enumerate(tqdm.tqdm(testset, 'Generate outputs', disable=None)):
         save_output(ensemble, datapoint, os.path.join(FLAGS.output_directory, f'example_output_{i}.wav'), device, testset.mfcc_norm, vocoder)

     evaluate(testset, FLAGS.output_directory)
diff --git a/make_vocoder_trainset.py b/make_vocoder_trainset.py
index 6554aa7..385fe69 100644
--- a/make_vocoder_trainset.py
+++ b/make_vocoder_trainset.py
@@ -38,7 +38,7 @@ def main():
         np.save(os.path.join(FLAGS.output_directory, 'mels', f'{name_prefix}_output_{i}.npy'), spec)
         audio, r = sf.read(datapoint['audio_file'])
         if r != 22050:
-            audio = librosa.resample(audio, r, 22050, res_type='kaiser_fast')
+            audio = librosa.resample(audio, orig_sr=r, target_sr=22050, res_type='kaiser_fast')
         audio = np.clip(audio, -1, 1) # because resampling sometimes pushes things out of range
         sf.write(os.path.join(FLAGS.output_directory, 'wavs', f'{name_prefix}_output_{i}.wav'), audio, 22050)
         filelist.write(f'{name_prefix}_output_{i}\n')
diff --git a/recognition_model.py b/recognition_model.py
index 2713697..81c0352 100644
--- a/recognition_model.py
+++ b/recognition_model.py
@@ -5,6 +5,7 @@
 import subprocess
 from ctcdecode import CTCBeamDecoder
 import jiwer
+import tqdm

 import torch
 from torch import nn
@@ -37,7 +38,7 @@ def test(model, testset, device):
     references = []
     predictions = []
     with torch.no_grad():
-        for example in dataloader:
+        for example in tqdm.tqdm(dataloader, 'Evaluate', disable=None):
             X = example['emg'].to(device)
             X_raw = example['raw_emg'].to(device)
             sess = example['session_ids'].to(device)
@@ -85,7 +86,7 @@ def schedule_lr(iteration):
     optim.zero_grad()
     for epoch_idx in range(n_epochs):
         losses = []
-        for example in dataloader:
+        for example in tqdm.tqdm(dataloader, 'Train step', disable=None):
             schedule_lr(batch_idx)

             X = combine_fixed_length(example['emg'], 200).to(device)
diff --git a/transduction_model.py b/transduction_model.py
index 4ae5746..96426e9 100755
--- a/transduction_model.py
+++ b/transduction_model.py
@@ -5,6 +5,7 @@
 import subprocess

 import soundfile as sf
+import tqdm

 import torch
 import torch.nn.functional as F
@@ -38,7 +39,7 @@ def test(model, testset, device):
     phoneme_confusion = np.zeros((len(phoneme_inventory),len(phoneme_inventory)))
     seq_len = 200
     with torch.no_grad():
-        for batch in dataloader:
+        for batch in tqdm.tqdm(dataloader, 'Validation', disable=None):
             X = combine_fixed_length([t.to(device, non_blocking=True) for t in batch['emg']], seq_len)
             X_raw = combine_fixed_length([t.to(device, non_blocking=True) for t in batch['raw_emg']], seq_len*8)
             sess = combine_fixed_length([t.to(device, non_blocking=True) for t in batch['session_ids']], seq_len)
@@ -56,9 +57,9 @@ def test(model, testset, device):
 def save_output(model, datapoint, filename, device, audio_normalizer, vocoder):
     model.eval()
     with torch.no_grad():
-        sess = torch.tensor(datapoint['session_ids'], device=device).unsqueeze(0)
-        X = torch.tensor(datapoint['emg'], dtype=torch.float32, device=device).unsqueeze(0)
-        X_raw = torch.tensor(datapoint['raw_emg'], dtype=torch.float32, device=device).unsqueeze(0)
+        sess = datapoint['session_ids'].to(device=device).unsqueeze(0)
+        X = datapoint['emg'].to(dtype=torch.float32, device=device).unsqueeze(0)
+        X_raw = datapoint['raw_emg'].to(dtype=torch.float32, device=device).unsqueeze(0)

         pred, _ = model(X, X_raw, sess)
         y = pred.squeeze(0)
@@ -192,7 +193,7 @@ def schedule_lr(iteration):
     batch_idx = 0
     for epoch_idx in range(n_epochs):
         losses = []
-        for batch in dataloader:
+        for batch in tqdm.tqdm(dataloader, 'Train step', disable=None):
             optim.zero_grad()
             schedule_lr(batch_idx)
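Note: the call-site changes above follow the keyword-argument conventions of newer librosa releases (0.10 and later, where the old positional forms were removed) and of noisereduce 2.x (which renamed `audio_clip`/`noise_clip` to `y`/`y_noise` and takes a sample rate). A minimal sketch of the updated call patterns, using synthetic audio and arbitrary illustrative parameter values rather than the repository's actual configuration, is:
```
import numpy as np
import librosa
import noisereduce as nr

# One second of synthetic audio at 16 kHz, just to exercise the calls.
rate = 16000
audio = (0.01 * np.random.randn(rate)).astype(np.float32)
noise = (0.001 * np.random.randn(rate)).astype(np.float32)

# librosa 0.10+ requires keyword arguments for these parameters.
resampled = librosa.resample(audio, orig_sr=rate, target_sr=22050)
frame_rms = librosa.feature.rms(y=audio, frame_length=16, hop_length=6, center=False)
mel_fb = librosa.filters.mel(sr=22050, n_fft=1024, n_mels=80, fmin=0, fmax=8000)

# noisereduce 2.x: y/y_noise keywords plus an explicit sample rate.
clean = nr.reduce_noise(y=audio, sr=rate, y_noise=noise, stationary=True)
```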