More version updates and cleanup.
dgaddy committed Dec 11, 2023
1 parent cd96f46 commit a89357c
Showing 10 changed files with 44 additions and 45 deletions.
30 changes: 11 additions & 19 deletions README.md
@@ -12,35 +12,27 @@ The repository also includes code for directly converting silent speech to text.

The EMG and audio data can be downloaded from <https://doi.org/10.5281/zenodo.4064408>. The scripts expect the data to be located in a `emg_data` subdirectory by default, but the location can be overridden with flags (see the top of `read_emg.py`).

Force-aligned phonemes from the Montreal Forced Aligner have been included in the git submodule.
By default, this data is expected to be in a subdirectory `text_alignments`.
Force-aligned phonemes from the Montreal Forced Aligner have been included as a git submodule, which must be updated using the process described in "Environment Setup" below.
Note that no exception is raised if the directory is not found, but logged phoneme prediction accuracies of 100% are a sign that the directory has not been loaded correctly.
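As a quick sanity check (a sketch only, assuming the archive unpacks to `.TextGrid` files somewhere under `text_alignments/`), you can verify that the alignments are visible before training:
```
import glob
import os

# Hypothetical check: the unpacked alignments are assumed to be .TextGrid
# files somewhere under the text_alignments/ directory.
alignment_files = glob.glob(os.path.join('text_alignments', '**', '*.TextGrid'), recursive=True)
if not alignment_files:
    print('WARNING: no alignment files found; phoneme accuracies of 100% are likely spurious')
else:
    print(f'found {len(alignment_files)} alignment files')
```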

## Environment Setup

This code requires Python 3.8 or later.
We strongly recommend running in a new Anaconda environment.

First, we will do some conda installs. Your environment must use CUDA 11.8 or later to support the RTX 4090.
We strongly recommend running in Anaconda.
To create a new environment with all required dependencies, run:
```
conda install pytorch==2.0 torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia
conda install libsndfile -c conda-forge
conda env create -f environment.yml
conda activate silent_speech
```
This will install with CUDA 11.8.

HiFi-GAN has been included as a git submodule (this replaces the WaveNet vocoder that was used in earlier versions).

The rest of the required packages can be installed with pip or conda.
You will also need to pull git submodules for Hifi-GAN and the phoneme alignment data, using the following commands:
```
conda install absl-py numpy=1.23 librosa=0.8.1 pysoundfile matplotlib scipy numba unidecode
pip install jiwer==2.2.1 deepspeech==0.9.3 praat-textgrids noisereduce==1.1.0
git submodule init
git submodule update
tar -xvzf text_alignments/text_alignments.tar.gz
```

librosa 0.9.0 and later no longer support positional arguments, which breaks the related function calls.
This version (0.8.1) is not compatible with numpy 1.24 or later.
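For illustration, a minimal sketch of the difference (the keyword-argument form is the one this commit switches to):
```
import numpy as np
import librosa

audio = np.zeros(16000, dtype=np.float32)  # one second of silence at 16 kHz

# librosa 0.8.x accepted positional sample rates:
#   librosa.resample(audio, 16000, 22050)
# newer librosa requires keyword arguments, as used throughout this commit:
resampled = librosa.resample(audio, orig_sr=16000, target_sr=22050)
rms = librosa.feature.rms(y=audio)[0]
```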

jiwer 2.3.0 and later do not allow empty strings (see the **compute_measures** method).
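If a newer jiwer must be used, one workaround is to drop empty-reference pairs before scoring; a minimal sketch (hypothetical filtering, not part of this repository):
```
import jiwer

references = ['this is a test', '', 'another utterance']
predictions = ['this is a test', 'uh', 'another utterance']

# jiwer 2.2.1 (pinned above) accepts empty reference strings; newer releases
# reject them, so filter such pairs out before computing word error rate.
pairs = [(r, p) for r, p in zip(references, predictions) if r.strip()]
filtered_refs, filtered_preds = (list(x) for x in zip(*pairs))

print('WER:', jiwer.wer(filtered_refs, filtered_preds))
```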

Download pre-trained DeepSpeech model files. It is important that you use DeepSpeech version 0.7.0 model files to maintain consistency of evaluation. Note that the DeepSpeech pip package we recommend is version 0.9.3 (which uses a more up-to-date CUDA), but this is compatible with version 0.7.x model files.
Use the following commands to download pre-trained DeepSpeech model files for evaluation. It is important that you use DeepSpeech version 0.7.0 model files for evaluation numbers to be consistent with the original papers. Note that more recent DeepSpeech packages such as version 0.9.3 can be used as long as they are compatible with version 0.7.x model files.
```
curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.7.0/deepspeech-0.7.0-models.pbmm
curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.7.0/deepspeech-0.7.0-models.scorer
2 changes: 1 addition & 1 deletion align.py
@@ -2,7 +2,7 @@
import matplotlib.pyplot as plt
from numba import jit

@jit
@jit(nopython=True)
def time_warp(costs):
dtw = np.zeros_like(costs)
dtw[0,1:] = np.inf
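For context on the `@jit(nopython=True)` change above: in recent numba releases a bare `@jit` warns about the deprecated object-mode fallback, while `nopython=True` (equivalent to `@numba.njit`) compiles the whole function and raises an error rather than silently falling back. A minimal sketch of the decorator in use:
```
import numpy as np
from numba import njit  # njit is shorthand for jit(nopython=True)

@njit
def smallest_cost(costs):
    # fully compiled; unsupported Python features would raise a TypingError
    best = costs[0]
    for c in costs[1:]:
        if c < best:
            best = c
    return best

print(smallest_cost(np.array([3.0, 1.0, 2.0])))
```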
5 changes: 3 additions & 2 deletions asr_evaluation.py
@@ -7,16 +7,17 @@
import numpy as np
from unidecode import unidecode
import librosa
import tqdm

def evaluate(testset, audio_directory):
model = deepspeech.Model('deepspeech-0.7.0-models.pbmm')
model.enableExternalScorer('deepspeech-0.7.0-models.scorer')
predictions = []
targets = []
for i, datapoint in enumerate(testset):
for i, datapoint in enumerate(tqdm.tqdm(testset, 'Evaluate outputs', disable=None)):
audio, rate = sf.read(os.path.join(audio_directory,f'example_output_{i}.wav'))
if rate != 16000:
audio = librosa.resample(audio, rate, 16000)
audio = librosa.resample(audio, orig_sr=rate, target_sr=16000)
assert model.sampleRate() == 16000, 'wrong sample rate'
audio_int16 = (audio*(2**15)).astype(np.int16)
text = model.stt(audio_int16)
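For context on the `tqdm.tqdm(..., disable=None)` pattern added throughout this commit: passing `disable=None` lets tqdm decide automatically, showing a progress bar on an interactive terminal and staying silent when output is not a TTY (for example, when a run is logged to a file). A minimal sketch:
```
import time
import tqdm

# disable=None: progress bar on an interactive terminal, silent when
# stdout is redirected (e.g. to a log file).
for _ in tqdm.tqdm(range(10), 'Example loop', disable=None):
    time.sleep(0.1)
```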
11 changes: 6 additions & 5 deletions data_collection/clean_audio.py
@@ -5,6 +5,7 @@
import noisereduce as nr
import soundfile as sf
import librosa
import tqdm

def clean_directory(directory):
silence, rate = sf.read(os.path.join(directory, '0_audio.flac'))
@@ -23,9 +24,9 @@ def clean_directory(directory):
assert len(audio_file_names) == len(all_audio_file_names), 'error discovering audio files'

all_rmses = []
for fname in audio_file_names:
for fname in tqdm.tqdm(audio_file_names, 'Read for calibration', disable=None):
data, rate = sf.read(fname)
rms = librosa.feature.rms(data)[0]
rms = librosa.feature.rms(y=data)[0]
all_rmses.append(rms)

silent_cutoff = 0.02
@@ -46,12 +47,12 @@ def clean_directory(directory):
if is_silent:
print('long run of quiet audio, skipping volume normalization')

for i, fname in enumerate(audio_file_names):
for i, fname in enumerate(tqdm.tqdm(audio_file_names, 'Clean data', disable=None)):
data, rate = sf.read(fname)

clean = nr.reduce_noise(audio_clip=data, noise_clip=silence)
clean = nr.reduce_noise(y=data, sr=rate, y_noise=silence, stationary=True)
if rate != 22050:
clean = librosa.resample(clean, rate, 22050)
clean = librosa.resample(clean, orig_sr=rate, target_sr=22050)
rate = 22050
if not is_silent:
clean *= target_rms / smoothed_maxes[i]
10 changes: 5 additions & 5 deletions data_utils.py
@@ -17,7 +17,7 @@
phoneme_inventory = ['aa','ae','ah','ao','aw','ax','axr','ay','b','ch','d','dh','dx','eh','el','em','en','er','ey','f','g','hh','hv','ih','iy','jh','k','l','m','n','nx','ng','ow','oy','p','r','s','sh','t','th','uh','uw','v','w','y','z','zh','sil']

def normalize_volume(audio):
rms = librosa.feature.rms(audio)
rms = librosa.feature.rms(y=audio)
max_rms = rms.max() + 0.01
target_rms = 0.2
audio = audio * (target_rms/max_rms)
@@ -44,7 +44,7 @@ def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin,

global mel_basis, hann_window
if fmax not in mel_basis:
mel = librosa.filters.mel(sampling_rate, n_fft, num_mels, fmin, fmax)
mel = librosa.filters.mel(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
mel_basis[str(fmax)+'_'+str(y.device)] = torch.from_numpy(mel).float().to(y.device)
hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)

@@ -72,7 +72,7 @@ def load_audio(filename, start=None, end=None, max_frames=None, renormalize_volu
if renormalize_volume:
audio = normalize_volume(audio)
if r == 16000:
audio = librosa.resample(audio, 16000, 22050)
audio = librosa.resample(audio, orig_sr=16000, target_sr=22050)
else:
assert r == 22050
audio = np.clip(audio, -1, 1) # because resampling sometimes pushes things out of range
@@ -99,9 +99,9 @@ def get_emg_features(emg_data, debug=False):
r = np.abs(p)

w_h = librosa.util.frame(w, frame_length=16, hop_length=6).mean(axis=0)
p_w = librosa.feature.rms(w, frame_length=16, hop_length=6, center=False)
p_w = librosa.feature.rms(y=w, frame_length=16, hop_length=6, center=False)
p_w = np.squeeze(p_w, 0)
p_r = librosa.feature.rms(r, frame_length=16, hop_length=6, center=False)
p_r = librosa.feature.rms(y=r, frame_length=16, hop_length=6, center=False)
p_r = np.squeeze(p_r, 0)
z_p = librosa.feature.zero_crossing_rate(p, frame_length=16, hop_length=6, center=False)
z_p = np.squeeze(z_p, 0)
9 changes: 5 additions & 4 deletions environment.yml
@@ -1,4 +1,4 @@
name: dgaddy
name: silent_speech
channels:
- conda-forge
- pytorch
@@ -13,16 +13,17 @@ dependencies:
- pytorch-cuda=11.8
- libsndfile
- absl-py
- numpy=1.23
- librosa=0.8.1
- numpy
- librosa
- pysoundfile
- matplotlib
- scipy
- numba
- unidecode
- tqdm
- pip
- pip:
- jiwer==2.2.1
- deepspeech==0.9.3
- praat-textgrids
- noisereduce==1.1.0
- noisereduce
4 changes: 3 additions & 1 deletion evaluate.py
@@ -2,6 +2,8 @@
import os
import logging

import tqdm

import torch
from torch import nn

@@ -56,7 +58,7 @@ def main():

vocoder = Vocoder()

for i, datapoint in enumerate(testset):
for i, datapoint in enumerate(tqdm.tqdm(testset, 'Generate outputs', disable=None)):
save_output(ensemble, datapoint, os.path.join(FLAGS.output_directory, f'example_output_{i}.wav'), device, testset.mfcc_norm, vocoder)

evaluate(testset, FLAGS.output_directory)
2 changes: 1 addition & 1 deletion make_vocoder_trainset.py
@@ -38,7 +38,7 @@ def main():
np.save(os.path.join(FLAGS.output_directory, 'mels', f'{name_prefix}_output_{i}.npy'), spec)
audio, r = sf.read(datapoint['audio_file'])
if r != 22050:
audio = librosa.resample(audio, r, 22050, res_type='kaiser_fast')
audio = librosa.resample(audio, orig_sr=r, target_sr=22050, res_type='kaiser_fast')
audio = np.clip(audio, -1, 1) # because resampling sometimes pushes things out of range
sf.write(os.path.join(FLAGS.output_directory, 'wavs', f'{name_prefix}_output_{i}.wav'), audio, 22050)
filelist.write(f'{name_prefix}_output_{i}\n')
5 changes: 3 additions & 2 deletions recognition_model.py
@@ -5,6 +5,7 @@
import subprocess
from ctcdecode import CTCBeamDecoder
import jiwer
import tqdm

import torch
from torch import nn
@@ -37,7 +38,7 @@ def test(model, testset, device):
references = []
predictions = []
with torch.no_grad():
for example in dataloader:
for example in tqdm.tqdm(dataloader, 'Evaluate', disable=None):
X = example['emg'].to(device)
X_raw = example['raw_emg'].to(device)
sess = example['session_ids'].to(device)
@@ -85,7 +86,7 @@ def schedule_lr(iteration):
optim.zero_grad()
for epoch_idx in range(n_epochs):
losses = []
for example in dataloader:
for example in tqdm.tqdm(dataloader, 'Train step', disable=None):
schedule_lr(batch_idx)

X = combine_fixed_length(example['emg'], 200).to(device)
11 changes: 6 additions & 5 deletions transduction_model.py
@@ -5,6 +5,7 @@
import subprocess

import soundfile as sf
import tqdm

import torch
import torch.nn.functional as F
@@ -38,7 +39,7 @@ def test(model, testset, device):
phoneme_confusion = np.zeros((len(phoneme_inventory),len(phoneme_inventory)))
seq_len = 200
with torch.no_grad():
for batch in dataloader:
for batch in tqdm.tqdm(dataloader, 'Validation', disable=None):
X = combine_fixed_length([t.to(device, non_blocking=True) for t in batch['emg']], seq_len)
X_raw = combine_fixed_length([t.to(device, non_blocking=True) for t in batch['raw_emg']], seq_len*8)
sess = combine_fixed_length([t.to(device, non_blocking=True) for t in batch['session_ids']], seq_len)
@@ -56,9 +57,9 @@ def save_output(model, datapoint, filename, device, audio_normalizer, vocoder):
def save_output(model, datapoint, filename, device, audio_normalizer, vocoder):
model.eval()
with torch.no_grad():
sess = torch.tensor(datapoint['session_ids'], device=device).unsqueeze(0)
X = torch.tensor(datapoint['emg'], dtype=torch.float32, device=device).unsqueeze(0)
X_raw = torch.tensor(datapoint['raw_emg'], dtype=torch.float32, device=device).unsqueeze(0)
sess = datapoint['session_ids'].to(device=device).unsqueeze(0)
X = datapoint['emg'].to(dtype=torch.float32, device=device).unsqueeze(0)
X_raw = datapoint['raw_emg'].to(dtype=torch.float32, device=device).unsqueeze(0)

pred, _ = model(X, X_raw, sess)
y = pred.squeeze(0)
@@ -192,7 +193,7 @@ def schedule_lr(iteration):
batch_idx = 0
for epoch_idx in range(n_epochs):
losses = []
for batch in dataloader:
for batch in tqdm.tqdm(dataloader, 'Train step', disable=None):
optim.zero_grad()
schedule_lr(batch_idx)

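For context on the `save_output` change above: the datapoint fields are presumably already torch tensors, and wrapping an existing tensor in `torch.tensor(...)` copies it and emits a UserWarning recommending `clone().detach()`; `.to(...)` moves or casts it without that warning. A minimal sketch (standalone illustration, not code from this repository):
```
import torch

existing = torch.zeros(4)

# torch.tensor(existing) still works, but copies the data and warns that
# sourceTensor.clone().detach() (or .to(...)) should be used instead.
moved = existing.to(dtype=torch.float32, device='cpu').unsqueeze(0)
print(moved.shape)  # torch.Size([1, 4])
```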
