from pathlib import Path
from typing import Optional, Union
import struct

import librosa
import numpy as np
import soundfile as sf
import webrtcvad
# scipy.ndimage.morphology is deprecated; binary_dilation lives in scipy.ndimage
from scipy.ndimage import binary_dilation

# Note: sf.default_subtype('WAV') only *queries* the default subtype ('PCM_16')
# and has no side effect, so the call was dropped; the subtype is now passed
# explicitly in save_wav below.

from hparams import *

int16_max = (2 ** 15) - 1
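# The wildcard import above is expected to provide at least: sampling_rate,
# audio_norm_target_dBFS, mel_window_length, mel_window_step, mel_n_channels,
# vad_window_length, vad_moving_average_width and vad_max_silence_length
# (inferred from their uses below; the wildcard hides the exact list).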
def load_wav(fpath):
    """
    Load a wav file from disk.
    :param fpath: path to the wav file
    :return: (waveform, source sample rate)
    """
    wav, source_sr = librosa.load(fpath, sr=None)
    return wav, source_sr
def save_wav(wav, sample_rate, save_path):
    """
    Save a waveform to disk as 16-bit PCM, for example:
        y, sr = librosa.load(librosa.util.example_audio_file(), duration=5.0)
        save_wav(y, sr, 'file_trim_5s.wav')
    """
    # Note: librosa.output.write_wav (removed in librosa 0.8) wrote the array's
    # own dtype, so float32 input produced a 32-bit file -- which is why
    # `sox --i` reported a changed bit depth. Writing with soundfile and an
    # explicit subtype keeps the output at 16-bit PCM.
    sf.write(save_path, wav, sample_rate, subtype='PCM_16')
def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
                   source_sr: Optional[int] = None):
    """
    Applies the preprocessing operations used in training the Speaker Encoder to a waveform
    either on disk or in memory. The waveform will be resampled to match the data hyperparameters.
    :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
    just .wav) or the waveform as a numpy array of floats.
    :param source_sr: if passing an audio waveform, the sampling rate of the waveform before
    preprocessing. After preprocessing, the waveform's sampling rate will match the data
    hyperparameters. If passing a filepath, the sampling rate will be detected automatically and
    this argument will be ignored.
    """
    # Load the wav from disk if needed
    if isinstance(fpath_or_wav, (str, Path)):
        wav, source_sr = librosa.load(fpath_or_wav, sr=None)
    else:
        wav = fpath_or_wav

    # Resample the wav if needed
    if source_sr is not None and source_sr != sampling_rate:
        wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate)

    # Apply the preprocessing: normalize volume and shorten long silences
    wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
    wav = trim_long_silences(wav)
    return wav
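# A minimal usage sketch (the file name is illustrative; any format librosa can
# read will work):
#
#     wav = preprocess_wav("speech.flac")
#     # `wav` is a float array at `sampling_rate`, volume-normalized, with long
#     # silences removed.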
def wav_to_mel_spectrogram(wav, sampling_rate):
    """
    Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
    Note: this is not a log-mel spectrogram.
    """
    frames = librosa.feature.melspectrogram(
        y=wav,
        sr=sampling_rate,
        n_fft=int(sampling_rate * mel_window_length / 1000),
        hop_length=int(sampling_rate * mel_window_step / 1000),
        n_mels=mel_n_channels
    )
    return frames.astype(np.float32).T
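# For example, at sampling_rate = 16000 with mel_window_length = 25 ms and
# mel_window_step = 10 ms (typical values; the real ones come from hparams.py),
# n_fft = 400 and hop_length = 160, so one second of audio yields roughly
# 100 frames and an output of shape (~100, mel_n_channels).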
def trim_long_silences(wav):
    """
    Ensures that segments without voice in the waveform remain no longer than a
    threshold determined by the VAD parameters in hparams.py.
    :param wav: the raw waveform as a numpy array of floats
    :return: the same waveform with silences trimmed away (length <= original wav length)
    """
    # Compute the voice detection window size
    samples_per_window = (vad_window_length * sampling_rate) // 1000

    # Trim the end of the audio to have a multiple of the window size
    wav = wav[:len(wav) - (len(wav) % samples_per_window)]

    # Convert the float waveform to 16-bit mono PCM; webrtcvad expects 16-bit
    # mono PCM at 8/16/32/48 kHz in 10/20/30 ms windows
    pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))

    # Perform voice activation detection
    voice_flags = []
    vad = webrtcvad.Vad(mode=3)
    for window_start in range(0, len(wav), samples_per_window):
        window_end = window_start + samples_per_window
        voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
                                         sample_rate=sampling_rate))
    voice_flags = np.array(voice_flags)

    # Smooth the voice detection with a moving average
    def moving_average(array, width):
        array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
        ret = np.cumsum(array_padded, dtype=float)
        ret[width:] = ret[width:] - ret[:-width]
        return ret[width - 1:] / width

    audio_mask = moving_average(voice_flags, vad_moving_average_width)
    audio_mask = np.round(audio_mask).astype(bool)  # np.bool was removed in numpy 1.24

    # Dilate the voiced regions
    audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
    audio_mask = np.repeat(audio_mask, samples_per_window)
    return wav[audio_mask]
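# For instance, with vad_window_length = 30 ms and vad_max_silence_length = 6
# (illustrative values; the real ones come from hparams.py), the dilation above
# lets voiced regions absorb up to 6 neighbouring silent windows (~180 ms), so
# short natural pauses survive while longer gaps are cut away.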
def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
    """
    Rescales a waveform so that its RMS level matches target_dBFS (dB relative
    to full scale). With increase_only / decrease_only, the gain is only applied
    in the requested direction.
    """
    if increase_only and decrease_only:
        raise ValueError("Both increase only and decrease only are set")
    rms = np.sqrt(np.mean((wav * int16_max) ** 2))
    wave_dBFS = 20 * np.log10(rms / int16_max)
    dBFS_change = target_dBFS - wave_dBFS
    if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
        return wav
    return wav * (10 ** (dBFS_change / 20))
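# Worked example (illustrative numbers): a waveform with float RMS 0.01 sits at
# 20 * log10(0.01) = -40 dBFS; normalizing to a target of -30 dBFS applies a
# gain of 10 ** (10 / 20) ≈ 3.16x.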
def audio_vad_segment(fpath_or_wav, source_sr=None):
    """
    Splits the audio into voiced segments based on silence. Note: the saved
    segments used to come out at 24-bit while `sox --i` on Linux showed the
    original audio was 16-bit; saving with an explicit PCM_16 subtype (as
    save_wav does above) avoids that mismatch.
    :param fpath_or_wav: a filepath to an audio file or the waveform as a numpy
    array of floats
    :param source_sr: sampling rate of the waveform if one is passed directly
    :return: a list of voiced waveform segments
    """
    # Load the wav from disk if needed
    if isinstance(fpath_or_wav, (str, Path)):
        wav, source_sr = librosa.load(fpath_or_wav, sr=None)
    else:
        wav = fpath_or_wav

    # Resample the wav if needed
    if source_sr is not None and source_sr != sampling_rate:
        wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate)

    # Apply the preprocessing: normalize volume
    wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)

    # Compute the voice detection window size
    samples_per_window = (vad_window_length * sampling_rate) // 1000

    # Trim the end of the audio to have a multiple of the window size
    wav = wav[:len(wav) - (len(wav) % samples_per_window)]

    # Convert the float waveform to 16-bit mono PCM
    pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))

    # Perform voice activation detection
    voice_flags = []
    vad = webrtcvad.Vad(mode=3)
    for window_start in range(0, len(wav), samples_per_window):
        window_end = window_start + samples_per_window
        voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
                                         sample_rate=sampling_rate))
    voice_flags = np.array(voice_flags)

    # Smooth the voice detection with a moving average
    def moving_average(array, width):
        array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
        ret = np.cumsum(array_padded, dtype=float)
        ret[width:] = ret[width:] - ret[:-width]
        return ret[width - 1:] / width

    audio_mask = moving_average(voice_flags, vad_moving_average_width)
    audio_mask = np.round(audio_mask).astype(bool)  # np.bool was removed in numpy 1.24

    # Dilate the voiced regions and expand the mask back to sample resolution
    audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
    audio_mask = np.repeat(audio_mask, samples_per_window)
    # Walk the sample-level mask and cut out each contiguous voiced region
    left_start = 0
    right_end = 0
    up_flag = False  # True while inside a voiced region
    segment_wavs_cl = []
    for sample_id in range(len(wav)):
        if audio_mask[sample_id] and not up_flag:
            left_start = sample_id
            up_flag = True
        if audio_mask[sample_id]:
            right_end = sample_id
        if not audio_mask[sample_id] and left_start < right_end:
            segment_wavs_cl.append(wav[left_start:right_end])
            right_end = left_start
            up_flag = False
    # Don't drop a segment that runs to the very end of the audio
    if up_flag and left_start < right_end:
        segment_wavs_cl.append(wav[left_start:right_end])
    return segment_wavs_cl
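

# A hedged end-to-end sketch of how this module might be driven. The file name
# "sample.wav" is illustrative only; any audio file librosa can read works.
if __name__ == "__main__":
    wav = preprocess_wav("sample.wav")
    mel = wav_to_mel_spectrogram(wav, sampling_rate)
    print("preprocessed samples:", len(wav))
    print("mel frames x channels:", mel.shape)

    # Cut the same file into voiced segments and write each out as 16-bit PCM
    for i, seg in enumerate(audio_vad_segment("sample.wav")):
        save_wav(seg, sampling_rate, "segment_%02d.wav" % i)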