-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimplement_model.py
151 lines (112 loc) · 5.37 KB
/
implement_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import os
import csv
import datetime
import numpy as np
import sounddevice as sd
import soundfile as sf
import librosa
from keras.models import load_model
import feature_extraction_scripts.feature_extraction_functions as featfun
import feature_extraction_scripts.prep_noise as pn
def get_date():
    """Return the current local time as a ``<day>d<hour>h<minute>m<second>s`` tag.

    The fields are taken from ``datetime.datetime.now()`` and are not
    zero-padded; month and year are not included.
    """
    now = datetime.datetime.now()
    fields = (now.day, now.hour, now.minute, now.second)
    return "{}d{}h{}m{}s".format(*fields)
def record_sound(sec, message, sr=16000):
    """Record a single-channel sound with ``sounddevice`` and return it.

    Args:
        sec: Length of the recording in seconds.
        message: Text printed before recording starts; the suffix
            " for <sec> seconds.." is appended to it.
        sr: Sampling rate in Hz. Defaults to 16000, the value that was
            previously hard-coded, so existing two-argument calls behave
            exactly as before.

    Returns:
        A ``(sound, sr)`` tuple: the array produced by ``sd.rec`` and the
        sampling rate that was used for the recording.
    """
    print(message + " for {} seconds..".format(sec))
    num_frames = int(sec * sr)
    sound = sd.rec(num_frames, samplerate=sr, channels=1)
    # Block until the recording has finished before handing the buffer back.
    sd.wait()
    return sound, sr
def str2bool(bool_string):
    """Interpret a settings-file string as a boolean.

    Only the exact text ``"True"`` yields ``True``; any other value
    (``"False"``, ``"true"``, an empty string, ...) yields ``False``.
    """
    return bool_string == "True"
def _read_csv_dict(csv_path):
    """Load a two-column CSV file into a ``{first column: second column}`` dict."""
    # newline='' is what the csv module asks for when opening files to read.
    with open(csv_path, mode='r', newline='') as infile:
        # Rows with fewer than two fields (e.g. a blank trailing line) are
        # skipped instead of raising IndexError.
        return {row[0]: row[1] for row in csv.reader(infile) if len(row) > 1}


def _reshape_for_model(features, model_type, timesteps, frame_width):
    """Reshape the extracted feature matrix into the layout used for ``model_type``.

    ``features`` is indexed as a 2-D array (rows x feature columns). Unknown
    model types are passed through unchanged.
    """
    num_columns = features.shape[1]
    if model_type == "lstm":
        # NOTE(review): unlike the two branches below, no leading axis of
        # size 1 is added here, so ``prediction[0]`` later refers to the first
        # of ``timesteps`` rows -- confirm this matches how the LSTM was trained.
        return features.reshape((timesteps, frame_width, num_columns))
    if model_type == "cnn":
        # Trailing channel axis plus a leading axis for the single sample.
        return features.reshape((1, features.shape[0], num_columns, 1))
    if model_type == "cnnlstm":
        return features.reshape((1, timesteps, frame_width, num_columns, 1))
    return features


def _predict_label(model, features, model_type, timesteps, frame_width, labels):
    """Run ``model`` on ``features`` and return the label of the highest-scoring class."""
    X = _reshape_for_model(features, model_type, timesteps, frame_width)
    prediction = model.predict(X)
    # The label file is keyed by the class index stored as a string.
    return labels[str(np.argmax(prediction[0]))]


def main(project_head_folder,model_name):
    """Record a spoken word and print the label a trained model assigns to it.

    Files read from ``./ml_speech_projects/<project_head_folder>``:
      * ``features_log.csv``          - feature extraction settings
      * ``labels_encoded.csv``        - class index -> label
      * ``model_logs/<model_name>.csv`` - model settings ("model type")
      * ``models/<model_name>.h5``    - the saved Keras model

    Four seconds of background noise and four seconds of speech are recorded
    and written to the project's ``recordings`` folder, together with a copy
    of the speech that has been passed through ``pn.rednoise``. The model is
    then applied to the features of both the raw and the processed speech and
    the two resulting labels are printed.

    Args:
        project_head_folder: Name of the project folder inside
            ``./ml_speech_projects/``.
        model_name: File stem of the saved model and of its log file.

    Returns:
        None.
    """
    head_folder_curr_project = "./ml_speech_projects/" + project_head_folder

    #locations of the information related to features and model of interest
    features_info_path = head_folder_curr_project + "/features_log.csv"
    encoded_label_path = head_folder_curr_project + "/labels_encoded.csv"
    model_path = head_folder_curr_project + "/models/{}.h5".format(model_name)
    model_log_path = head_folder_curr_project + "/model_logs/{}.csv".format(model_name)

    #find out the settings for feature extraction
    feats_dict = _read_csv_dict(features_info_path)
    feature_type = feats_dict['features']
    num_filters = int(feats_dict['num original features'])
    num_feature_columns = int(feats_dict['num total features'])
    delta = str2bool(feats_dict["delta"])
    dom_freq = str2bool(feats_dict["dominant frequency"])
    vad = str2bool(feats_dict["beginning silence removal"])
    timesteps = int(feats_dict['timesteps'])
    context_window = int(feats_dict['context window'])
    frame_width = context_window * 2 + 1

    #prepare the dictionary to find out the assigned label
    dict_labels_encoded = _read_csv_dict(encoded_label_path)

    #find out which model type we are dealing with; read before recording so
    #that a wrong model name fails before the user is asked to speak
    model_type = _read_csv_dict(model_log_path)["model type"]

    print("\nAvailable labels:")
    for value in dict_labels_encoded.values():
        print(value)

    #collect new speech
    noise, sr = record_sound(4, "Recording background noise")
    speech, sr = record_sound(4, "Please say *loud and clear* one of the target words. \nRecording")

    #save sound
    recording_folder = "{}/recordings".format(head_folder_curr_project)
    os.makedirs(recording_folder, exist_ok=True)

    timestamp = get_date()
    noise_filename = "{}/noise_{}.wav".format(recording_folder, timestamp)
    sf.write(noise_filename, noise, sr)
    speech_filename = "{}/speech_{}.wav".format(recording_folder, timestamp)
    sf.write(speech_filename, speech, sr)

    #reload both recordings from disk and build the noise-processed version
    y_speech, sr = librosa.load(speech_filename, sr=sr)
    y_noise, sr = librosa.load(noise_filename, sr=sr)
    speech_rd = pn.rednoise(y_speech, y_noise, sr)
    speech_rd_filename = "{}/speech_noisereduced_{}.wav".format(recording_folder, timestamp)
    sf.write(speech_rd_filename, speech_rd, sr)

    #extract the same features from the raw and the processed recording
    features = featfun.coll_feats_manage_timestep(timesteps,frame_width,speech_filename,feature_type,num_filters,num_feature_columns,recording_folder,delta=delta,dom_freq=dom_freq,noise_wavefile=None,vad=vad)
    features2 = featfun.coll_feats_manage_timestep(timesteps,frame_width,speech_rd_filename,feature_type,num_filters,num_feature_columns,recording_folder,delta=delta,dom_freq=dom_freq,noise_wavefile=None,vad=vad)

    #load model
    model = load_model(model_path)

    label = _predict_label(model, features, model_type, timesteps, frame_width, dict_labels_encoded)
    print("Label without noise reduction: {}".format(label))

    label = _predict_label(model, features2, model_type, timesteps, frame_width, dict_labels_encoded)
    print("Label with noise reduction: {}".format(label))

    return None
if __name__ == "__main__":
    # Project folder (inside ./ml_speech_projects/) and the name of the saved
    # model within it that should be applied to the new recording.
    project_head_folder, model_name = (
        "fbank_models_4d13h39m31s",
        "CNNLSTM_speech_commands_4d13h41m48s",
    )
    main(project_head_folder, model_name)