-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathhyperparameter_search_option1.py
86 lines (75 loc) · 3.23 KB
/
hyperparameter_search_option1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
from skopt import Optimizer
import subprocess
import time
import mpiLAPI
import numpy as np
import json
import base_model
def get_fom_from_json_file(fname):
with open(fname) as f:
hist = json.load(f)
fom = hist["history"]["0"]["val_loss"][-1]
print("fom: " + str(fom))
return float(fom)
def train_model_async(json_model, n_nodes=3, n_epochs=3, trial_id=0, get_output=True, model_name='cnn_arch'):
print("Starting MPI process asynchronously.")
mlapi = mpiLAPI.mpi_learn_api(model=json_model,
train_pattern='/bigdata/shared/LCDJets_Remake/train/04*.h5',
val_pattern='/bigdata/shared/LCDJets_Remake/val/020*.h5',
check_file=True,
nohash=True,
model_name=model_name
)
p = mlapi.train_async(get_output=get_output, N=n_nodes,
trial_name = str(trial_id),
features_name = 'Images',
labels_name = 'Labels',
batch = 4,
epoch = n_epochs,
verbose = True,
loss = 'categorical_crossentropy',
easgd = False,
early_stopping = 5
)
time.sleep(5) # don't overload the cluster with mpi_run calls
return p
def get_finished_process(proc_list):
print("waiting for process to stop training")
while True:
for run_id, params, proc in proc_list:
if proc.poll() == 0:
print("process finished: " + str(run_id))
running_proc.remove((run_id, params, proc))
return (run_id, params, proc)
elif proc.poll() != 0 and proc.poll() is not None:
print("process finished with errors: " + str(run_id))
running_proc.remove((run_id, params, proc))
time.sleep(10)
if __name__ == '__main__':
os.environ["CUDA_VISIBLE_DEVICES"]="0,3,4,5"
n_trials = 5
concurrent_train = 3
model = base_model.FunctionalModel()
param_grid = model.get_parameter_grid()
bayesian_opt = Optimizer(param_grid)
running_proc = []
run_id = 0
while run_id < n_trials:
#print("new run: " + str(run_id))
while len(running_proc) < concurrent_train:
print("new run: " + str(run_id))
suggested = bayesian_opt.ask()
print(suggested)
p_i = train_model_async(model.build(suggested), n_epochs=3, n_nodes=2, trial_id=run_id,
get_output=True, model_name=model.get_name())
print((run_id, suggested, p_i))
running_proc.append((run_id, suggested, p_i))
run_id += 1
# infinite loop until some configuration to finish
run_completed, par_i, proc_id = get_finished_process(running_proc)
running_proc.remove((run_completed, par_i, proc_id)) # remove completed process
fname = '_'.join([model.get_name(), str(run_completed), "history.json"])
y = get_fom_from_json_file(fname)
result = bayesian_opt.tell(suggested, y)
print('completed iteration:', run_completed, par_i, y)
print("Best parameters: " + str(result.x))