'''
This script implements three variants of the REINFORCE algorithm: "total",
"without", and "with".

total   - uses the total episode reward as the advantage that guides the
          policy network.
without - uses the total future reward (reward-to-go) from each state to the
          terminal state as the advantage.
with    - same as "without", but subtracts a baseline: the average total
          reward of the episode so far.

Parameters to tweak: environment, variant, n_episodes, alpha, gamma, the step
threshold for the MountainCar environment, the Save flag (saves the plots,
weights and rewards/scores), and layers (at most 2 layers): "1_8" for a single
layer of 8 units, "2_16_16" for two layers of 16 units each.
You can also load a model with --load, but make sure the layers parameter
matches the saved network.
'''
import argparse
import gym
import matplotlib.pyplot as plt
import numpy as np
from reignforce import Agent
import pandas as pd
import os
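
# For reference, a minimal sketch of how the three variants named in the
# docstring could weight each step of an episode. This helper is an
# assumption for illustration only and is never called; the actual
# computation lives in reignforce.Agent.learn.
def _reference_weights(rewards, variant, gamma=0.99):
    """Per-step weights G_t for 'total', 'without', and 'with' (hypothetical helper)."""
    T = len(rewards)
    if variant == "total":
        # 'total': every step is weighted by the full discounted episode return
        G = sum(gamma ** t * r for t, r in enumerate(rewards))
        return [G] * T
    # Discounted reward-to-go from step t to the terminal state
    to_go, G = [0.0] * T, 0.0
    for t in reversed(range(T)):
        G = rewards[t] + gamma * G
        to_go[t] = G
    if variant == "without":
        return to_go
    # 'with': subtract a baseline; the mean reward-to-go is one common choice
    baseline = sum(to_go) / T
    return [g - baseline for g in to_go]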
### Calculation of mean and standard deviation, and plotting
def plot(episode_rewards, policy, label, alpha, gamma, plot_path):
    plt.figure()
    plt.suptitle(policy)
    plt.title(environment + r", $\alpha$ = " + str(alpha) + r", $\gamma$ = " + str(gamma))
    plt.plot(range(len(episode_rewards)), episode_rewards, '.-', label=label)
    plt.xlabel('Number of Episodes')
    plt.ylabel('Rewards')
    plt.legend()
    plt.savefig(plot_path + "/" + policy + "Reward.png")
    plt.figure()
    plt.suptitle(policy)
    z1 = pd.Series(episode_rewards).rolling(50).mean()
    plt.title(environment + r", $\alpha$ = " + str(alpha) + r", $\gamma$ = " + str(gamma)
              + ", Best average reward: " + str(np.max(z1)))
    plt.plot(z1, label=label)
    plt.xlabel('Number of Episodes')
    plt.ylabel('Average reward over previous 50 episodes')
    plt.legend()
    plt.savefig(plot_path + "/" + policy + "cumulative.png")
    # plt.show()
def policy_sampling(env, actor, label, alpha, gamma, plot_path, ep=1000):
    """Evaluate the learned policy by sampling actions from its distribution."""
    score_history = []
    n_episodes = ep
    for i in range(n_episodes):
        done = False
        score = 0
        observation = env.reset()
        while not done:
            action = actor.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            observation = observation_
            score += reward
        score_history.append(score)
        print('episode ', i, 'score %.1f' % score,
              'average_score %.1f' % np.mean(score_history[-50:]))
    plot(score_history, "Sampling_Policy", label, alpha, gamma, plot_path)
    return [np.mean(score_history), np.std(score_history)]
def policy_max(env, actor, label, alpha, gamma, plot_path, ep=1000):
    """Evaluate the learned policy by always taking the most probable action."""
    score_history = []
    n_episodes = ep
    for i in range(n_episodes):
        done = False
        score = 0
        observation = env.reset()
        while not done:
            # Get the probability of each action from the policy network
            action_prob = actor.policy.predict(observation[np.newaxis, :])
            # Take the action with the highest probability
            action = np.argmax(action_prob)
            observation_, reward, done, info = env.step(action)
            observation = observation_
            score += reward
        score_history.append(score)
        print('episode ', i, 'score %.1f' % score,
              'average_score %.1f' % np.mean(score_history[-50:]))
    plot(score_history, "Max_Policy", label, alpha, gamma, plot_path)
    return [np.mean(score_history), np.std(score_history)]
#######################################################################
# Generate a video of one episode
def Generate_episode_sampling(env, actor, path):
    path = path + "/sampling/"
    if not os.path.exists(path):
        os.makedirs(path)
    # Record only the first episode
    env = gym.wrappers.Monitor(env, path, video_callable=lambda episode_id: episode_id == 0, force=True)
    done = False
    observation = env.reset()
    while not done:
        action = actor.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        observation = observation_
def Generate_episode_max(env, actor, path):
    path = path + "/max/"
    if not os.path.exists(path):
        os.makedirs(path)
    env = gym.wrappers.Monitor(env, path, video_callable=lambda episode_id: episode_id == 0, force=True)
    done = False
    observation = env.reset()
    while not done:
        # Get the probability of each action from the policy network
        action_prob = actor.policy.predict(observation[np.newaxis, :])
        # Take the action with the highest probability
        action = np.argmax(action_prob)
        observation_, reward, done, info = env.step(action)
        observation = observation_
##########################################
#Plot training
##########################################
def plotLearning(title, alpha, gamma, loss, episode_rewards, path, label, save):
    plt.figure()
    plt.suptitle(label + " - " + environment)
    plt.title(r"$\alpha$ = " + alpha + r", $\gamma$ = " + gamma)
    plt.plot(range(len(episode_rewards)), episode_rewards, '.-', label=label)
    plt.xlabel('Number of Episodes')
    plt.ylabel('Total Rewards')
    plt.legend()
    if save:
        plt.savefig(path + "/Reward.png")
    plt.figure()
    plt.suptitle(label + " - " + environment)
    plt.title(r"$\alpha$ = " + alpha + r", $\gamma$ = " + gamma)
    plt.plot(range(len(loss)), loss, '.-', label=label)
    plt.xlabel('Number of Episodes')
    plt.ylabel('Loss in each episode')
    plt.legend()
    if save:
        plt.savefig(path + "/loss.png")
    plt.figure()
    plt.suptitle(label + " - " + environment)
    z1 = pd.Series(episode_rewards).rolling(50).mean()
    plt.title(r"$\alpha$ = " + alpha + r", $\gamma$ = " + gamma + ", Best average reward: " + str(np.max(z1)))
    plt.plot(z1, label=label)
    plt.xlabel('Number of Episodes')
    plt.ylabel('Average reward over past 50 episodes')
    plt.legend()
    if save:
        plt.savefig(path + "/cumulative.png")
    plt.show()
###########################################
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--load", default="", type=str,
                        help="Path to a .h5 file of weights for the policy network")
    args = parser.parse_args()
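    # Example invocation (the path below is hypothetical; it follows the
    # environment/folder/variant_gamma_alpha_layers layout built further down):
    #   python main_reinforce.py --load CartPole/Reinforce1/with_0.99_0.001_0/final.h5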
    # Whether to save the output
    Save = True
    variants = ["total", "without", "with"]
    # Environment details
    envs = {'CartPole': 'CartPole-v0',
            'Acrobot': 'Acrobot-v1',
            'LunarLander': 'LunarLander-v2',
            'MountainCar': 'MountainCar-v0'}
    environment = "CartPole"
    folder = "Reinforce1"
    variant = "with"
    if variant not in variants:
        print("This variant doesn't exist")
        exit()
    if environment in envs:
        opengym_env = envs[environment]
    else:
        print("Please provide a valid environment")
        exit()
    # Build the environment
    env = gym.make(opengym_env)
    if environment == "MountainCar":
        # Unwrap to remove the default time limit; a custom threshold is enforced below
        env = gym.make(opengym_env).env
    n_actions = env.action_space.n
    n_states = len(env.observation_space.low)
    ##############################
    # Policy Network
    ##############################
    alpha = 0.001
    layers = "0"
    gamma = 0.99
    h1_actor = 0
    h2_actor = 0
    if layers != '0':
        temp = layers.rsplit("_")
        if temp[0] == '1':
            h1_actor = int(temp[1])
        elif temp[0] == '2':
            h1_actor = int(temp[1])
            h2_actor = int(temp[2])
        else:
            print("Wrong actor network config, or more than 2 hidden layers")
    actor = Agent(ALPHA=alpha, input_dims=n_states, Gamma=gamma,
                  n_actions=n_actions, layer1_size=h1_actor, layer2_size=h2_actor)
    if args.load != "":
        print("---Loading weights---")
        actor.load_weights(args.load)
    # Number of trajectories (episodes) to run
    n_episodes = 10
    save_point = n_episodes // 2
    ###########################
    # Used for MountainCar: cap episode length and track the best score
    ###########################
    threshold = 500
    best_score = -threshold
    optimal_episode = 0
    #############################################################################
    path = environment + "/" + folder + "/" + variant + "_" + str(gamma) + "_" + str(alpha) + "_" + layers
    if Save:
        if not os.path.exists(path):
            os.makedirs(path)
        actor.save_model(path + "/initial.h5")
    # Store total reward history and loss
    score_history = []
    loss = []
    # Loop over trajectories
    for i in range(n_episodes):
        done = False
        score = 0
        # Set initial state
        observation = env.reset()
        count = 0
        while not done:
            count += 1
            # Choose action
            action = actor.choose_action(observation)
            # Perform action
            observation_, reward, done, info = env.step(action)
            # Store the transition for learning at the end of the episode
            actor.store_transition(observation, action, reward)
            observation = observation_
            score += reward
            if environment == "MountainCar" and count == threshold:
                break
        score_history.append(score)
        # Save at the midpoint of training
        if i == save_point and Save:
            actor.save_model(path + "/middle.h5")
        # Save the model with the best score so far
        if best_score <= score:
            best_score = score
            optimal_episode = i
            if Save:
                actor.save_model(path + "/optimal.h5")
                np.save(path + "/score", score_history)
        # Learn the policy from the stored episode
        loss.append(actor.learn(variant))
        print('episode ', i, 'loss %.1f' % loss[i], 'score %.1f' % score,
              'average_score %.1f' % np.mean(score_history[-100:]))
    if Save:
        actor.save_model(path + "/final.h5")
        np.save(path + "/score", score_history)
    plotLearning(environment, str(alpha), str(gamma), loss, score_history,
                 path=path, label=variant, save=Save)
    print(best_score, optimal_episode)
    if Save:
        # Evaluate both the final and the optimal checkpoints
        checkpoints = ["final", "optimal"]
        for checkpoint in checkpoints:
            # Weights path
            weightPath = path + "/" + checkpoint + ".h5"
            # Generate videos
            video_path = path + "/" + checkpoint + "/video"
            if not os.path.exists(video_path):
                os.makedirs(video_path)
            # Load weights
            actor.policy.load_weights(weightPath)
            print("--------Max Policy-------")
            Generate_episode_max(env, actor, video_path)
            print("--------Sampling Policy-------")
            Generate_episode_sampling(env, actor, video_path)
            # Gather statistics
            meanStd_path = path + "/" + checkpoint + "/meanstd"
            # Plot path
            plot_path = path + "/" + checkpoint + "/plot"
            if not os.path.exists(plot_path):
                os.makedirs(plot_path)
            episodes = 200
            mean_std = []
            print("----Max Policy----")
            mean_std.append(policy_max(env, actor, variant, alpha, gamma, plot_path, ep=episodes))
            print("----Sampling Policy----")
            mean_std.append(policy_sampling(env, actor, variant, alpha, gamma, plot_path, ep=episodes))
            print(mean_std)
            np.save(meanStd_path, mean_std)