"""deep_monkey.py: utilities to train, demo, checkpoint, and plot the scores of
a DQN-style agent in a Unity ML-Agents environment."""

import os
import time
from collections import deque

import matplotlib.pyplot as plt
import numpy as np
import torch

def train_agent(agent, environment, n_episodes=800, max_t=1000,
                eps_start=1.0, eps_end=0.01, eps_decay=0.995,
                stop_avg_score=13, score_window_size=100):
    """
    Train the agent in the environment using an epsilon-greedy policy.

    :param agent: agent to be trained
    :param environment: Unity environment in which the agent should be trained
    :param n_episodes: maximum number of training episodes
    :param max_t: maximum number of timesteps per episode
    :param eps_start: starting value for epsilon
    :param eps_end: minimum value for epsilon
    :param eps_decay: multiplicative factor used for the exponential decay of epsilon
    :param stop_avg_score: moving-average score at which the task is considered solved
    :param score_window_size: size of the moving-average window for smoothing agent scores
    :return: tuple of (trained agent, episodes needed to solve or None if unsolved,
             final moving-average score, list of per-episode scores)
    """
    scores = []
    scores_window = deque(maxlen=score_window_size)
    brain_name = environment.brain_names[0]
    eps = eps_start
    solution_episodes_n = None  # stays None if the target score is never reached
    for i_episode in range(1, n_episodes + 1):
        env_info = environment.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            env_info = environment.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            score += reward
            state = next_state
            if done:
                break
        scores_window.append(score)
        scores.append(score)
        # exponential decay: with the defaults, eps reaches eps_end = 0.01 after
        # about ln(0.01) / ln(0.995), i.e. roughly 919 episodes
        eps = max(eps_end, eps_decay * eps)
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
        if np.mean(scores_window) >= stop_avg_score:
            solution_episodes_n = max(i_episode - score_window_size, 0)
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                  .format(solution_episodes_n, np.mean(scores_window)))
            break
    return agent, solution_episodes_n, np.mean(scores_window), scores
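
# Example usage (a minimal sketch, not part of the original module). It assumes
# the `unityagents` package used by Unity ML-Agents v0.4-era projects and a
# DQN-style `Agent` class exposing act() and step(); the `dqn_agent` import and
# the environment path below are hypothetical placeholders.
#
#   from unityagents import UnityEnvironment
#   from dqn_agent import Agent
#
#   env = UnityEnvironment(file_name="Banana.app")
#   brain = env.brains[env.brain_names[0]]
#   agent = Agent(state_size=brain.vector_observation_space_size,
#                 action_size=brain.vector_action_space_size, seed=0)
#   agent, n_solved, avg_score, scores = train_agent(agent, env)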

def demo_agent(agent, env):
    """
    Demonstrate the agent in the environment for one episode, acting greedily.

    :param agent: trained agent to demonstrate
    :param env: Unity environment in which the agent should act
    :return: total score collected during the episode
    """
    brain_name = env.brain_names[0]
    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    state = env_info.vector_observations[0]             # get the current state
    score = 0                                           # initialize the score
    while True:
        action = agent.act(state, eps=0)                # select a greedy action (epsilon = 0)
        env_info = env.step(action)[brain_name]         # send the action to the environment
        next_state = env_info.vector_observations[0]    # get the next state
        reward = env_info.rewards[0]                    # get the reward
        done = env_info.local_done[0]                   # see if the episode has finished
        score += reward                                 # update the score
        state = next_state                              # roll over the state to the next time step
        if done:                                        # exit the loop if the episode finished
            break
    print("Score: {}".format(score))
    return score
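
# Example (a sketch): reload previously saved weights before a demo, assuming
# the agent was built with the same network architecture that produced the
# checkpoint; the checkpoint path below is a hypothetical placeholder.
#
#   state_dict = torch.load('checkpoints/checkpoint_<tag>.pth')
#   agent.qnetwork_local.load_state_dict(state_dict)
#   demo_agent(agent, env)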

def save_checkpoint(agent, file_name_tag):
    """
    Save the agent's local Q-network weights to a checkpoint file whose name
    includes the given tag.

    :param agent: agent whose weights should be saved
    :param file_name_tag: unique tag for the checkpoint file name
    :return checkpoint_filename: filename of the checkpoint file just saved
    """
    os.makedirs('checkpoints', exist_ok=True)  # ensure the target directory exists
    checkpoint_filename = 'checkpoints/checkpoint_' + file_name_tag + '.pth'
    torch.save(agent.qnetwork_local.state_dict(), checkpoint_filename)
    return checkpoint_filename

def generate_filename_tag(agent, avg_score, episodes_n):
    """
    Generate a standard-format tag string encoding the agent type, its fully
    connected layer sizes, the final average score, the number of episodes
    needed to solve the task, and a timestamp.

    :param agent: agent to be encoded, used to extract its type and layer sizes
    :param avg_score: average score at the end of training
    :param episodes_n: number of episodes the agent took to solve the task
    :return: string encoding the information passed as parameters
    """
    file_name_tag = '{type}_[{fc1s},{fc2s}]_{score:.2f}_{episodes}_{tstamp}' \
        .format(type=agent.__class__.__name__,
                episodes=episodes_n,
                fc1s=agent.fc1_size,
                fc2s=agent.fc2_size,
                score=avg_score,
                tstamp=time.strftime("%Y%m%d-%H%M%S"))
    return file_name_tag
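
# Example (a sketch, assuming `agent`, `avg_score`, and `n_solved` come from a
# train_agent() run as above): tag and checkpoint in one go. The concrete
# filename shown is illustrative only.
#
#   tag = generate_filename_tag(agent, avg_score, n_solved)
#   path = save_checkpoint(agent, tag)
#   # e.g. 'checkpoints/checkpoint_Agent_[64,64]_13.02_373_20240101-120000.pth'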

def plot_score(scores):
    """
    Plot and show the per-episode scores against the episode number.

    :param scores: array of per-episode score values to be plotted
    :return: plot figure handle
    """
    fig = plt.figure()
    fig.add_subplot(111)
    plt.plot(np.arange(len(scores)), scores)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.show()
    return fig
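
# Possible extension (a sketch, not in the original module): overlay the same
# moving average used during training so the trend is easier to read; `window`
# here mirrors the default score_window_size in train_agent.
#
#   window = 100
#   moving_avg = np.convolve(scores, np.ones(window) / window, mode='valid')
#   plt.plot(np.arange(window - 1, len(scores)), moving_avg)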