Commit a626029
Author: shixiaowen03
Commit message: AC, A2C, A3C
1 parent 68b5f63 · commit a626029

File tree

11 files changed: +693 additions, −272 deletions


.idea/workspace.xml

Lines changed: 191 additions & 91 deletions
Some generated files are not rendered by default.

RL/Basic-A2C-Demo/A2C.py

Lines changed: 152 additions & 0 deletions
@@ -0,0 +1,152 @@
import numpy as np
import tensorflow as tf
import gym

OUTPUT_GRAPH = False
MAX_EPISODE = 3000
DISPLAY_REWARD_THRESHOLD = 200  # renders environment if total episode reward is greater than this threshold
MAX_EP_STEPS = 1000   # maximum time steps in one episode
RENDER = False        # rendering wastes time
GAMMA = 0.9           # reward discount in TD error
LR_A = 0.001          # learning rate for actor
LR_C = 0.01           # learning rate for critic


class Actor(object):
    def __init__(self, sess, n_features, n_actions, lr=0.001):
        self.sess = sess

        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
        self.a = tf.placeholder(tf.int32, None, "action")
        self.td_error = tf.placeholder(tf.float32, None, "td_error")  # TD error

        with tf.variable_scope('Actor'):
            l1 = tf.layers.dense(
                inputs=self.s,
                units=20,  # number of hidden units
                activation=tf.nn.relu,
                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),  # biases
                name='l1'
            )

            self.acts_prob = tf.layers.dense(
                inputs=l1,
                units=n_actions,  # output units
                activation=tf.nn.softmax,  # get action probabilities
                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),  # biases
                name='acts_prob'
            )

        with tf.variable_scope('exp_v'):
            log_prob = tf.log(self.acts_prob[0, self.a])
            self.exp_v = tf.reduce_mean(log_prob * self.td_error)  # advantage (TD error) guided loss

        with tf.variable_scope('train'):
            self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v)  # minimize(-exp_v) = maximize(exp_v)

    def learn(self, s, a, td):
        s = s[np.newaxis, :]
        feed_dict = {self.s: s, self.a: a, self.td_error: td}
        _, exp_v = self.sess.run([self.train_op, self.exp_v], feed_dict)
        return exp_v

    def choose_action(self, s):
        s = s[np.newaxis, :]
        probs = self.sess.run(self.acts_prob, {self.s: s})  # get probabilities for all actions
        return np.random.choice(np.arange(probs.shape[1]), p=probs.ravel())  # return an int


class Critic(object):
    def __init__(self, sess, n_features, lr=0.01):
        self.sess = sess

        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
        self.v_ = tf.placeholder(tf.float32, [1, 1], "v_next")
        self.r = tf.placeholder(tf.float32, None, 'r')

        with tf.variable_scope('Critic'):
            l1 = tf.layers.dense(
                inputs=self.s,
                units=20,  # number of hidden units
                activation=tf.nn.relu,
                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),  # biases
                name='l1'
            )

            self.v = tf.layers.dense(
                inputs=l1,
                units=1,  # output units
                activation=None,
                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),  # biases
                name='V'
            )

        with tf.variable_scope('squared_TD_error'):
            self.td_error = self.r + GAMMA * self.v_ - self.v
            self.loss = tf.square(self.td_error)  # TD_error = (r + gamma * V_next) - V_eval
        with tf.variable_scope('train'):
            self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)

    def learn(self, s, r, s_):
        s, s_ = s[np.newaxis, :], s_[np.newaxis, :]

        v_ = self.sess.run(self.v, {self.s: s_})
        td_error, _ = self.sess.run([self.td_error, self.train_op],
                                    {self.s: s, self.v_: v_, self.r: r})
        return td_error


# There are two actions: push the cart left or right.
# The state is four-dimensional.

env = gym.make('CartPole-v0')
env.seed(1)  # reproducible
env = env.unwrapped

N_F = env.observation_space.shape[0]
N_A = env.action_space.n

sess = tf.Session()

actor = Actor(sess, n_features=N_F, n_actions=N_A, lr=LR_A)
critic = Critic(sess, n_features=N_F, lr=LR_C)

sess.run(tf.global_variables_initializer())

for i_episode in range(MAX_EPISODE):
    s = env.reset()
    t = 0
    track_r = []
    while True:
        if RENDER: env.render()

        a = actor.choose_action(s)

        s_, r, done, info = env.step(a)

        if done: r = -20

        track_r.append(r)

        td_error = critic.learn(s, r, s_)  # gradient = grad[r + gamma * V(s_) - V(s)]
        actor.learn(s, a, td_error)        # true_gradient = grad[logPi(s,a) * td_error]

        s = s_
        t += 1

        if done or t >= MAX_EP_STEPS:
            ep_rs_sum = sum(track_r)

            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
            if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
            print("episode:", i_episode, "  reward:", int(running_reward))
            break
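For reference, the single-step TD error that the critic computes and the actor consumes (see the comments in the training loop above) is just (r + GAMMA * V(s_)) - V(s). A minimal NumPy check with made-up value estimates (the numbers below are illustrative only, not from the repo):

    GAMMA = 0.9        # same discount as in A2C.py
    r = 1.0            # reward returned by env.step(a)
    v_s = 2.5          # critic's estimate V(s)   (made-up)
    v_s_next = 3.0     # critic's estimate V(s_)  (made-up)

    # A positive TD error means the action did better than the critic expected,
    # so Actor.learn increases log pi(a|s) in proportion to it.
    td_error = r + GAMMA * v_s_next - v_s
    print(td_error)    # ≈ 1.2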

RL/Basic-A3C-Demo/A3C.py

Lines changed: 190 additions & 0 deletions
@@ -0,0 +1,190 @@
import multiprocessing
import threading
import tensorflow as tf
import numpy as np
import gym
import os
import shutil
import matplotlib.pyplot as plt


GAME = 'CartPole-v0'
OUTPUT_GRAPH = True
LOG_DIR = './log'
N_WORKERS = multiprocessing.cpu_count()
MAX_GLOBAL_EP = 1000
GLOBAL_NET_SCOPE = 'Global_Net'
UPDATE_GLOBAL_ITER = 10
GAMMA = 0.9
ENTROPY_BETA = 0.001
LR_A = 0.001    # learning rate for actor
LR_C = 0.001    # learning rate for critic
GLOBAL_RUNNING_R = []
GLOBAL_EP = 0

env = gym.make(GAME)
N_S = env.observation_space.shape[0]
N_A = env.action_space.n


class ACNet(object):
    def __init__(self, scope, globalAC=None):

        if scope == GLOBAL_NET_SCOPE:   # get global network
            with tf.variable_scope(scope):
                self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
                self.a_params, self.c_params = self._build_net(scope)[-2:]
        else:   # local net, calculate losses
            with tf.variable_scope(scope):
                self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
                self.a_his = tf.placeholder(tf.int32, [None, ], 'A')
                self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget')

                self.a_prob, self.v, self.a_params, self.c_params = self._build_net(scope)

                td = tf.subtract(self.v_target, self.v, name='TD_error')
                with tf.name_scope('c_loss'):
                    self.c_loss = tf.reduce_mean(tf.square(td))  # the critic's loss is the squared TD error

                with tf.name_scope('a_loss'):
                    # policy gradient objective: log pi(a|s) weighted by the advantage (td)
                    log_prob = tf.reduce_sum(tf.log(self.a_prob + 1e-5) *
                                             tf.one_hot(self.a_his, N_A, dtype=tf.float32),
                                             axis=1, keep_dims=True)
                    exp_v = log_prob * tf.stop_gradient(td)  # td is treated as a constant here; no gradient flows through it
                    entropy = -tf.reduce_sum(self.a_prob * tf.log(self.a_prob + 1e-5),
                                             axis=1, keep_dims=True)  # encourage exploration
                    self.exp_v = ENTROPY_BETA * entropy + exp_v
                    self.a_loss = tf.reduce_mean(-self.exp_v)

                with tf.name_scope('local_grad'):
                    self.a_grads = tf.gradients(self.a_loss, self.a_params)
                    self.c_grads = tf.gradients(self.c_loss, self.c_params)

            with tf.name_scope('sync'):
                with tf.name_scope('pull'):  # copy the global network's parameters into this local network
                    self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, globalAC.a_params)]
                    self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params, globalAC.c_params)]
                with tf.name_scope('push'):  # apply the local gradients to the global network's parameters
                    self.update_a_op = OPT_A.apply_gradients(zip(self.a_grads, globalAC.a_params))
                    self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params))

    def _build_net(self, scope):
        w_init = tf.random_normal_initializer(0., .1)
        with tf.variable_scope('actor'):
            l_a = tf.layers.dense(self.s, 200, tf.nn.relu6, kernel_initializer=w_init, name='la')
            a_prob = tf.layers.dense(l_a, N_A, tf.nn.softmax, kernel_initializer=w_init, name='ap')  # probability of selecting each action
        with tf.variable_scope('critic'):
            l_c = tf.layers.dense(self.s, 100, tf.nn.relu6, kernel_initializer=w_init, name='lc')
            v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v')  # state-value estimate
        a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor')
        c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic')
        return a_prob, v, a_params, c_params

    def update_global(self, feed_dict):  # run by a local worker
        SESS.run([self.update_a_op, self.update_c_op], feed_dict)  # local grads applied to the global net

    def pull_global(self):  # run by a local worker
        SESS.run([self.pull_a_params_op, self.pull_c_params_op])

    def choose_action(self, s):  # run by a local worker
        prob_weights = SESS.run(self.a_prob, feed_dict={self.s: s[np.newaxis, :]})
        action = np.random.choice(range(prob_weights.shape[1]),
                                  p=prob_weights.ravel())  # select action w.r.t. the action probabilities
        return action


class Worker(object):
    def __init__(self, name, globalAC):
        self.env = gym.make(GAME).unwrapped
        self.name = name
        self.AC = ACNet(name, globalAC)

    def work(self):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s = self.env.reset()
            ep_r = 0
            while True:
                a = self.AC.choose_action(s)
                s_, r, done, info = self.env.step(a)
                if done: r = -5
                ep_r += r
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:   # update global and assign to local net
                    if done:
                        v_s_ = 0   # terminal
                    else:
                        v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0]
                    buffer_v_target = []
                    for r in buffer_r[::-1]:    # reverse buffer r
                        v_s_ = r + GAMMA * v_s_  # compute target_v via v(s) = r + GAMMA * v(s')
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.array(buffer_a), np.vstack(buffer_v_target)
                    feed_dict = {
                        self.AC.s: buffer_s,
                        self.AC.a_his: buffer_a,
                        self.AC.v_target: buffer_v_target,
                    }
                    self.AC.update_global(feed_dict)

                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()

                s = s_
                total_step += 1
                if done:
                    if len(GLOBAL_RUNNING_R) == 0:  # record running episode reward
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(0.99 * GLOBAL_RUNNING_R[-1] + 0.01 * ep_r)
                    print(
                        self.name,
                        "Ep:", GLOBAL_EP,
                        "| Ep_r: %i" % GLOBAL_RUNNING_R[-1],
                    )
                    GLOBAL_EP += 1
                    break


if __name__ == "__main__":
    SESS = tf.Session()

    with tf.device("/cpu:0"):
        OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA')
        OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC')
        GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE)  # we only need its params
        workers = []
        # Create workers
        for i in range(N_WORKERS):
            i_name = 'W_%i' % i   # worker name
            workers.append(Worker(i_name, GLOBAL_AC))

    # tf.train.Coordinator manages the threads running inside the Session;
    # Coordinator() creates the thread coordinator object.
    COORD = tf.train.Coordinator()
    SESS.run(tf.global_variables_initializer())

    if OUTPUT_GRAPH:
        if os.path.exists(LOG_DIR):
            shutil.rmtree(LOG_DIR)
        tf.summary.FileWriter(LOG_DIR, SESS.graph)

    worker_threads = []
    for worker in workers:
        job = lambda: worker.work()
        t = threading.Thread(target=job)  # create a thread and assign it the worker's job
        t.start()                         # start the thread
        worker_threads.append(t)
    COORD.join(worker_threads)  # join the started threads and wait for them to finish

    plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R)
    plt.xlabel('step')
    plt.ylabel('Total moving reward')
    plt.show()
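The v_target values that Worker.work feeds to the global net are discounted n-step returns, bootstrapped from the critic's value of the last state (0 at a terminal). A minimal standalone sketch of that backward loop; the helper name discounted_targets is ours, not part of the repo:

    def discounted_targets(rewards, v_last, gamma=0.9):
        # Same backward recursion as the `for r in buffer_r[::-1]` loop above.
        targets = []
        v = v_last
        for r in reversed(rewards):
            v = r + gamma * v        # v(s) = r + gamma * v(s')
            targets.append(v)
        targets.reverse()            # restore chronological order
        return targets

    # Example: three rewards of 1.0 ending at a terminal state (bootstrap value 0):
    print(discounted_targets([1.0, 1.0, 1.0], v_last=0.0))   # -> [2.71, 1.9, 1.0] (up to float rounding)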
