Commit a626029
Author: shixiaowen03
Commit message: AC, A2C, A3C
1 parent 68b5f63 · commit a626029

File tree

11 files changed: +693 additions, −272 deletions


.idea/workspace.xml

Lines changed: 191 additions & 91 deletions
Some generated files are not rendered by default.

RL/Basic-A2C-Demo/A2C.py

Lines changed: 152 additions & 0 deletions
@@ -0,0 +1,152 @@
import numpy as np
import tensorflow as tf
import gym

OUTPUT_GRAPH = False
MAX_EPISODE = 3000
DISPLAY_REWARD_THRESHOLD = 200  # renders environment if total episode reward is greater than this threshold
MAX_EP_STEPS = 1000   # maximum time steps in one episode
RENDER = False        # rendering wastes time
GAMMA = 0.9           # reward discount in TD error
LR_A = 0.001          # learning rate for actor
LR_C = 0.01           # learning rate for critic


class Actor(object):
    def __init__(self, sess, n_features, n_actions, lr=0.001):
        self.sess = sess

        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
        self.a = tf.placeholder(tf.int32, None, "action")
        self.td_error = tf.placeholder(tf.float32, None, "td_error")  # TD error

        with tf.variable_scope('Actor'):
            l1 = tf.layers.dense(
                inputs=self.s,
                units=20,  # number of hidden units
                activation=tf.nn.relu,
                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),  # biases
                name='l1'
            )

            self.acts_prob = tf.layers.dense(
                inputs=l1,
                units=n_actions,  # output units
                activation=tf.nn.softmax,  # get action probabilities
                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),  # biases
                name='acts_prob'
            )

        with tf.variable_scope('exp_v'):
            log_prob = tf.log(self.acts_prob[0, self.a])
            self.exp_v = tf.reduce_mean(log_prob * self.td_error)  # advantage (TD error) guided loss

        with tf.variable_scope('train'):
            self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v)  # minimize(-exp_v) = maximize(exp_v)

    def learn(self, s, a, td):
        s = s[np.newaxis, :]
        feed_dict = {self.s: s, self.a: a, self.td_error: td}
        _, exp_v = self.sess.run([self.train_op, self.exp_v], feed_dict)
        return exp_v

    def choose_action(self, s):
        s = s[np.newaxis, :]
        probs = self.sess.run(self.acts_prob, {self.s: s})  # get probabilities for all actions
        return np.random.choice(np.arange(probs.shape[1]), p=probs.ravel())  # return an int


class Critic(object):
    def __init__(self, sess, n_features, lr=0.01):
        self.sess = sess

        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
        self.v_ = tf.placeholder(tf.float32, [1, 1], "v_next")
        self.r = tf.placeholder(tf.float32, None, 'r')

        with tf.variable_scope('Critic'):
            l1 = tf.layers.dense(
                inputs=self.s,
                units=20,  # number of hidden units
                activation=tf.nn.relu,
                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),  # biases
                name='l1'
            )

            self.v = tf.layers.dense(
                inputs=l1,
                units=1,  # output units
                activation=None,
                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),  # biases
                name='V'
            )

        with tf.variable_scope('squared_TD_error'):
            self.td_error = self.r + GAMMA * self.v_ - self.v
            self.loss = tf.square(self.td_error)  # TD_error = (r + gamma * V_next) - V_eval
        with tf.variable_scope('train'):
            self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)

    def learn(self, s, r, s_):
        s, s_ = s[np.newaxis, :], s_[np.newaxis, :]

        v_ = self.sess.run(self.v, {self.s: s_})
        td_error, _ = self.sess.run([self.td_error, self.train_op],
                                    {self.s: s, self.v_: v_, self.r: r})
        return td_error


# There are two actions: push the cart left or right.
# The state is four-dimensional.

env = gym.make('CartPole-v0')
env.seed(1)  # reproducible
env = env.unwrapped

N_F = env.observation_space.shape[0]
N_A = env.action_space.n

sess = tf.Session()

actor = Actor(sess, n_features=N_F, n_actions=N_A, lr=LR_A)
critic = Critic(sess, n_features=N_F, lr=LR_C)

sess.run(tf.global_variables_initializer())

for i_episode in range(MAX_EPISODE):
    s = env.reset()
    t = 0
    track_r = []
    while True:
        if RENDER: env.render()

        a = actor.choose_action(s)

        s_, r, done, info = env.step(a)

        if done: r = -20

        track_r.append(r)

        td_error = critic.learn(s, r, s_)  # gradient = grad[r + gamma * V(s_) - V(s)]
        actor.learn(s, a, td_error)        # true_gradient = grad[logPi(s,a) * td_error]

        s = s_
        t += 1

        if done or t >= MAX_EP_STEPS:
            ep_rs_sum = sum(track_r)

            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
            if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
            print("episode:", i_episode, "  reward:", int(running_reward))
            break
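For reference, the single-step TD error that the critic computes and the actor consumes (see the comments in the training loop above) is just (r + GAMMA * V(s_)) - V(s). A minimal NumPy check with made-up value estimates (the numbers below are illustrative only, not from the repo):

    GAMMA = 0.9        # same discount as in A2C.py
    r = 1.0            # reward returned by env.step(a)
    v_s = 2.5          # critic's estimate V(s)   (made-up)
    v_s_next = 3.0     # critic's estimate V(s_)  (made-up)

    # A positive TD error means the action did better than the critic expected,
    # so Actor.learn increases log pi(a|s) in proportion to it.
    td_error = r + GAMMA * v_s_next - v_s
    print(td_error)    # ≈ 1.2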

RL/Basic-A3C-Demo/A3C.py

Lines changed: 190 additions & 0 deletions
@@ -0,0 +1,190 @@
import multiprocessing
import threading
import tensorflow as tf
import numpy as np
import gym
import os
import shutil
import matplotlib.pyplot as plt


GAME = 'CartPole-v0'
OUTPUT_GRAPH = True
LOG_DIR = './log'
N_WORKERS = multiprocessing.cpu_count()
MAX_GLOBAL_EP = 1000
GLOBAL_NET_SCOPE = 'Global_Net'
UPDATE_GLOBAL_ITER = 10
GAMMA = 0.9
ENTROPY_BETA = 0.001
LR_A = 0.001    # learning rate for actor
LR_C = 0.001    # learning rate for critic
GLOBAL_RUNNING_R = []
GLOBAL_EP = 0

env = gym.make(GAME)
N_S = env.observation_space.shape[0]
N_A = env.action_space.n


class ACNet(object):
    def __init__(self, scope, globalAC=None):

        if scope == GLOBAL_NET_SCOPE:   # get global network
            with tf.variable_scope(scope):
                self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
                self.a_params, self.c_params = self._build_net(scope)[-2:]
        else:   # local net, calculate losses
            with tf.variable_scope(scope):
                self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
                self.a_his = tf.placeholder(tf.int32, [None, ], 'A')
                self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget')

                self.a_prob, self.v, self.a_params, self.c_params = self._build_net(scope)

                td = tf.subtract(self.v_target, self.v, name='TD_error')
                with tf.name_scope('c_loss'):
                    self.c_loss = tf.reduce_mean(tf.square(td))  # the critic's loss is the squared TD error

                with tf.name_scope('a_loss'):
                    # policy gradient objective: log pi(a|s) weighted by the advantage (td)
                    log_prob = tf.reduce_sum(tf.log(self.a_prob + 1e-5) *
                                             tf.one_hot(self.a_his, N_A, dtype=tf.float32),
                                             axis=1, keep_dims=True)
                    exp_v = log_prob * tf.stop_gradient(td)  # td is treated as a constant here; no gradient flows through it
                    entropy = -tf.reduce_sum(self.a_prob * tf.log(self.a_prob + 1e-5),
                                             axis=1, keep_dims=True)  # encourage exploration
                    self.exp_v = ENTROPY_BETA * entropy + exp_v
                    self.a_loss = tf.reduce_mean(-self.exp_v)

                with tf.name_scope('local_grad'):
                    self.a_grads = tf.gradients(self.a_loss, self.a_params)
                    self.c_grads = tf.gradients(self.c_loss, self.c_params)

            with tf.name_scope('sync'):
                with tf.name_scope('pull'):  # copy the global network's parameters into this local network
                    self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, globalAC.a_params)]
                    self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params, globalAC.c_params)]
                with tf.name_scope('push'):  # apply the local gradients to the global network's parameters
                    self.update_a_op = OPT_A.apply_gradients(zip(self.a_grads, globalAC.a_params))
                    self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params))

    def _build_net(self, scope):
        w_init = tf.random_normal_initializer(0., .1)
        with tf.variable_scope('actor'):
            l_a = tf.layers.dense(self.s, 200, tf.nn.relu6, kernel_initializer=w_init, name='la')
            a_prob = tf.layers.dense(l_a, N_A, tf.nn.softmax, kernel_initializer=w_init, name='ap')  # probability of selecting each action
        with tf.variable_scope('critic'):
            l_c = tf.layers.dense(self.s, 100, tf.nn.relu6, kernel_initializer=w_init, name='lc')
            v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v')  # state-value estimate
        a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor')
        c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic')
        return a_prob, v, a_params, c_params

    def update_global(self, feed_dict):  # run by a local worker
        SESS.run([self.update_a_op, self.update_c_op], feed_dict)  # local grads applied to the global net

    def pull_global(self):  # run by a local worker
        SESS.run([self.pull_a_params_op, self.pull_c_params_op])

    def choose_action(self, s):  # run by a local worker
        prob_weights = SESS.run(self.a_prob, feed_dict={self.s: s[np.newaxis, :]})
        action = np.random.choice(range(prob_weights.shape[1]),
                                  p=prob_weights.ravel())  # select action w.r.t. the action probabilities
        return action


class Worker(object):
    def __init__(self, name, globalAC):
        self.env = gym.make(GAME).unwrapped
        self.name = name
        self.AC = ACNet(name, globalAC)

    def work(self):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s = self.env.reset()
            ep_r = 0
            while True:
                a = self.AC.choose_action(s)
                s_, r, done, info = self.env.step(a)
                if done: r = -5
                ep_r += r
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:   # update global and assign to local net
                    if done:
                        v_s_ = 0   # terminal
                    else:
                        v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0]
                    buffer_v_target = []
                    for r in buffer_r[::-1]:    # reverse buffer r
                        v_s_ = r + GAMMA * v_s_  # compute target_v via v(s) = r + GAMMA * v(s')
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.array(buffer_a), np.vstack(buffer_v_target)
                    feed_dict = {
                        self.AC.s: buffer_s,
                        self.AC.a_his: buffer_a,
                        self.AC.v_target: buffer_v_target,
                    }
                    self.AC.update_global(feed_dict)

                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()

                s = s_
                total_step += 1
                if done:
                    if len(GLOBAL_RUNNING_R) == 0:  # record running episode reward
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(0.99 * GLOBAL_RUNNING_R[-1] + 0.01 * ep_r)
                    print(
                        self.name,
                        "Ep:", GLOBAL_EP,
                        "| Ep_r: %i" % GLOBAL_RUNNING_R[-1],
                    )
                    GLOBAL_EP += 1
                    break


if __name__ == "__main__":
    SESS = tf.Session()

    with tf.device("/cpu:0"):
        OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA')
        OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC')
        GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE)  # we only need its params
        workers = []
        # Create workers
        for i in range(N_WORKERS):
            i_name = 'W_%i' % i   # worker name
            workers.append(Worker(i_name, GLOBAL_AC))

    # tf.train.Coordinator manages the threads running inside the Session;
    # Coordinator() creates the thread coordinator object.
    COORD = tf.train.Coordinator()
    SESS.run(tf.global_variables_initializer())

    if OUTPUT_GRAPH:
        if os.path.exists(LOG_DIR):
            shutil.rmtree(LOG_DIR)
        tf.summary.FileWriter(LOG_DIR, SESS.graph)

    worker_threads = []
    for worker in workers:
        job = lambda: worker.work()
        t = threading.Thread(target=job)  # create a thread and assign it the worker's job
        t.start()                         # start the thread
        worker_threads.append(t)
    COORD.join(worker_threads)  # join the started threads and wait for them to finish

    plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R)
    plt.xlabel('step')
    plt.ylabel('Total moving reward')
    plt.show()
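The v_target values that Worker.work feeds to the global net are discounted n-step returns, bootstrapped from the critic's value of the last state (0 at a terminal). A minimal standalone sketch of that backward loop; the helper name discounted_targets is ours, not part of the repo:

    def discounted_targets(rewards, v_last, gamma=0.9):
        # Same backward recursion as the `for r in buffer_r[::-1]` loop above.
        targets = []
        v = v_last
        for r in reversed(rewards):
            v = r + gamma * v        # v(s) = r + gamma * v(s')
            targets.append(v)
        targets.reverse()            # restore chronological order
        return targets

    # Example: three rewards of 1.0 ending at a terminal state (bootstrap value 0):
    print(discounted_targets([1.0, 1.0, 1.0], v_last=0.0))   # -> [2.71, 1.9, 1.0] (up to float rounding)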
