
Commit fcef0ce

Author: shixiaowen03 (committed)
Commit message: AC, A2C, A3C
1 parent a626029 · commit fcef0ce

File tree

7 files changed: +104 −68 lines changed


.idea/misc.xml

Lines changed: 1 addition & 1 deletion

.idea/tensorflow1.2.iml

Lines changed: 1 addition & 1 deletion

.idea/workspace.xml

Lines changed: 81 additions & 60 deletions

RL/Basic-A2C-Demo/A2C.py

Lines changed: 8 additions & 3 deletions
```diff
@@ -1,11 +1,12 @@
 import numpy as np
 import tensorflow as tf
 import gym
+import pandas as pd

 OUTPUT_GRAPH = False
-MAX_EPISODE = 3000
+MAX_EPISODE = 500
 DISPLAY_REWARD_THRESHOLD = 200  # renders environment if total episode reward is greater than this threshold
-MAX_EP_STEPS = 1000   # maximum time step in one episode
+MAX_EP_STEPS = 2000   # maximum time step in one episode
 RENDER = False  # rendering wastes time
 GAMMA = 0.9     # reward discount in TD error
 LR_A = 0.001    # learning rate for actor
@@ -100,7 +101,7 @@ def learn(self, s, r, s_):
         td_error, _ = self.sess.run([self.td_error, self.train_op],
                                     {self.s: s, self.v_: v_, self.r: r})
         return td_error
-
+
 # there are two actions: push the cart left or right
 # the state is four-dimensional

@@ -118,6 +119,7 @@ def learn(self, s, r, s_):

 sess.run(tf.global_variables_initializer())

+res = []
 for i_episode in range(MAX_EPISODE):
     s = env.reset()
     t = 0
@@ -148,5 +150,8 @@ def learn(self, s, r, s_):
             running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
             if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
             print("episode:", i_episode, "  reward:", int(running_reward))
+            res.append([i_episode, running_reward])
             break

+pd.DataFrame(res, columns=['episode', 'a2c_reward']).to_csv('../a2c_reward.csv')
+
```
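The change to A2C.py (and the matching one to AC.py further down) follows a simple pattern: collect (episode, running_reward) pairs while training, then dump them to a CSV once at the end. A minimal standalone sketch of that logging pattern, with a dummy loop standing in for the real training loop and the file written to the current directory rather than '../':

```python
import pandas as pd

res = []
for i_episode in range(3):              # stand-in for the real training loop
    running_reward = 10.0 * i_episode   # stand-in for the moving-average episode reward
    res.append([i_episode, running_reward])

# The commit writes to '../a2c_reward.csv'; this sketch stays in the working directory.
pd.DataFrame(res, columns=['episode', 'a2c_reward']).to_csv('a2c_reward.csv')
```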

RL/Basic-A3C-Demo/A3C.py

Lines changed: 5 additions & 1 deletion
```diff
@@ -6,13 +6,14 @@
 import os
 import shutil
 import matplotlib.pyplot as plt
+import pandas as pd


 GAME = 'CartPole-v0'
 OUTPUT_GRAPH = True
 LOG_DIR = './log'
 N_WORKERS = multiprocessing.cpu_count()
-MAX_GLOBAL_EP = 1000
+MAX_GLOBAL_EP = 500
 GLOBAL_NET_SCOPE = 'Global_Net'
 UPDATE_GLOBAL_ITER = 10
 GAMMA = 0.9
@@ -184,6 +185,9 @@ def work(self):
     worker_threads.append(t)
 COORD.join(worker_threads)  # add the started worker threads to the main thread and wait for them to finish

+res = np.concatenate([np.arange(len(GLOBAL_RUNNING_R)).reshape(-1, 1), np.array(GLOBAL_RUNNING_R).reshape(-1, 1)], axis=1)
+pd.DataFrame(res, columns=['episode', 'a3c_reward']).to_csv('../a3c_reward.csv')
+
 plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R)
 plt.xlabel('step')
 plt.ylabel('Total moving reward')
```
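In A3C.py the moving-average rewards are already accumulated by the workers in the global list GLOBAL_RUNNING_R, so the diff builds the (episode, reward) table in one step with np.concatenate instead of appending pairs inside the loop. A sketch of that construction with dummy values; the commented-out form is an equivalent, shorter alternative, not what the commit uses:

```python
import numpy as np
import pandas as pd

GLOBAL_RUNNING_R = [12.0, 15.5, 21.3]   # dummy stand-in for the shared reward list

# Pair each episode index with its moving-average reward, as in the diff.
res = np.concatenate([np.arange(len(GLOBAL_RUNNING_R)).reshape(-1, 1),
                      np.array(GLOBAL_RUNNING_R).reshape(-1, 1)], axis=1)
pd.DataFrame(res, columns=['episode', 'a3c_reward']).to_csv('a3c_reward.csv')

# Equivalent without the concatenate:
# pd.DataFrame({'episode': np.arange(len(GLOBAL_RUNNING_R)),
#               'a3c_reward': GLOBAL_RUNNING_R}).to_csv('a3c_reward.csv')
```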

RL/Basic-A3C-Demo/log/events.out.tfevents.1543473204.meituan-sxwdeMacBook-Pro-4.local renamed to RL/Basic-A3C-Demo/log/events.out.tfevents.1543484083.meituan-sxwdeMacBook-Pro-4.local

1.01 MB (binary file, not rendered)

RL/Basic-AC-Demo/AC.py

Lines changed: 8 additions & 2 deletions
```diff
@@ -1,11 +1,12 @@
 import numpy as np
 import tensorflow as tf
 import gym
+import pandas as pd

 OUTPUT_GRAPH = False
-MAX_EPISODE = 3000
+MAX_EPISODE = 500
 DISPLAY_REWARD_THRESHOLD = 200  # renders environment if total episode reward is greater than this threshold
-MAX_EP_STEPS = 1000   # maximum time step in one episode
+MAX_EP_STEPS = 2000   # maximum time step in one episode
 RENDER = False  # rendering wastes time
 GAMMA = 0.9     # reward discount in TD error
 LR_A = 0.001    # learning rate for actor
@@ -126,6 +127,7 @@ def learn(self, s, a, r, s_):

 sess.run(tf.global_variables_initializer())

+res = []
 for i_episode in range(MAX_EPISODE):
     s = env.reset()
     t = 0
@@ -156,5 +158,9 @@ def learn(self, s, a, r, s_):
             running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
             if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
             print("episode:", i_episode, "  reward:", int(running_reward))
+            res.append([i_episode, running_reward])
+
             break

+pd.DataFrame(res, columns=['episode', 'ac_reward']).to_csv('../ac_reward.csv')
+
```
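The three scripts now write ac_reward.csv, a2c_reward.csv and a3c_reward.csv into the same parent directory, presumably so the learning curves can be compared. A possible follow-up script for that comparison; it is not part of this commit and assumes all three CSVs have already been produced and sit next to it:

```python
import pandas as pd
import matplotlib.pyplot as plt

# Column names match the ones written by AC.py, A2C.py and A3C.py in this commit.
curves = {'AC':  ('ac_reward.csv',  'ac_reward'),
          'A2C': ('a2c_reward.csv', 'a2c_reward'),
          'A3C': ('a3c_reward.csv', 'a3c_reward')}

for label, (path, column) in curves.items():
    df = pd.read_csv(path)
    plt.plot(df['episode'], df[column], label=label)

plt.xlabel('episode')
plt.ylabel('running reward')
plt.legend()
plt.show()
```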
