Commit e21e1c0

Author: shixiaowen03 (committed)
Commit message: gail
1 parent 9937c07 · commit e21e1c0

File tree

12 files changed (+8798 / -202 lines)


.idea/workspace.xml

Lines changed: 172 additions & 202 deletions

RL/Basic-GAIL-Demo/algo/ppo.py

Lines changed: 115 additions & 0 deletions
@@ -0,0 +1,115 @@
import tensorflow as tf
import copy


class PPOTrain:
    def __init__(self, Policy, Old_Policy, gamma=0.95, clip_value=0.2, c_1=1, c_2=0.01):
        """
        :param Policy:
        :param Old_Policy:
        :param gamma:
        :param clip_value:
        :param c_1: coefficient for the value-function loss
        :param c_2: coefficient for the entropy bonus
        """

        self.Policy = Policy
        self.Old_Policy = Old_Policy
        self.gamma = gamma

        pi_trainable = self.Policy.get_trainable_variables()
        old_pi_trainable = self.Old_Policy.get_trainable_variables()

        # assign operations that copy the current policy parameters to the old policy
        with tf.variable_scope('assign_op'):
            self.assign_ops = []
            for v_old, v in zip(old_pi_trainable, pi_trainable):
                self.assign_ops.append(tf.assign(v_old, v))

        # inputs for train_op
        with tf.variable_scope('train_inp'):
            self.actions = tf.placeholder(dtype=tf.int32, shape=[None], name='actions')
            self.rewards = tf.placeholder(dtype=tf.float32, shape=[None], name='rewards')
            self.v_preds_next = tf.placeholder(dtype=tf.float32, shape=[None], name='v_preds_next')
            self.gaes = tf.placeholder(dtype=tf.float32, shape=[None], name='gaes')

        act_probs = self.Policy.act_probs
        act_probs_old = self.Old_Policy.act_probs

        # probabilities of the actions the agent took under the current policy
        act_probs = act_probs * tf.one_hot(indices=self.actions, depth=act_probs.shape[1])
        act_probs = tf.reduce_sum(act_probs, axis=1)

        # probabilities of the actions the agent took under the old policy
        act_probs_old = act_probs_old * tf.one_hot(indices=self.actions, depth=act_probs_old.shape[1])
        act_probs_old = tf.reduce_sum(act_probs_old, axis=1)

        with tf.variable_scope('loss'):
            # construct computation graph for loss_clip
            # ratios = tf.divide(act_probs, act_probs_old)
            ratios = tf.exp(tf.log(tf.clip_by_value(act_probs, 1e-10, 1.0))
                            - tf.log(tf.clip_by_value(act_probs_old, 1e-10, 1.0)))
            clipped_ratios = tf.clip_by_value(ratios, clip_value_min=1 - clip_value, clip_value_max=1 + clip_value)
            loss_clip = tf.minimum(tf.multiply(self.gaes, ratios), tf.multiply(self.gaes, clipped_ratios))
            loss_clip = tf.reduce_mean(loss_clip)
            tf.summary.scalar('loss_clip', loss_clip)

            # construct computation graph for the entropy bonus
            entropy = -tf.reduce_sum(self.Policy.act_probs *
                                     tf.log(tf.clip_by_value(self.Policy.act_probs, 1e-10, 1.0)), axis=1)
            entropy = tf.reduce_mean(entropy, axis=0)  # mean entropy of pi(obs)
            tf.summary.scalar('entropy', entropy)

            # construct computation graph for the value-function loss
            v_preds = self.Policy.v_preds
            loss_vf = tf.squared_difference(self.rewards + self.gamma * self.v_preds_next, v_preds)
            loss_vf = tf.reduce_mean(loss_vf)
            tf.summary.scalar('value_difference', loss_vf)

            # construct computation graph for the total objective
            loss = loss_clip - c_1 * loss_vf + c_2 * entropy

            # minimizing -loss == maximizing loss
            loss = -loss
            tf.summary.scalar('total', loss)

        self.merged = tf.summary.merge_all()
        optimizer = tf.train.AdamOptimizer(learning_rate=5e-5, epsilon=1e-5)
        self.gradients = optimizer.compute_gradients(loss, var_list=pi_trainable)
        self.train_op = optimizer.minimize(loss, var_list=pi_trainable)

    def train(self, obs, actions, gaes, rewards, v_preds_next):
        tf.get_default_session().run(self.train_op, feed_dict={self.Policy.obs: obs,
                                                               self.Old_Policy.obs: obs,
                                                               self.actions: actions,
                                                               self.rewards: rewards,
                                                               self.v_preds_next: v_preds_next,
                                                               self.gaes: gaes})

    def get_summary(self, obs, actions, gaes, rewards, v_preds_next):
        return tf.get_default_session().run(self.merged, feed_dict={self.Policy.obs: obs,
                                                                    self.Old_Policy.obs: obs,
                                                                    self.actions: actions,
                                                                    self.rewards: rewards,
                                                                    self.v_preds_next: v_preds_next,
                                                                    self.gaes: gaes})

    def assign_policy_parameters(self):
        # assign current policy parameter values to the old policy parameters
        return tf.get_default_session().run(self.assign_ops)

    def get_gaes(self, rewards, v_preds, v_preds_next):
        deltas = [r_t + self.gamma * v_next - v for r_t, v_next, v in zip(rewards, v_preds_next, v_preds)]
        # calculate the generalized advantage estimator (lambda = 1), see PPO paper eq. (11)
        gaes = copy.deepcopy(deltas)
        for t in reversed(range(len(gaes) - 1)):  # t runs down from T-1, where T is the number of policy steps
            gaes[t] = gaes[t] + self.gamma * gaes[t + 1]
        return gaes

    def get_grad(self, obs, actions, gaes, rewards, v_preds_next):
        return tf.get_default_session().run(self.gradients, feed_dict={self.Policy.obs: obs,
                                                                       self.Old_Policy.obs: obs,
                                                                       self.actions: actions,
                                                                       self.rewards: rewards,
                                                                       self.v_preds_next: v_preds_next,
                                                                       self.gaes: gaes})
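
For reference (this note is not part of the committed file), the graph built in PPOTrain implements the clipped surrogate objective from the PPO paper together with a value loss and an entropy bonus, and get_gaes computes the lambda = 1 special case of generalized advantage estimation. In the paper's notation, with epsilon = clip_value and c_1, c_2 as in the constructor:

\rho_t(\theta) = \frac{\pi_\theta(a_t \mid s_t)}{\pi_{\theta_{\text{old}}}(a_t \mid s_t)}, \qquad
L^{\text{CLIP}} = \hat{\mathbb{E}}_t\!\left[\min\!\left(\rho_t \hat{A}_t,\; \operatorname{clip}(\rho_t,\, 1-\epsilon,\, 1+\epsilon)\, \hat{A}_t\right)\right]

L = L^{\text{CLIP}} - c_1 L^{\text{VF}} + c_2 S[\pi_\theta], \qquad
\delta_t = r_t + \gamma V(s_{t+1}) - V(s_t), \qquad
\hat{A}_t = \sum_{l \ge 0} \gamma^{l}\, \delta_{t+l}

The final negation in the loss scope turns this maximization into a minimization problem for AdamOptimizer.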
Binary file not shown.
Binary file not shown.
RL/Basic-GAIL-Demo/network_models/discriminator.py

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
import tensorflow as tf


class Discriminator:
    def __init__(self, env):
        """
        :param env:
        The output of this Discriminator is a reward for the learning agent, not a cost,
        because the discriminator predicts P(expert|s,a) = 1 - P(agent|s,a).
        """

        with tf.variable_scope('discriminator'):
            self.scope = tf.get_variable_scope().name
            self.expert_s = tf.placeholder(dtype=tf.float32, shape=[None] + list(env.observation_space.shape))
            self.expert_a = tf.placeholder(dtype=tf.int32, shape=[None])
            expert_a_one_hot = tf.one_hot(self.expert_a, depth=env.action_space.n)
            # add noise to stabilise training
            expert_a_one_hot += tf.random_normal(tf.shape(expert_a_one_hot), mean=0.2, stddev=0.1, dtype=tf.float32) / 1.2
            expert_s_a = tf.concat([self.expert_s, expert_a_one_hot], axis=1)  # concatenate the expert's state and action

            self.agent_s = tf.placeholder(dtype=tf.float32, shape=[None] + list(env.observation_space.shape))
            self.agent_a = tf.placeholder(dtype=tf.int32, shape=[None])
            agent_a_one_hot = tf.one_hot(self.agent_a, depth=env.action_space.n)
            # add noise to stabilise training
            agent_a_one_hot += tf.random_normal(tf.shape(agent_a_one_hot), mean=0.2, stddev=0.1, dtype=tf.float32) / 1.2
            agent_s_a = tf.concat([self.agent_s, agent_a_one_hot], axis=1)  # concatenate the agent's state and action

            with tf.variable_scope('network') as network_scope:
                prob_1 = self.construct_network(input=expert_s_a)
                network_scope.reuse_variables()  # share parameters
                prob_2 = self.construct_network(input=agent_s_a)

            with tf.variable_scope('loss'):  # essentially a log loss that separates expert behaviour from agent behaviour
                loss_expert = tf.reduce_mean(tf.log(tf.clip_by_value(prob_1, 0.01, 1)))
                loss_agent = tf.reduce_mean(tf.log(tf.clip_by_value(1 - prob_2, 0.01, 1)))
                loss = loss_expert + loss_agent
                loss = -loss
                tf.summary.scalar('discriminator', loss)

            optimizer = tf.train.AdamOptimizer()
            self.train_op = optimizer.minimize(loss)

            self.rewards = tf.log(tf.clip_by_value(prob_2, 1e-10, 1))  # log(P(expert|s,a)); larger is better for the agent

    def construct_network(self, input):
        """
        Returns the probability that the (state, action) input came from the expert.
        For expert actions D wants this probability to be large; for agent actions, small.
        :param input:
        :return:
        """
        layer_1 = tf.layers.dense(inputs=input, units=20, activation=tf.nn.leaky_relu, name='layer1')
        layer_2 = tf.layers.dense(inputs=layer_1, units=20, activation=tf.nn.leaky_relu, name='layer2')
        layer_3 = tf.layers.dense(inputs=layer_2, units=20, activation=tf.nn.leaky_relu, name='layer3')
        prob = tf.layers.dense(inputs=layer_3, units=1, activation=tf.sigmoid, name='prob')
        return prob

    def train(self, expert_s, expert_a, agent_s, agent_a):
        return tf.get_default_session().run(self.train_op, feed_dict={self.expert_s: expert_s,
                                                                      self.expert_a: expert_a,
                                                                      self.agent_s: agent_s,
                                                                      self.agent_a: agent_a})

    def get_rewards(self, agent_s, agent_a):
        """
        Returns the reward given to the agent; the agent wants the probability output by D to be as large as possible.
        :param agent_s:
        :param agent_a:
        :return:
        """
        return tf.get_default_session().run(self.rewards, feed_dict={self.agent_s: agent_s,
                                                                     self.agent_a: agent_a})

    def get_trainable_variables(self):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
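
Read as a GAIL update (a note, not part of the committed file): prob_1 is D evaluated on expert (state, action) pairs, prob_2 is D on agent pairs, the discriminator is fitted with the standard binary log loss, and the surrogate reward handed back to the policy is the log-probability that an agent pair is classified as expert:

\max_D\ \mathbb{E}_{\pi_E}\!\left[\log D(s,a)\right] + \mathbb{E}_{\pi_\theta}\!\left[\log\!\left(1 - D(s,a)\right)\right], \qquad
\tilde{r}(s,a) = \log D(s,a)

Clipping the probabilities before the log (to [0.01, 1] in the loss, 1e-10 in the reward) keeps both finite when the discriminator saturates.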
RL/Basic-GAIL-Demo/network_models/policy_net.py

Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
import tensorflow as tf


class Policy_net:
    def __init__(self, name: str, env):
        """
        :param name: string
        :param env: gym env
        """

        ob_space = env.observation_space
        act_space = env.action_space

        with tf.variable_scope(name):
            self.obs = tf.placeholder(dtype=tf.float32, shape=[None] + list(ob_space.shape), name='obs')

            with tf.variable_scope('policy_net'):
                layer_1 = tf.layers.dense(inputs=self.obs, units=20, activation=tf.tanh)
                layer_2 = tf.layers.dense(inputs=layer_1, units=20, activation=tf.tanh)
                layer_3 = tf.layers.dense(inputs=layer_2, units=act_space.n, activation=tf.tanh)
                self.act_probs = tf.layers.dense(inputs=layer_3, units=act_space.n, activation=tf.nn.softmax)

            with tf.variable_scope('value_net'):
                layer_1 = tf.layers.dense(inputs=self.obs, units=20, activation=tf.tanh)
                layer_2 = tf.layers.dense(inputs=layer_1, units=20, activation=tf.tanh)
                self.v_preds = tf.layers.dense(inputs=layer_2, units=1, activation=None)

            self.act_stochastic = tf.multinomial(tf.log(self.act_probs), num_samples=1)
            self.act_stochastic = tf.reshape(self.act_stochastic, shape=[-1])

            self.act_deterministic = tf.argmax(self.act_probs, axis=1)

            self.scope = tf.get_variable_scope().name

    def act(self, obs, stochastic=True):
        if stochastic:
            return tf.get_default_session().run([self.act_stochastic, self.v_preds], feed_dict={self.obs: obs})
        else:
            return tf.get_default_session().run([self.act_deterministic, self.v_preds], feed_dict={self.obs: obs})

    def get_action_prob(self, obs):
        return tf.get_default_session().run(self.act_probs, feed_dict={self.obs: obs})

    def get_variables(self):
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

    def get_trainable_variables(self):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
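
The stochastic action op above is plain categorical sampling from the softmax output. As a quick sanity check (a minimal sketch, not part of the commit; sample_action is an illustrative helper name), the same sampling in NumPy:

import numpy as np

def sample_action(act_probs, rng=None):
    """Sample one action per row of act_probs, shape [batch, n_actions]."""
    rng = np.random.default_rng() if rng is None else rng
    # equivalent to tf.multinomial(tf.log(act_probs), num_samples=1) with one sample per row
    return np.array([rng.choice(len(p), p=p) for p in act_probs])

probs = np.array([[0.7, 0.3]])  # CartPole-v0 has two discrete actions
print(sample_action(probs))     # usually prints [0]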

RL/Basic-GAIL-Demo/run_gail.py

Lines changed: 128 additions & 0 deletions
@@ -0,0 +1,128 @@
import argparse
import gym
import numpy as np
import tensorflow as tf
from network_models.policy_net import Policy_net
from network_models.discriminator import Discriminator
from algo.ppo import PPOTrain


def argparser():
    parser = argparse.ArgumentParser()
    parser.add_argument('--logdir', help='log directory', default='log/train/gail')
    parser.add_argument('--savedir', help='save directory', default='trained_models/gail')
    parser.add_argument('--gamma', default=0.95)
    parser.add_argument('--iteration', default=int(1e4))
    return parser.parse_args()


def main(args):
    env = gym.make('CartPole-v0')
    env.seed(0)
    ob_space = env.observation_space
    Policy = Policy_net('policy', env)
    Old_Policy = Policy_net('old_policy', env)
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma)
    D = Discriminator(env)

    # load the expert's observations and actions
    expert_observations = np.genfromtxt('trajectory/observations.csv')
    expert_actions = np.genfromtxt('trajectory/actions.csv', dtype=np.int32)

    saver = tf.train.Saver()

    with tf.Session() as sess:
        writer = tf.summary.FileWriter(args.logdir, sess.graph)
        sess.run(tf.global_variables_initializer())

        obs = env.reset()
        success_num = 0

        for iteration in range(args.iteration):
            observations = []
            actions = []
            rewards = []
            v_preds = []
            run_policy_steps = 0

            while True:
                run_policy_steps += 1
                obs = np.stack([obs]).astype(dtype=np.float32)
                act, v_pred = Policy.act(obs=obs, stochastic=True)

                act = np.asscalar(act)
                v_pred = np.asscalar(v_pred)

                next_obs, reward, done, info = env.step(act)

                observations.append(obs)
                actions.append(act)
                rewards.append(reward)
                v_preds.append(v_pred)

                if done:
                    next_obs = np.stack([next_obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs
                    _, v_pred = Policy.act(obs=next_obs, stochastic=True)
                    v_preds_next = v_preds[1:] + [np.asscalar(v_pred)]
                    obs = env.reset()
                    break
                else:
                    obs = next_obs

            writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_length', simple_value=run_policy_steps)])
                               , iteration)
            writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards))])
                               , iteration)

            if sum(rewards) >= 195:
                success_num += 1
                if success_num >= 100:
                    saver.save(sess, args.savedir + '/model.ckpt')
                    print('Clear!! Model saved.')
                    break
            else:
                success_num = 0

            observations = np.reshape(observations, newshape=[-1] + list(ob_space.shape))
            actions = np.array(actions).astype(dtype=np.int32)

            # train the discriminator to separate expert (state, action) pairs from agent ones
            for i in range(2):
                D.train(expert_s=expert_observations,
                        expert_a=expert_actions,
                        agent_s=observations,
                        agent_a=actions)

            # replace the environment reward with the discriminator's log-probability reward
            d_rewards = D.get_rewards(agent_s=observations, agent_a=actions)
            d_rewards = np.reshape(d_rewards, newshape=[-1]).astype(dtype=np.float32)

            gaes = PPO.get_gaes(rewards=d_rewards, v_preds=v_preds, v_preds_next=v_preds_next)
            gaes = np.array(gaes).astype(dtype=np.float32)
            # gaes = (gaes - gaes.mean()) / gaes.std()
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

            # train policy
            inp = [observations, actions, gaes, d_rewards, v_preds_next]
            PPO.assign_policy_parameters()
            for epoch in range(6):
                sample_indices = np.random.randint(low=0, high=observations.shape[0],
                                                   size=32)  # indices are in [low, high)
                sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp]  # sample training data
                PPO.train(obs=sampled_inp[0],
                          actions=sampled_inp[1],
                          gaes=sampled_inp[2],
                          rewards=sampled_inp[3],
                          v_preds_next=sampled_inp[4])

            summary = PPO.get_summary(obs=inp[0],
                                      actions=inp[1],
                                      gaes=inp[2],
                                      rewards=inp[3],
                                      v_preds_next=inp[4])

            writer.add_summary(summary, iteration)
        writer.close()


if __name__ == '__main__':
    args = argparser()
    main(args)
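
run_gail.py reads the expert demonstrations from trajectory/observations.csv and trajectory/actions.csv via np.genfromtxt; those files are not part of this diff. A minimal sketch of how such CSVs could be produced, assuming the same old gym API used throughout the commit and using a hand-written pole-balancing heuristic as a stand-in for a trained expert (the heuristic and the 50-episode count are illustrative, not from the repository):

import os
import gym
import numpy as np

env = gym.make('CartPole-v0')
os.makedirs('trajectory', exist_ok=True)

observations, actions = [], []
for episode in range(50):
    obs = env.reset()
    done = False
    while not done:
        # stand-in "expert": push the cart toward the side the pole is leaning to
        act = 0 if obs[2] < 0 else 1
        observations.append(obs)
        actions.append(act)
        obs, reward, done, info = env.step(act)

# space-delimited output matches the np.genfromtxt calls in run_gail.py
np.savetxt('trajectory/observations.csv', np.array(observations))
np.savetxt('trajectory/actions.csv', np.array(actions, dtype=np.int32), fmt='%d')

With the files in place, training runs with the defaults shown in argparser, e.g. python run_gail.py --logdir log/train/gail --savedir trained_models/gail.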

0 commit comments