Merge pull request #1085 from tensorlayer/reinforcement-learning

quantumiracle · web-flow · commit 8ee2bf8dc9db · 2020-05-30T16:48:36.000-04:00
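Fix reinforcement learning bugs: stop bootstrapping from terminal states (AC, PPO, DPPO) and sample SAC noise with the shape of the mean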
diff --git a/examples/reinforcement_learning/tutorial_AC.py b/examples/reinforcement_learning/tutorial_AC.py
@@ -46,11 +46,11 @@
 
 """
 import argparse
-import os
 import time
+import matplotlib.pyplot as plt
+import os
 
 import gym
-import matplotlib.pyplot as plt
 import numpy as np
 import tensorflow as tf
 
@@ -78,6 +78,8 @@
 LR_A = 0.001  # learning rate for actor
 LR_C = 0.01  # learning rate for critic
 
+
+
 ###############################  Actor-Critic  ####################################
 
 
@@ -137,12 +139,13 @@ def __init__(self, state_dim, lr=0.01):
 
         self.optimizer = tf.optimizers.Adam(lr)
 
-    def learn(self, state, reward, state_):
+    def learn(self, state, reward, state_, done):
+        d = 0 if done else 1
         v_ = self.model(np.array([state_]))
         with tf.GradientTape() as tape:
             v = self.model(np.array([state]))
-            ## TD_error = r + lambda * V(newS) - V(S)
-            td_error = reward + LAM * v_ - v
+            ## TD_error = r + d * lambda * V(newS) - V(S)
+            td_error = reward + d * LAM * v_ - v
             loss = tf.square(td_error)
         grad = tape.gradient(loss, self.model.trainable_weights)
         self.optimizer.apply_gradients(zip(grad, self.model.trainable_weights))
@@ -203,7 +206,7 @@ def load(self):  # load trained weights
                 state_new, reward, done, info = env.step(action)
                 state_new = state_new.astype(np.float32)
 
-                if done: reward = -20  # reward shaping trick
+                if done: reward = -20   # reward shaping trick
                 # these may helpful in some tasks
                 # if abs(s_new[0]) >= env.observation_space.high[0]:
                 # #  cart moves more than 2.4 units from the center
@@ -215,7 +218,7 @@ def load(self):  # load trained weights
 
                 try:
                     td_error = critic.learn(
-                        state, reward, state_new
+                        state, reward, state_new, done
                     )  # learn Value-function : gradient = grad[r + lambda * V(s_new) - V(s)]
                     actor.learn(state, action, td_error)  # learn Policy : true_gradient = grad[logPi(s, a) * td_error]
                 except KeyboardInterrupt:  # if Ctrl+C at running actor.learn(), then save model, or exit if not at actor.learn()
@@ -238,7 +241,7 @@ def load(self):  # load trained weights
 
             # Early Stopping for quick check
             if step >= MAX_STEPS:
-                print("Early Stopping")  # Hao Dong: it is important for this task
+                print("Early Stopping")     # Hao Dong: it is important for this task
                 break
         actor.save()
         critic.save()
diff --git a/examples/reinforcement_learning/tutorial_DPPO.py b/examples/reinforcement_learning/tutorial_DPPO.py
@@ -37,8 +37,8 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import tensorflow as tf
-
 import tensorflow_probability as tfp
+
 import tensorlayer as tl
 
 parser = argparse.ArgumentParser(description='Train or test neural net motor controller.')
@@ -73,6 +73,7 @@
 # ppo-clip parameters
 EPSILON = 0.2
 
+
 ###############################  DPPO  ####################################
 
 
@@ -282,7 +283,10 @@ def work(self):
                 GLOBAL_UPDATE_COUNTER += 1  # count to minimum batch size, no need to wait other workers
                 if t == MAX_STEPS - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                     # finish patyh
-                    v_s_ = self.ppo.critic(np.array([s_], np.float32))[0][0]
+                    if done:
+                        v_s_ = 0
+                    else:
+                        v_s_ = self.ppo.critic(np.array([s_], np.float32))[0][0]
                     discounted_r = []  # compute discounted reward
                     for r in buffer_r[::-1]:
                         v_s_ = r + GAMMA * v_s_
@@ -304,8 +308,7 @@ def work(self):
 
             print(
                 'Training  | Episode: {}/{}  | Worker: {} | Episode Reward: {:.4f}  | Running Time: {:.4f}'.format(
-                    GLOBAL_EP + 1, TRAIN_EPISODES, self.wid, ep_r,
-                    time.time() - T0
+                    GLOBAL_EP + 1, TRAIN_EPISODES, self.wid, ep_r, time.time() - T0
                 )
             )
             # record reward changes, plot later
@@ -372,6 +375,4 @@ def work(self):
             print(
                 'Testing  | Episode: {}/{}  | Episode Reward: {:.4f}  | Running Time: {:.4f}'.format(
                     episode + 1, TEST_EPISODES, episode_reward,
-                    time.time() - T0
-                )
-            )
+                    time.time() - T0))
diff --git a/examples/reinforcement_learning/tutorial_PPO.py b/examples/reinforcement_learning/tutorial_PPO.py
@@ -30,8 +30,8 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import tensorflow as tf
-
 import tensorflow_probability as tfp
+
 import tensorlayer as tl
 
 parser = argparse.ArgumentParser(description='Train or test neural net motor controller.')
@@ -63,14 +63,14 @@
 # ppo-clip parameters
 EPSILON = 0.2
 
+
 ###############################  PPO  ####################################
 
 
 class PPO(object):
     """
     PPO class
     """
-
     def __init__(self, state_dim, action_dim, action_bound, method='clip'):
         # critic
         with tf.name_scope('critic'):
@@ -233,13 +233,16 @@ def store_transition(self, state, action, reward):
         self.action_buffer.append(action)
         self.reward_buffer.append(reward)
 
-    def finish_path(self, next_state):
+    def finish_path(self, next_state, done):
         """
         Calculate cumulative reward
         :param next_state:
         :return: None
         """
-        v_s_ = self.critic(np.array([next_state], np.float32))[0, 0]
+        if done:
+            v_s_ = 0
+        else:
+            v_s_ = self.critic(np.array([next_state], np.float32))[0, 0]
         discounted_r = []
         for r in self.reward_buffer[::-1]:
             v_s_ = r + GAMMA * v_s_
@@ -280,17 +283,15 @@ def finish_path(self, next_state):
                 episode_reward += reward
 
                 # update ppo
-                if (step + 1) % BATCH_SIZE == 0:
-                    agent.finish_path(state_)
+                if len(agent.state_buffer) >= BATCH_SIZE:
+                    agent.finish_path(state_, done)
                     agent.update()
                 if done:
                     break
-            agent.finish_path(state_)
+            agent.finish_path(state_, done)
             print(
                 'Training  | Episode: {}/{}  | Episode Reward: {:.4f}  | Running Time: {:.4f}'.format(
-                    episode + 1, TRAIN_EPISODES, episode_reward,
-                    time.time() - t0
-                )
+                    episode + 1, TRAIN_EPISODES, episode_reward, time.time() - t0)
             )
             if episode == 0:
                 all_episode_reward.append(episode_reward)
@@ -318,6 +319,4 @@ def finish_path(self, next_state):
             print(
                 'Testing  | Episode: {}/{}  | Episode Reward: {:.4f}  | Running Time: {:.4f}'.format(
                     episode + 1, TEST_EPISODES, episode_reward,
-                    time.time() - t0
-                )
-            )
+                    time.time() - t0))
diff --git a/examples/reinforcement_learning/tutorial_SAC.py b/examples/reinforcement_learning/tutorial_SAC.py
@@ -185,7 +185,7 @@ def evaluate(self, state, epsilon=1e-6):
         std = tf.math.exp(log_std)  # no clip in evaluation, clip affects gradients flow
 
         normal = Normal(0, 1)
-        z = normal.sample()
+        z = normal.sample(mean.shape)
         action_0 = tf.math.tanh(mean + std * z)  # TanhNormal distribution as actions; reparameterization trick
         action = self.action_range * action_0
         # according to original paper, with an extra last term for normalizing different action range
@@ -204,7 +204,7 @@ def get_action(self, state, greedy=False):
         std = tf.math.exp(log_std)
 
         normal = Normal(0, 1)
-        z = normal.sample()
+        z = normal.sample(mean.shape)
         action = self.action_range * tf.math.tanh(
             mean + std * z
         )  # TanhNormal distribution as actions; reparameterization trick