4646
4747"""
4848import argparse
49- import os
5049import time
50+ import matplotlib .pyplot as plt
51+ import os
5152
5253import gym
53- import matplotlib .pyplot as plt
5454import numpy as np
5555import tensorflow as tf
5656
7878LR_A = 0.001 # learning rate for actor
7979LR_C = 0.01 # learning rate for critic
8080
81+
82+
8183############################### Actor-Critic ####################################
8284
8385
@@ -137,12 +139,13 @@ def __init__(self, state_dim, lr=0.01):
137139
138140 self .optimizer = tf .optimizers .Adam (lr )
139141
140- def learn (self , state , reward , state_ ):
142+ def learn (self , state , reward , state_ , done ):
143+ d = 0 if done else 1
141144 v_ = self .model (np .array ([state_ ]))
142145 with tf .GradientTape () as tape :
143146 v = self .model (np .array ([state ]))
144- ## TD_error = r + lambda * V(newS) - V(S)
145- td_error = reward + LAM * v_ - v
147+ ## TD_error = r + d * lambda * V(newS) - V(S)
148+ td_error = reward + d * LAM * v_ - v
146149 loss = tf .square (td_error )
147150 grad = tape .gradient (loss , self .model .trainable_weights )
148151 self .optimizer .apply_gradients (zip (grad , self .model .trainable_weights ))
@@ -203,7 +206,7 @@ def load(self): # load trained weights
203206 state_new , reward , done , info = env .step (action )
204207 state_new = state_new .astype (np .float32 )
205208
206- if done : reward = - 20 # reward shaping trick
209+ if done : reward = - 20 # reward shaping trick
207210 # these may be helpful in some tasks
208211 # if abs(s_new[0]) >= env.observation_space.high[0]:
209212 # # cart moves more than 2.4 units from the center
@@ -215,7 +218,7 @@ def load(self): # load trained weights
215218
216219 try :
217220 td_error = critic .learn (
218- state , reward , state_new
221+ state , reward , state_new , done
219222 ) # learn Value-function : gradient = grad[r + lambda * V(s_new) - V(s)]
220223 actor .learn (state , action , td_error ) # learn Policy : true_gradient = grad[logPi(s, a) * td_error]
221224 except KeyboardInterrupt : # if Ctrl+C at running actor.learn(), then save model, or exit if not at actor.learn()
@@ -238,7 +241,7 @@ def load(self): # load trained weights
238241
239242 # Early Stopping for quick check
240243 if step >= MAX_STEPS :
241- print ("Early Stopping" ) # Hao Dong: it is important for this task
244+ print ("Early Stopping" ) # Hao Dong: it is important for this task
242245 break
243246 actor .save ()
244247 critic .save ()
0 commit comments