
Commit

Update ppo_train.py
navuboy committed Sep 21, 2018
1 parent ae096bf commit e28e2c3
Showing 1 changed file with 0 additions and 114 deletions.
114 changes: 0 additions & 114 deletions cartpole-gazebo-ppo/cartpole_controller/src/ppo_train.py
@@ -132,19 +132,6 @@ def take_action(action):
robot_state.data = rospy.wait_for_message('/joint_states', JointState, timeout=5)
except:
print ('Error getting /joint_states data.')
# print "DATA : ",robot_state.data
# print "latest_reward: ", robot_state.latest_reward

# if len(robot_state.data.velocity) > 0:
# robot_state.cart_x_dot = robot_state.data.velocity[1]
# robot_state.pole_theta_dot = robot_state.data.velocity[0]
# else:
# robot_state.cart_x_dot = 0.0
# robot_state.pole_theta_dot = 0.0

# robot_state.cart_x = robot_state.data.position[1]
# robot_state.pole_theta = robot_state.data.position[0]


set_robot_state()

@@ -183,107 +170,6 @@ def listener():



# def softmax(x):
# e_x = np.exp(x - np.max(x))
# out = e_x / e_x.sum()
# return out


# def policy_gradient():
# with tf.variable_scope("policy"):
# params = tf.get_variable("policy_parameters",[4,2])
# state = tf.placeholder("float",[None,4])
# actions = tf.placeholder("float",[None,2])
# advantages = tf.placeholder("float",[None,1])
# linear = tf.matmul(state,params)
# probabilities = tf.nn.softmax(linear)
# good_probabilities = tf.reduce_sum(tf.multiply(probabilities, actions),reduction_indices=[1])
# eligibility = tf.log(good_probabilities) * advantages
# loss = -tf.reduce_sum(eligibility)
# optimizer = tf.train.AdamOptimizer(0.01).minimize(loss)
# return probabilities, state, actions, advantages, optimizer

# def value_gradient():
# with tf.variable_scope("value"):
# state = tf.placeholder("float",[None,4])
# newvals = tf.placeholder("float",[None,1])
# w1 = tf.get_variable("w1",[4,10])
# b1 = tf.get_variable("b1",[10])
# h1 = tf.nn.relu(tf.matmul(state,w1) + b1)
# w2 = tf.get_variable("w2",[10,1])
# b2 = tf.get_variable("b2",[1])
# calculated = tf.matmul(h1,w2) + b2
# diffs = calculated - newvals
# loss = tf.nn.l2_loss(diffs)
# optimizer = tf.train.AdamOptimizer(0.1).minimize(loss)
# return calculated, state, newvals, optimizer, loss


# def run_episode(policy_grad, value_grad, sess):
# pl_calculated, pl_state, pl_actions, pl_advantages, pl_optimizer = policy_grad
# vl_calculated, vl_state, vl_newvals, vl_optimizer, vl_loss = value_grad
# reset()
# observation = robot_state.robot_state
# totalreward = 0
# states = []
# actions = []
# advantages = []
# transitions = []
# update_vals = []


# for _ in range(20000):

# # calculate policy
# obs_vector = np.expand_dims(observation, axis=0)
# probs = sess.run(pl_calculated,feed_dict={pl_state: obs_vector})
# action = 0 if random.uniform(0,1) < probs[0][0] else 1
# # record the transition
# states.append(observation)
# # print("angle: ", observation[2]*180/3.14)
# actionblank = np.zeros(2)
# actionblank[action] = 1
# actions.append(actionblank)
# # take the action in the environment
# old_observation = observation
# reward, done = take_action(action)
# observation = robot_state.robot_state
# transitions.append((old_observation, action, reward))
# totalreward += reward

# if done:
# robot_state.done = False
# break
# for index, trans in enumerate(transitions):
# obs, action, reward = trans

# # calculate discounted monte-carlo return
# future_reward = 0
# future_transitions = len(transitions) - index
# decrease = 1
# for index2 in range(future_transitions):
# future_reward += transitions[(index2) + index][2] * decrease
# decrease = decrease * 0.97
# obs_vector = np.expand_dims(obs, axis=0)
# currentval = sess.run(vl_calculated,feed_dict={vl_state: obs_vector})[0][0]

# # advantage: how much better was this action than normal
# advantages.append(future_reward - currentval)

# # update the value function towards new return
# update_vals.append(future_reward)

# # update value function
# update_vals_vector = np.expand_dims(update_vals, axis=1)
# sess.run(vl_optimizer, feed_dict={vl_state: states, vl_newvals: update_vals_vector})
# # real_vl_loss = sess.run(vl_loss, feed_dict={vl_state: states, vl_newvals: update_vals_vector})

# advantages_vector = np.expand_dims(advantages, axis=1)
# sess.run(pl_optimizer, feed_dict={pl_state: states, pl_advantages: advantages_vector, pl_actions: actions})

# return totalreward


def main():
listener()
# env = gym.make('CartPole-v0')
