
Commit

Update ppo_train.py
navuboy committed Sep 21, 2018
1 parent ae096bf commit e28e2c3
Showing 1 changed file with 0 additions and 114 deletions.
114 changes: 0 additions & 114 deletions cartpole-gazebo-ppo/cartpole_controller/src/ppo_train.py
@@ -132,19 +132,6 @@ def take_action(action):
robot_state.data = rospy.wait_for_message('/joint_states', JointState, timeout=5)
except:
print ('Error getting /joint_states data.')
# print "DATA : ",robot_state.data
# print "latest_reward: ", robot_state.latest_reward

# if len(robot_state.data.velocity) > 0:
# robot_state.cart_x_dot = robot_state.data.velocity[1]
# robot_state.pole_theta_dot = robot_state.data.velocity[0]
# else:
# robot_state.cart_x_dot = 0.0
# robot_state.pole_theta_dot = 0.0

# robot_state.cart_x = robot_state.data.position[1]
# robot_state.pole_theta = robot_state.data.position[0]


set_robot_state()

@@ -183,107 +170,6 @@ def listener():



# def softmax(x):
# e_x = np.exp(x - np.max(x))
# out = e_x / e_x.sum()
# return out


# def policy_gradient():
# with tf.variable_scope("policy"):
# params = tf.get_variable("policy_parameters",[4,2])
# state = tf.placeholder("float",[None,4])
# actions = tf.placeholder("float",[None,2])
# advantages = tf.placeholder("float",[None,1])
# linear = tf.matmul(state,params)
# probabilities = tf.nn.softmax(linear)
# good_probabilities = tf.reduce_sum(tf.multiply(probabilities, actions),reduction_indices=[1])
# eligibility = tf.log(good_probabilities) * advantages
# loss = -tf.reduce_sum(eligibility)
# optimizer = tf.train.AdamOptimizer(0.01).minimize(loss)
# return probabilities, state, actions, advantages, optimizer

# def value_gradient():
# with tf.variable_scope("value"):
# state = tf.placeholder("float",[None,4])
# newvals = tf.placeholder("float",[None,1])
# w1 = tf.get_variable("w1",[4,10])
# b1 = tf.get_variable("b1",[10])
# h1 = tf.nn.relu(tf.matmul(state,w1) + b1)
# w2 = tf.get_variable("w2",[10,1])
# b2 = tf.get_variable("b2",[1])
# calculated = tf.matmul(h1,w2) + b2
# diffs = calculated - newvals
# loss = tf.nn.l2_loss(diffs)
# optimizer = tf.train.AdamOptimizer(0.1).minimize(loss)
# return calculated, state, newvals, optimizer, loss


# def run_episode(policy_grad, value_grad, sess):
# pl_calculated, pl_state, pl_actions, pl_advantages, pl_optimizer = policy_grad
# vl_calculated, vl_state, vl_newvals, vl_optimizer, vl_loss = value_grad
# reset()
# observation = robot_state.robot_state
# totalreward = 0
# states = []
# actions = []
# advantages = []
# transitions = []
# update_vals = []


# for _ in range(20000):

# # calculate policy
# obs_vector = np.expand_dims(observation, axis=0)
# probs = sess.run(pl_calculated,feed_dict={pl_state: obs_vector})
# action = 0 if random.uniform(0,1) < probs[0][0] else 1
# # record the transition
# states.append(observation)
# # print("angle: ", observation[2]*180/3.14)
# actionblank = np.zeros(2)
# actionblank[action] = 1
# actions.append(actionblank)
# # take the action in the environment
# old_observation = observation
# reward, done = take_action(action)
# observation = robot_state.robot_state
# transitions.append((old_observation, action, reward))
# totalreward += reward

# if done:
# robot_state.done = False
# break
# for index, trans in enumerate(transitions):
# obs, action, reward = trans

# # calculate discounted monte-carlo return
# future_reward = 0
# future_transitions = len(transitions) - index
# decrease = 1
# for index2 in range(future_transitions):
# future_reward += transitions[(index2) + index][2] * decrease
# decrease = decrease * 0.97
# obs_vector = np.expand_dims(obs, axis=0)
# currentval = sess.run(vl_calculated,feed_dict={vl_state: obs_vector})[0][0]

# # advantage: how much better was this action than normal
# advantages.append(future_reward - currentval)

# # update the value function towards new return
# update_vals.append(future_reward)

# # update value function
# update_vals_vector = np.expand_dims(update_vals, axis=1)
# sess.run(vl_optimizer, feed_dict={vl_state: states, vl_newvals: update_vals_vector})
# # real_vl_loss = sess.run(vl_loss, feed_dict={vl_state: states, vl_newvals: update_vals_vector})

# advantages_vector = np.expand_dims(advantages, axis=1)
# sess.run(pl_optimizer, feed_dict={pl_state: states, pl_advantages: advantages_vector, pl_actions: actions})

# return totalreward


def main():
listener()
# env = gym.make('CartPole-v0')
