Simple Reinforcement Learning with Tensorflow: Part 2 - Policy-based Agents

Arthur Juliani

1.3K47

Hi Arthur,

I’m trying to implement your approach with 2 neurons as output, one neuron per action. So now I directly use the probability of the action we take (0 or 1) from the network to compute the loss function.

But it doesn’t really work. The problem could be here:

loglik = tf.log(probability[0,input_y[0,0]])

Would you please check it? I use Python 3.5.

Thank you.

# --- Baseline: one episode with a purely random policy -----------------------
# Establishes a reference reward level before any learning happens.
import numpy as np
import pickle as pickle
import tensorflow as tf
# %matplotlib inline
import matplotlib.pyplot as plt
import math
import gym

env = gym.make('CartPole-v0')
env.reset()

random_episodes = 0
reward_sum = 0
while random_episodes < 1:
    env.render()
    # Sample an action uniformly from {0, 1} (push cart left / right).
    observation, reward, done, _ = env.step(np.random.randint(0, 2))
    reward_sum += reward
    if done:
        random_episodes += 1
        print("Reward for this episode was:", reward_sum)
        reward_sum = 0
        env.reset()

# --- Hyperparameters ---------------------------------------------------------
H = 10                # number of hidden layer neurons
batch_size = 50       # every how many episodes to do a param update?
learning_rate = 1e-2  # feel free to play with this to train faster or more stably
gamma = 0.99          # discount factor for reward
D = 4                 # input dimensionality (CartPole observation size)

tf.reset_default_graph()

# --- Policy network: observation -> softmax probability over the two actions -
observations = tf.placeholder(tf.float32, [None, D], name="input_x")
W1 = tf.get_variable("W1", shape=[D, H],
                     initializer=tf.contrib.layers.xavier_initializer())
layer1 = tf.nn.relu(tf.matmul(observations, W1))
W2 = tf.get_variable("W2", shape=[H, 2],
                     initializer=tf.contrib.layers.xavier_initializer())
score = tf.matmul(layer1, W2)
probability = tf.nn.softmax(score)  # shape [timesteps, 2]

# --- Loss: REINFORCE log-likelihood weighted by the advantage ----------------
tvars = tf.trainable_variables()
input_y = tf.placeholder(tf.int32, [None, 1], name="input_y")  # actions taken
advantages = tf.placeholder(tf.float32, name="reward_signal")  # discounted rewards

# BUG FIX: the original line
#     loglik = tf.log(probability[0, input_y[0, 0]])
# only looked at the FIRST timestep of the episode, so the gradient ignored
# every other step -- that is why training stalled.  Instead, gather the
# probability of the action actually taken at EVERY timestep.
flat_index = tf.range(0, tf.shape(probability)[0]) * 2 + tf.reshape(input_y, [-1])
picked_prob = tf.gather(tf.reshape(probability, [-1]), flat_index)
loglik = tf.reshape(tf.log(picked_prob), [-1, 1])  # shape [timesteps, 1]

# Make actions that led to a good advantage more likely, bad ones less likely.
loss = -tf.reduce_sum(loglik * advantages)
newGrads = tf.gradients(loss, tvars)

# --- Optimizer ---------------------------------------------------------------
# Gradients are accumulated over a batch of episodes and applied in one go,
# to average out noise in the reward signal.
adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
W1Grad = tf.placeholder(tf.float32, name="batch_grad1")  # final grads fed back in
W2Grad = tf.placeholder(tf.float32, name="batch_grad2")
batchGrad = [W1Grad, W2Grad]
updateGrads = adam.apply_gradients(zip(batchGrad, tvars))

def discount_rewards(r, gamma=0.99):
    """Take a 1D float array of rewards and compute the discounted reward.

    Entry t becomes sum_{k >= t} gamma**(k - t) * r[k], i.e. rewards are
    accumulated backwards through time with exponential decay.

    Args:
        r: numpy array of per-timestep rewards.
        gamma: discount factor (defaults to the script's global value 0.99;
            parameterized so the function is self-contained and testable).

    Returns:
        numpy array of the same shape/dtype as ``r`` with discounted returns.
    """
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(range(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r

# --- Training loop -----------------------------------------------------------
xs, hs, dlogps, drs, ys, tfps = [], [], [], [], [], []
running_reward = None
reward_sum = 0
episode_number = 1
total_episodes = 10000
init = tf.initialize_all_variables()

# Launch the graph.
with tf.Session() as sess:
    rendering = False
    sess.run(init)
    observation = env.reset()  # obtain an initial observation of the environment

    # gradBuffer accumulates gradients over a batch of episodes; zero it out
    # until we are ready to update the policy network.
    gradBuffer = sess.run(tvars)
    for ix, grad in enumerate(gradBuffer):
        gradBuffer[ix] = grad * 0

    while episode_number <= total_episodes:

        # Rendering the environment slows things down, so let's only look at
        # it once our agent is doing a good job.
        if reward_sum / batch_size > 100 or rendering == True:
            env.render()
            rendering = True

        # Make sure the observation is in a shape the network can handle.
        x = np.reshape(observation, [1, D])

        # Run the policy network and sample an action from its softmax output.
        tfprob = sess.run(probability, feed_dict={observations: x})
        prob = np.reshape(tfprob, 2)
        action = np.random.choice(a=[0, 1], p=prob)

        xs.append(x)       # record observation
        ys.append(action)  # record the action taken (label for the loss)

        # Step the environment and get new measurements.
        observation, reward, done, info = env.step(action)
        reward_sum += reward
        # Record reward AFTER step() so it pairs with the previous action.
        drs.append(reward)

        if done:
            episode_number += 1

            # Stack together all inputs, actions, and rewards for this episode.
            epx = np.vstack(xs)
            epy = np.vstack(ys)
            epr = np.vstack(drs)
            tfp = tfps
            xs, hs, dlogps, drs, ys, tfps = [], [], [], [], [], []  # reset array memory

            # Compute the discounted reward backwards through time, then size
            # the rewards to be unit normal (helps control the variance of the
            # gradient estimator).
            discounted_epr = discount_rewards(epr)
            discounted_epr -= np.mean(discounted_epr)
            discounted_epr /= np.std(discounted_epr)

            # Get the gradient for this episode, and save it in gradBuffer.
            tGrad = sess.run(newGrads, feed_dict={observations: epx,
                                                  input_y: epy,
                                                  advantages: discounted_epr})
            for ix, grad in enumerate(tGrad):
                gradBuffer[ix] += grad

            # If we have completed enough episodes, update the policy network
            # with our accumulated gradients, then reset the buffer.
            if episode_number % batch_size == 0:
                sess.run(updateGrads, feed_dict={W1Grad: gradBuffer[0],
                                                 W2Grad: gradBuffer[1]})
                for ix, grad in enumerate(gradBuffer):
                    gradBuffer[ix] = grad * 0

                # Give a summary of how well the network is doing per batch.
                # BUG FIX: this conditional expression was mangled in the
                # pasted code ("running_reward = reward_sum ifrunning_rewardis
                # None else..."); restored to a running exponential average.
                running_reward = (reward_sum if running_reward is None
                                  else running_reward * 0.99 + reward_sum * 0.01)
                print('Average reward for episode %f. Total average reward %f.' % (reward_sum / batch_size, running_reward / batch_size))

                if reward_sum / batch_size > 200:
                    print("Task solved in", episode_number, 'episodes!')
                    break
                reward_sum = 0

            observation = env.reset()

print(episode_number, 'Episodes completed.')