i am trying to train an A3C algorithm but I am getting same output in the multinomial function.
can I train the A3C with random actions as in below code.
can someone expert comment.
while count<max_timesteps-1:
value, action_values, (hx, cx) = model((Variable(state.unsqueeze(0)), (hx, cx)))
prob = F.softmax(action_values,dim = -1)
log_prob = F.log_softmax(action_values, dim=-1)
print(log_prob.shape)
print("log_prob: ",log_prob)
entropy = -(log_prob * prob).sum(1, keepdim=True)
entropies.append(entropy)
actn = np.random.randn(3)
action = actn.argmax()
log_prob = log_prob[0,action]
# print("log_prob ",log_prob)
# print("action ",action)
state, reward, done = env.step(action)
done = (done or count == max_timesteps-2)
reward = max(min(reward, 1), -1)