@Workshop{Supelec611,
author = {Matthieu Geist and Olivier Pietquin},
title = {{Revisiting natural actor-critics with value function approximation}},
year = {2010},
booktitle = {{Journées Francophones de Planification, Décision et Apprentissage pour la conduite de systèmes (JFPDA 2010)}},
month = {June},
note = {6 pages},
address = {Besançon, France},
abstract = {Actor-critic architectures have become popular over the
last decade in the field of reinforcement learning thanks to the
policy gradient with function approximation theorem. This theorem
allows actor-critic architectures to be combined in a principled
way with value function approximation, and therefore allows
large-scale problems to be addressed. Recent research has led to
the replacement of the policy gradient by a natural policy
gradient, improving the efficiency of the corresponding
algorithms. However, a common drawback of these approaches is
that they require manipulating the so-called advantage function,
which does not satisfy any Bellman equation. Consequently, the
derivation of actor-critic algorithms is not straightforward. In
this paper, we re-derive these theorems in a way that allows
reasoning directly with the state-action value function (or
Q-function) and thus relying on the Bellman equation again. As a
result, new forms of critics can easily be integrated into the
actor-critic framework.}
}