@InProceedings{Supelec902,
author = {Bilal PIOT and Matthieu Geist and Olivier Pietquin},
title = {{Difference of Convex Functions Programming for Reinforcement Learning}},
year = {2014},
booktitle = {{Advances in Neural Information Processing Systems (NIPS 2014)}},
url = {http://www.metz.supelec.fr//metz/personnel/geist_mat/pdfs/supelec902.pdf},
abstract = {Large Markov Decision Processes (MDPs) are usually solved using
Approximate Dynamic Programming (ADP) methods such as Approximate
Value Iteration (AVI) or Approximate Policy Iteration (API). The
main contribution of this paper is to show that, alternatively,
the optimal state-action value function can be estimated using
Difference of Convex functions (DC) Programming. To do so, we
study the minimization of a norm of the Optimal Bellman Residual
(OBR) $T^*Q-Q$, where $T^*$ is the so-called optimal Bellman
operator. Controlling this residual allows controlling the
distance to the optimal action-value function, and we show that
minimizing an empirical norm of the OBR is consistant in the
Vapnik sense. Finally, we frame this optimization problem as a DC
program. That allows envisioning using the large related
literature on DC Programming to address the Reinforcement Leaning
(RL) problem.}
}