% Geist and Scherrer, JMLR 15 (2014): survey of linear off-policy
% value-function learning with eligibility traces.
@article{Supelec875,
  author   = {Geist, Matthieu and Scherrer, Bruno},
  title    = {Off-policy Learning with Eligibility Traces: A Survey},
  journal  = {Journal of Machine Learning Research},
  year     = {2014},
  volume   = {15},
  pages    = {289--333},
  url      = {http://jmlr.org/papers/v15/geist14a.html},
  abstract = {In the framework of Markov Decision Processes, we consider
    linear off-policy learning, that is the problem of learning a linear
    approximation of the value function of some fixed policy from one
    trajectory possibly generated by some other policy. We briefly review
    on-policy learning algorithms of the literature (gradient-based and
    least-squares-based), adopting a unified algorithmic view. Then, we
    highlight a systematic approach for adapting them to off-policy
    learning with eligibility traces. This leads to some known algorithms
    -- off-policy LSTD($\lambda$), LSPE($\lambda$), TD($\lambda$),
    TDC/GQ($\lambda$) -- and suggests new extensions -- off-policy
    FPKF($\lambda$), BRM($\lambda$), gBRM($\lambda$), GTD2($\lambda$). We
    describe a comprehensive algorithmic derivation of all algorithms in a
    recursive and memory-efficient form, discuss their known convergence
    properties and illustrate their relative empirical behavior on Garnet
    problems. Our experiments suggest that the most standard algorithms on
    and off-policy LSTD($\lambda$)/LSPE($\lambda$) -- and TD($\lambda$) if
    the feature space dimension is too large for a least-squares approach
    -- perform the best.},
}