@InProceedings{Supelec763,
author = {Lucie Daubigney and Matthieu Geist and Olivier Pietquin},
title = {{Off-policy Learning in Large-scale POMDP-based Dialogue Systems}},
year = {2012},
booktitle = {{Proceedings of the 37th IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2012)}},
publisher = {IEEE},
pages = {4989--4992},
address = {Kyoto, Japan},
url = {http://www.metz.supelec.fr//metz/personnel/geist_mat/pdfs/Supelec763.pdf},
abstract = {Reinforcement learning (RL) is now part of the state of the art
in the domain of spoken dialogue systems (SDS) optimisation. The
best-performing RL methods, such as those based on Gaussian processes,
require testing small changes in the policy to assess whether they are
improvements or degradations. This process is called on-policy learning.
However, it can result in system behaviours that are unacceptable to
users. Learning algorithms should ideally infer an optimal strategy by
observing interactions generated by a non-optimal but acceptable
strategy, that is, learn off-policy. Such methods usually fail to scale
up and are thus not suited to real-world systems. In this contribution,
a sample-efficient, online and off-policy RL algorithm is proposed to
learn an optimal policy. This algorithm is combined with a compact
non-linear value function representation (namely a multilayer
perceptron), enabling it to handle large-scale systems.}
}