@InProceedings{Supelec694,
author = {Olivier Pietquin and Matthieu Geist and Senthilkumar Chandramohan},
title = {{Sample Efficient On-line Learning of Optimal Dialogue Policies with Kalman Temporal Differences}},
year = {2011},
booktitle = {{International Joint Conference on Artificial Intelligence (IJCAI 2011)}},
pages = {1878--1883},
month = {July},
note = {Oral Presentation},
address = {Barcelona, Spain},
url = {http://www.metz.supelec.fr//metz/personnel/pietquin/pdf/IJCAI_2011_OPMGSC.pdf},
abstract = {Designing dialogue policies for voice-enabled interfaces
is a tailoring job that is most often left to
natural language processing experts. This job is
generally redone for every new dialogue task because
cross-domain transfer is not possible. For this reason,
machine learning methods for dialogue policy
optimization have been investigated over the last
15 years. In particular, reinforcement learning (RL)
is now part of the state of the art in this domain.
Standard RL methods require testing more or less
random changes to the policy on users in order to assess
them as improvements or degradations. This is
called on-policy learning. However, it can result
in system behaviors that are not acceptable
to users. Learning algorithms should ideally infer
an optimal strategy by observing interactions
generated by a non-optimal but acceptable strategy,
that is, learn off-policy. In this contribution,
a sample-efficient, online and off-policy reinforcement
learning algorithm is proposed to learn an optimal
policy from a few hundred dialogues generated
with a very simple handcrafted policy.}
}