@Article{Supelec674,
author = {Olivier Pietquin and Helen Hastie},
title = {{A survey on metrics for the evaluation of user simulations}},
journal = {Knowledge Engineering Review},
year = {2013},
volume = {28},
number = {01},
pages = {59--73},
month = {February},
note = {first published as FirstView},
url = {http://www.metz.supelec.fr/metz/personnel/pietquin/pdf/KER_2013_OPHH.pdf},
doi = {10.1017/S0269888912000343},
abstract = {From the mid-1990s, user simulation has become an important
trend of research in the field of spoken dialogue systems (SDS)
(Eckert et al., 1997; Zuckerman and Albrecht, 2001; Georgila et al.,
2005; Cuayáhuitl et al., 2005; Pietquin, 2006; Schatzmann et al.,
2007b; Janarthanam and Lemon, 2009b; Pietquin et al., 2009), because
collecting and annotating real human-machine interactions is often
expensive and time-consuming. Yet, such data are generally required
for designing, training and assessing dialogue systems (Levin et al.,
2000; Scheffler and Young, 2001; López-Cózar et al., 2003; Pietquin
and Dutoit, 2006; Schatzmann et al., 2007a). Especially when using
machine learning methods for optimising dialogue management
strategies, such as Reinforcement Learning (RL) (Sutton and Barto,
1998), the amount of data necessary for training is larger than
existing corpora. Indeed, exploring the whole dialogue state space
and strategy space requires a number of interactions that increases
exponentially with the number of states, while even simple dialogue
systems have continuous state spaces (because of the inclusion of
speech recognition and understanding confidence levels in the state
description). User simulation is therefore necessary to expand data
sets. The general goal of a user simulation is thus to produce as
many natural, varied and consistent interactions as necessary from as
few data as possible. The quality of the user simulation is therefore
of crucial importance because it dramatically influences the results
in terms of SDS performance analysis and learnt strategy (Schatzmann
et al., 2005b). Assessing the quality of simulated dialogues and user
simulation methods is an open issue and, although assessment metrics
are required, there is no commonly adopted metric (Schatzmann et al.,
2005a; Georgila et al., 2006). In this paper, we first define a list
of desired features of a good user simulation metric. Secondly, the
state of the art of metrics described in the literature is presented.}
}