@Workshop{Supelec862,
author = {Matthieu Geist and Edouard Klein and Bilal Piot and Yann Guermeur and Olivier Pietquin},
title = {{Around Inverse Reinforcement Learning and Score-based Classification}},
year = {2013},
booktitle = {{1st Multidisciplinary Conference on Reinforcement Learning and Decision Making (RLDM 2013)}},
month = {October},
address = {Princeton (USA)},
url = {http://www.metz.supelec.fr//metz/personnel/geist_mat/pdfs/supelec862.pdf},
abstract = {Inverse reinforcement learning (IRL) aims at estimating an unknown reward function optimized by an expert agent from interactions between this expert and the system to be controlled. One of its major application fields is imitation learning, where the goal is to imitate the expert, possibly in situations not encountered before. A classic and simple way to handle this problem is to cast it as a classification problem, mapping states to actions. The potential issue with this approach is that classification does not naturally take into account the temporal structure of sequential decision making. Yet, many classification algorithms work by learning a \textit{score function}, mapping state-action pairs to values, such that the score of the action chosen by the expert is higher than that of the other actions. The \textit{decision rule} of the classifier maximizes the score over actions for a given state. This is curiously reminiscent of the \textit{state-action value function} in reinforcement learning, and of the associated \textit{greedy policy}. Based on this observation, we propose two IRL algorithms that incorporate the structure of the sequential decision making problem into a classifier in different ways. The first one, SCIRL (Structured Classification for IRL), starts from the observation that linearly parameterizing the reward function by some features imposes a linear parameterization of the Q-function by the so-called feature expectation. SCIRL simply uses (an estimate of) the expert's feature expectation as the basis function of the score function. The second algorithm, CSI (Cascaded Supervised IRL), applies a reversed Bellman equation (expressing the reward as a function of the Q-function) to the score function output by any score-based classifier, which reduces to a simple (and generic) regression step. Both algorithms come with theoretical guarantees and perform competitively on toy problems.}
}