@Article{Supelec909,
author = {Matthieu Geist},
title = {{Soft-max boosting}},
journal = {Machine Learning},
year = {2015},
volume = {100},
number = {2},
pages = {305--332},
note = {(I discovered after publication that a very similar approach was published some years earlier; see "An iterative method for multi-class cost-sensitive learning" by Abe, Zadrozny and Langford, KDD'04)},
url = {http://www.metz.supelec.fr//metz/personnel/geist_mat/pdfs/ml_sm_boost_rev.pdf},
abstract = {The standard multi-class classification risk, based on the binary
loss, is rarely minimized directly. This is due to (i) its lack of
convexity and (ii) its lack of smoothness (and even continuity).
The classic approach consists in minimizing a convex surrogate
instead. In this paper, we propose to replace the usually
considered deterministic decision rule by a stochastic one, which
yields a smooth risk (generalizing the expected binary loss and,
more generally, the cost-sensitive loss). In practice, this
(empirical) risk is minimized by gradient descent in the function
space linearly spanned by a base learner (a.k.a. boosting). We
provide a convergence analysis of the resulting algorithm and
evaluate it on several synthetic and real-world data sets (with
noiseless and noisy domains, compared to convex and non-convex
boosters).}
}
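
A minimal sketch of the idea summarized in the abstract, assuming a soft-max
(stochastic) decision rule over per-class scores, the plain binary loss as the
cost, and a multi-output scikit-learn regression tree as the base learner; the
function and parameter names (fit_softmax_boost, n_rounds, lr) are illustrative
choices, not taken from the paper.

import numpy as np
from sklearn.tree import DecisionTreeRegressor

def softmax(F):
    Z = np.exp(F - F.max(axis=1, keepdims=True))  # numerically stabilized soft-max
    return Z / Z.sum(axis=1, keepdims=True)

def fit_softmax_boost(X, y, n_classes, n_rounds=100, lr=0.1, max_depth=3):
    n = X.shape[0]
    F = np.zeros((n, n_classes))      # scores of the current ensemble on X
    onehot = np.eye(n_classes)[y]
    trees = []
    for _ in range(n_rounds):
        P = softmax(F)                # stochastic decision rule pi(a|x)
        # Smooth empirical risk: mean_i sum_a pi(a|x_i) 1(a != y_i)
        #                      = mean_i (1 - pi(y_i|x_i)).
        # Its gradient w.r.t. the scores is P * (P[i, y_i] - onehot);
        # a cost matrix in place of the binary loss would generalize this.
        G = P * (P[np.arange(n), y][:, None] - onehot)
        tree = DecisionTreeRegressor(max_depth=max_depth)
        tree.fit(X, -G)               # fit base learner to the negative gradient
        F += lr * tree.predict(X)     # functional gradient step in span(base learner)
        trees.append(tree)
    return trees

def predict_classes(trees, X, n_classes, lr=0.1):
    F = np.zeros((X.shape[0], n_classes))
    for tree in trees:
        F += lr * tree.predict(X)
    return F.argmax(axis=1)

At test time the accumulated scores can either define a sampling distribution
through the soft-max or, as in predict_classes above, be used greedily through
an argmax.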