Source code for Policies.Hedge

# -*- coding: utf-8 -*-
""" The Hedge randomized index policy.

Reference: [Regret Analysis of Stochastic and Nonstochastic Multi-armed Bandit Problems, S.Bubeck & N.Cesa-Bianchi](http://research.microsoft.com/en-us/um/people/sebubeck/SurveyBCB12.pdf)
"""
from __future__ import division, print_function  # Python 2 compatibility

__author__ = "Lilian Besson"
__version__ = "0.7"

import numpy as np
import numpy.random as rn
try:
    from .BasePolicy import BasePolicy
except ImportError:
    from BasePolicy import BasePolicy

#: Default :math:`\varepsilon` parameter.
EPSILON = 0.01


class Hedge(BasePolicy):
    """ The Hedge randomized index policy.

    Reference: [Regret Analysis of Stochastic and Nonstochastic Multi-armed Bandit Problems, S.Bubeck & N.Cesa-Bianchi, §3.1](http://research.microsoft.com/en-us/um/people/sebubeck/SurveyBCB12.pdf).
    """

    def __init__(self, nbArms, epsilon=EPSILON, lower=0., amplitude=1.):
        super(Hedge, self).__init__(nbArms, lower=lower, amplitude=amplitude)
        if epsilon is None:  # Use a default value for the epsilon parameter
            epsilon = np.sqrt(np.log(nbArms) / nbArms)
        assert 0 < epsilon <= 1, "Error: the 'epsilon' parameter for Hedge class has to be in (0, 1]."  # DEBUG
        self._epsilon = epsilon
        # Internal memory
        self.weights = np.full(nbArms, 1. / nbArms)  #: Weights on the arms
        # Randomize the order of the initial visits to each arm; a deterministic order would break its ability to play efficiently in multi-player games
        # XXX be even more randomized: take a random permutation of the arms?
        self._initial_exploration = rn.permutation(nbArms)
        # The probability that another player has the same permutation is nbPlayers / factorial(nbArms): it should be SMALL!

    def startGame(self):
        """Start with uniform weights."""
        super(Hedge, self).startGame()
        self.weights.fill(1. / self.nbArms)

    def __str__(self):
        return r"Hedge($\varepsilon: {:.3g}$)".format(self.epsilon)

    # This decorator @property makes this method an attribute, cf. https://docs.python.org/3/library/functions.html#property
    @property
    def epsilon(self):
        r"""Constant :math:`\varepsilon_t = \varepsilon`."""
        return self._epsilon

    @property
    def trusts(self):
        r"""Trust probabilities according to the Hedge formula: simply the normalized weights.

        .. math:: \mathrm{trusts}_k(t+1) = w_k(t) / \sum_{j=1}^{K} w_j(t),

        where :math:`w_k(t)` is the current weight of arm k.
        """
        trusts = self.weights
        # XXX Handle weird cases; this slows everything down but is safer!
        if not np.all(np.isfinite(trusts)):
            trusts[~np.isfinite(trusts)] = 0  # set bad values to 0
        # Bad case, where the sum is so small that it's only rounding errors,
        # or where all values were bad and forced to 0: restart with trusts = [1/K, ..., 1/K]
        if np.isclose(np.sum(trusts), 0):
            trusts[:] = 1.0 / self.nbArms
        # Normalize it and return it
        return trusts / np.sum(trusts)

    def getReward(self, arm, reward):
        r"""Give a reward: accumulate rewards on that arm k, then update the weight :math:`w_k(t)` and renormalize the weights.

        .. math::

           w'_k(t+1) &= w_k(t) \times \exp\left( - \varepsilon_t \, \ell_k(t) \right), \quad \text{with the loss } \ell_k(t) = 1 - r_k(t), \\
           w(t+1) &= w'(t+1) / \sum_{k=1}^{K} w'_k(t+1).
        """
        super(Hedge, self).getReward(arm, reward)  # XXX Call to BasePolicy
        # Update the weight of THIS arm, with this reward
        reward = (reward - self.lower) / self.amplitude
        loss = 1 - reward
        # Multiplicative weights update
        self.weights[arm] *= np.exp(- loss * self.epsilon)
        # Renormalize weights at each step
        self.weights /= np.sum(self.weights)
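
    # A tiny worked example of the update above (illustrative numbers, not from
    # the source): with K = 2 arms, weights w = [0.5, 0.5] and epsilon = 0.1,
    # a reward of 0 on arm 1 gives the loss 1, so w_1 <- 0.5 * exp(-0.1) ~= 0.452,
    # and after renormalization the trusts become roughly [0.525, 0.475].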

    # --- Choice methods

    def choice(self):
        """One random selection, with probabilities = trusts, thanks to :func:`numpy.random.choice`."""
        # Force to first visit each arm once in the first steps
        if self.t < self.nbArms:
            # DONE we could use a random permutation instead of a deterministic order!
            return self._initial_exploration[self.t]
        else:
            return rn.choice(self.nbArms, p=self.trusts)

    def choiceWithRank(self, rank=1):
        """Multiple (rank >= 1) random selections, with probabilities = trusts, thanks to :func:`numpy.random.choice`, and select the last one (the least probable).

        - Note that if not enough entries in the trust vector are non-zero, then :func:`choice` is called instead (rank is ignored).
        """
        if (self.t < self.nbArms) or (rank == 1) or np.sum(~np.isclose(self.trusts, 0)) < rank:
            return self.choice()
        else:
            return rn.choice(self.nbArms, size=rank, replace=False, p=self.trusts)[rank - 1]

    def choiceFromSubSet(self, availableArms='all'):
        """One random selection, from availableArms, with probabilities = trusts, thanks to :func:`numpy.random.choice`."""
        if (self.t < self.nbArms) or (availableArms == 'all') or (len(availableArms) == self.nbArms):
            return self.choice()
        else:
            # Restrict the trusts to the available arms, and renormalize them so they sum to 1
            trusts = self.trusts[availableArms]
            return rn.choice(availableArms, p=trusts / np.sum(trusts))

    def choiceMultiple(self, nb=1):
        """Multiple (nb >= 1) random selections, with probabilities = trusts, thanks to :func:`numpy.random.choice`."""
        if (self.t < self.nbArms) or (nb == 1):
            return np.array([self.choice() for _ in range(nb)])  # good size if nb > 1 but t < nbArms
        else:
            return rn.choice(self.nbArms, size=nb, replace=False, p=self.trusts)

    # --- Other methods

    def estimatedOrder(self):
        """ Return the estimated order of the arms, as a permutation of [0..K-1] that would sort the arms by increasing trust probabilities."""
        return np.argsort(self.trusts)

    def estimatedBestArms(self, M=1):
        """ Return a (not necessarily sorted) list of the indexes of the M best arms. Identifies the set of the M best arms."""
        assert 1 <= M <= self.nbArms, "Error: the parameter 'M' has to be between 1 and K = {}, but it was {} ...".format(self.nbArms, M)  # DEBUG
        order = self.estimatedOrder()
        return order[-M:]


# --- Two special cases

class HedgeWithHorizon(Hedge):
    r""" Hedge with a fixed parameter :math:`\varepsilon_t = \varepsilon_0`, chosen with knowledge of the horizon."""

    def __init__(self, nbArms, horizon, lower=0., amplitude=1.):
        super(HedgeWithHorizon, self).__init__(nbArms, lower=lower, amplitude=amplitude)
        assert horizon > 0, "Error: the 'horizon' parameter for HedgeWithHorizon class has to be > 0."
        self.horizon = int(horizon)  #: Parameter :math:`T` = known horizon of the experiment.

    def __str__(self):
        return r"Hedge($T={}$)".format(self.horizon)

    # This decorator @property makes this method an attribute, cf. https://docs.python.org/3/library/functions.html#property
    @property
    def epsilon(self):
        r""" Fixed value of the parameter, small, knowing the horizon: :math:`\varepsilon_t = \sqrt{\frac{2 \log(K)}{T K}}` (*heuristic*).

        - Cf. Theorem 3.1, case #1, of [Bubeck & Cesa-Bianchi, 2012](http://sbubeck.com/SurveyBCB12.pdf).
        """
        return np.sqrt(2 * np.log(self.nbArms) / (self.horizon * self.nbArms))
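
    # For instance (illustrative values, not from the source), with K = 10 arms
    # and a known horizon T = 10000, this gives
    # epsilon = sqrt(2 * log(10) / (10000 * 10)) ~= 0.0068.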


class HedgeDecreasing(Hedge):
    r""" Hedge with a decreasing parameter :math:`\varepsilon_t`."""

    def __str__(self):
        return "Hedge(decreasing)"

    # This decorator @property makes this method an attribute, cf. https://docs.python.org/3/library/functions.html#property
    @property
    def epsilon(self):
        r""" Decreasing :math:`\varepsilon_t` with the time: :math:`\varepsilon_t = \min\left(\frac{1}{K}, \sqrt{\frac{\log(K)}{t K}}\right)` (*heuristic*).

        - Cf. Theorem 3.1, case #2, of [Bubeck & Cesa-Bianchi, 2012](http://sbubeck.com/SurveyBCB12.pdf).
        """
        return min(1. / self.nbArms, np.sqrt(np.log(self.nbArms) / (self.t * self.nbArms)))
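

# --- Minimal usage sketch (not part of the original module): it only relies on
# the startGame / choice / getReward API defined above. The number of arms, the
# horizon and the Bernoulli means below are illustrative assumptions, chosen
# only to show how a Hedge policy is driven on a toy bandit problem.
if __name__ == "__main__":
    means = [0.1, 0.5, 0.9]  # illustrative Bernoulli means, one per arm
    horizon = 1000
    policy = HedgeDecreasing(len(means))
    policy.startGame()
    total_reward = 0.0
    for _ in range(horizon):
        arm = policy.choice()                     # sample an arm from the trusts
        reward = float(rn.random() < means[arm])  # draw a Bernoulli reward in {0, 1}
        policy.getReward(arm, reward)             # multiplicative-weights update
        total_reward += reward
    print("Final trusts:", policy.trusts)
    print("Mean reward: {:.3f} (the best arm would give {:.3f})".format(total_reward / horizon, max(means)))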