Source code for Policies.Softmax

# -*- coding: utf-8 -*-
""" The Boltzmann Exploration (Softmax) index policy.

- Reference: [Algorithms for the multi-armed bandit problem, V.Kuleshov & D.Precup, JMLR, 2008, §2.1](http://www.cs.mcgill.ca/~vkules/bandits.pdf) and [Boltzmann Exploration Done Right, N.Cesa-Bianchi & C.Gentile & G.Lugosi & G.Neu, arXiv 2017](https://arxiv.org/pdf/1705.10257.pdf).

- Very similar to Exp3 but uses a Boltzmann distribution.
  Reference: [Regret Analysis of Stochastic and Nonstochastic Multi-armed Bandit Problems, S.Bubeck & N.Cesa-Bianchi, §3.1](http://sbubeck.com/SurveyBCB12.pdf)
"""

__author__ = "Lilian Besson"
__version__ = "0.6"

import numpy as np
import numpy.random as rn

try:
    from .BasePolicy import BasePolicy
except ImportError:
    from BasePolicy import BasePolicy

#: ``self.unbiased`` is a flag to know whether the rewards are used as biased estimators,
#: i.e., just :math:`r_t`, or as unbiased estimators, :math:`r_t / \mathrm{trusts}_t`.
UNBIASED = True
UNBIASED = False  # the second assignment wins: biased estimators by default
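

# A minimal illustration (not part of the original module) of what the flag changes,
# for a single reward and the current trust probability of the played arm.
# The function name and values below are hypothetical, purely for the example.
def _illustrate_unbiased_flag(r_t=0.7, trust_t=0.25):
    """Return the (biased, unbiased) reward estimates for one observed reward."""
    biased = r_t              # used when ``unbiased`` is False: the raw reward
    unbiased = r_t / trust_t  # used when ``unbiased`` is True: importance-weighted reward
    return biased, unbiased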


class Softmax(BasePolicy):
    r"""The Boltzmann Exploration (Softmax) index policy, with a constant temperature :math:`\eta_t`.

    - Reference: [Algorithms for the multi-armed bandit problem, V.Kuleshov & D.Precup, JMLR, 2008, §2.1](http://www.cs.mcgill.ca/~vkules/bandits.pdf) and [Boltzmann Exploration Done Right, N.Cesa-Bianchi & C.Gentile & G.Lugosi & G.Neu, arXiv 2017](https://arxiv.org/pdf/1705.10257.pdf).

    - Very similar to Exp3 but uses a Boltzmann distribution.
      Reference: [Regret Analysis of Stochastic and Nonstochastic Multi-armed Bandit Problems, S.Bubeck & N.Cesa-Bianchi, §3.1](http://sbubeck.com/SurveyBCB12.pdf)
    """
    def __init__(self, nbArms, temperature=None, unbiased=UNBIASED, lower=0., amplitude=1.):
        super(Softmax, self).__init__(nbArms, lower=lower, amplitude=amplitude)
        if temperature is None:  # Use a default value for the temperature
            temperature = np.sqrt(np.log(nbArms) / nbArms)
        assert temperature > 0, "Error: the temperature parameter for Softmax class has to be > 0."
        self._temperature = temperature
        self.unbiased = unbiased  #: Flag
        # Randomize the order of the initial visits to each arm: a deterministic order
        # breaks the policy's ability to play efficiently in multi-player games,
        # so take a random permutation of the arms.
        self._initial_exploration = rn.permutation(nbArms)
        # The probability that another player draws the same permutation is nbPlayers / factorial(nbArms): should be SMALL!
    def startGame(self):
        """Nothing special to do."""
        super(Softmax, self).startGame()
    def __str__(self):
        return "Softmax(temp: {})".format(self.temperature)
    # This decorator @property makes this method an attribute, cf. https://docs.python.org/3/library/functions.html#property
    @property
    def temperature(self):
        r"""Constant temperature, :math:`\eta_t`."""
        return self._temperature

    @property
    def trusts(self):
        r"""Update the trust probabilities according to the Softmax (i.e., Boltzmann) distribution on accumulated rewards, with temperature :math:`\eta_t`.

        .. math::

           \mathrm{trusts}'_k(t+1) &= \exp\left( \frac{X_k(t)}{\eta_t N_k(t)} \right) \\
           \mathrm{trusts}(t+1) &= \mathrm{trusts}'(t+1) / \sum_{k=1}^{K} \mathrm{trusts}'_k(t+1),

        where :math:`X_k(t) = \sum_{\sigma=1}^{t} 1(A(\sigma) = k) r_k(\sigma)` is the sum of rewards from arm k.
        (A usage sketch recomputing these weights is given after this class.)
        """
        # rewards = (self.rewards - self.lower) / self.amplitude  # XXX not needed, BasePolicy.getReward already rescales
        rewards = self.rewards
        trusts = np.exp(rewards / (self.temperature * (1 + self.pulls)))  # 1 + pulls to prevent division by 0
        if self.unbiased:
            rewards = self.rewards / trusts
            trusts = np.exp(rewards / (self.temperature * (1 + self.pulls)))  # 1 + pulls to prevent division by 0
        # XXX Handle weird cases; this slows everything down but is safer!
        if not np.all(np.isfinite(trusts)):
            trusts[~np.isfinite(trusts)] = 0  # set bad values to 0
        # Bad case, where the sum is so small that it is only rounding errors,
        # or where all values were bad and forced to 0: restart with trusts = [1/K, ..., 1/K]
        if np.isclose(np.sum(trusts), 0):
            trusts[:] = 1.0 / self.nbArms
        # Normalize it and return it
        return trusts / np.sum(trusts)

    # --- Choice methods
    def choice(self):
        """One random selection, with probabilities = trusts, thanks to :func:`numpy.random.choice`."""
        # Force to first visit each arm once in the first steps
        if self.t < self.nbArms:
            # return self.t  # random permutation instead of deterministic order!
            return self._initial_exploration[self.t]  # DONE
        else:
            return rn.choice(self.nbArms, p=self.trusts)
    def choiceWithRank(self, rank=1):
        """Multiple (rank >= 1) random selections, with probabilities = trusts, thanks to :func:`numpy.random.choice`; return the last one (the least probable one).

        - Note that if not enough entries in the trust vector are non-zero, then :func:`choice` is called instead (rank is ignored).
        """
        if (self.t < self.nbArms) or (rank == 1) or np.sum(~np.isclose(self.trusts, 0)) < rank:
            return self.choice()
        else:
            return rn.choice(self.nbArms, size=rank, replace=False, p=self.trusts)[-1]
    def choiceFromSubSet(self, availableArms='all'):
        """One random selection, from availableArms, with probabilities = trusts, thanks to :func:`numpy.random.choice`."""
        if self.t < self.nbArms:
            return availableArms[self.t % len(availableArms)]
        elif (availableArms == 'all') or (len(availableArms) == self.nbArms):
            return self.choice()
        else:
            return rn.choice(availableArms, p=self.trusts[availableArms])
    def choiceMultiple(self, nb=1):
        """Multiple (nb >= 1) random selections, with probabilities = trusts, thanks to :func:`numpy.random.choice`."""
        if nb == 1:
            return np.array([self.choice()])
        else:
            return rn.choice(self.nbArms, size=nb, replace=False, p=self.trusts)
    # --- Other methods

    def estimatedOrder(self):
        """Return the estimated order of the arms, as a permutation of [0..K-1] that would sort the arms by increasing trust probabilities."""
        return np.argsort(self.trusts)
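
# A minimal usage sketch (not part of the original module), assuming the standard
# ``BasePolicy`` interface used elsewhere in this project (``startGame``,
# ``getReward(arm, reward)``, and the ``rewards`` / ``pulls`` attributes).
# The function name and the Bernoulli means below are arbitrary, purely for illustration.
def _demo_softmax(horizon=200, means=(0.1, 0.5, 0.9), seed=0):
    """Run a short simulated game and check the trusts formula by hand."""
    rng = np.random.RandomState(seed)
    policy = Softmax(nbArms=len(means))
    policy.startGame()
    for _ in range(horizon):
        arm = policy.choice()  # first K steps: randomized initial exploration
        reward = float(rng.random_sample() < means[arm])  # Bernoulli reward
        policy.getReward(arm, reward)
    # Recompute the (biased, default) Boltzmann weights to check the ``trusts`` property
    weights = np.exp(policy.rewards / (policy.temperature * (1 + policy.pulls)))
    assert np.allclose(policy.trusts, weights / np.sum(weights))
    print("trusts =", policy.trusts)
    print("estimated order =", policy.estimatedOrder())  # best arm should come last
    return policy
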
# --- Three special cases
class SoftmaxWithHorizon(Softmax):
    r"""Softmax with fixed temperature :math:`\eta_t = \eta_0` chosen with knowledge of the horizon."""
    def __init__(self, nbArms, horizon, lower=0., amplitude=1.):
        super(SoftmaxWithHorizon, self).__init__(nbArms, lower=lower, amplitude=amplitude)
        assert horizon > 0, "Error: the 'horizon' parameter for SoftmaxWithHorizon class has to be > 0."
        self.horizon = int(horizon)  #: Parameter :math:`T` = known horizon of the experiment.
    def __str__(self):
        return "Softmax($T={}$)".format(self.horizon)
    # This decorator @property makes this method an attribute, cf. https://docs.python.org/3/library/functions.html#property
    @property
    def temperature(self):
        r"""Fixed temperature, small, knowing the horizon: :math:`\eta_t = \sqrt{\frac{2 \log(K)}{T K}}` (*heuristic*).

        - Cf. Theorem 3.1 case #1 of [Bubeck & Cesa-Bianchi, 2012](http://sbubeck.com/SurveyBCB12.pdf).
        """
        return np.sqrt(2 * np.log(self.nbArms) / (self.horizon * self.nbArms))
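
# A small sketch (not part of the original module) of the horizon-tuned temperature:
# with K = 10 arms and T = 10000 steps it is sqrt(2*log(10)/(10000*10)) ≈ 0.0068,
# i.e., much colder (more exploiting) than the anytime default sqrt(log(K)/K) ≈ 0.48.
# The function name is hypothetical, purely for illustration.
def _demo_horizon_temperature(nbArms=10, horizon=10000):
    """Compare the horizon-tuned temperature to the default anytime one."""
    tuned = SoftmaxWithHorizon(nbArms, horizon).temperature
    anytime = Softmax(nbArms).temperature
    return tuned, anytime
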
class SoftmaxDecreasing(Softmax):
    r"""Softmax with decreasing temperature :math:`\eta_t`."""
    def __str__(self):
        return "Softmax(decreasing)"
    # This decorator @property makes this method an attribute, cf. https://docs.python.org/3/library/functions.html#property
    @property
    def temperature(self):
        r"""Decreasing temperature with the time: :math:`\eta_t = \sqrt{\frac{\log(K)}{t K}}` (*heuristic*).

        - Cf. Theorem 3.1 case #2 of [Bubeck & Cesa-Bianchi, 2012](http://sbubeck.com/SurveyBCB12.pdf).
        """
        return np.sqrt(np.log(self.nbArms) / (self.t * self.nbArms))
class SoftMix(Softmax):
    r"""Another Softmax with decreasing temperature :math:`\eta_t`."""
    def __str__(self):
        return "SoftMix"
    # This decorator @property makes this method an attribute, cf. https://docs.python.org/3/library/functions.html#property
    @property
    def temperature(self):
        r"""Decreasing temperature with the time: :math:`\eta_t = c \frac{\log(t)}{t}` (*heuristic*).

        - Cf. [Cesa-Bianchi & Fischer, 1998](http://dl.acm.org/citation.cfm?id=657473).

        - Default value is :math:`c = \sqrt{\frac{\log(K)}{K}}`.
        """
        c = np.sqrt(np.log(self.nbArms) / self.nbArms)
        if self.t <= 1:
            return c
        else:
            return c * np.log(self.t) / self.t
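
# A minimal sketch (not part of the original module) comparing the two decreasing
# temperature schedules above; values are only meaningful once t >= 1.
# The function name is hypothetical, purely for illustration.
def _demo_decreasing_temperatures(nbArms=10, times=(1, 10, 100, 1000)):
    """Print eta_t for SoftmaxDecreasing and SoftMix at a few time steps."""
    decreasing = SoftmaxDecreasing(nbArms)
    softmix = SoftMix(nbArms)
    for t in times:
        decreasing.t = t  # the time counter is normally advanced by getReward()
        softmix.t = t
        print("t = {:4d}: SoftmaxDecreasing eta = {:.4f}, SoftMix eta = {:.4f}".format(
            t, decreasing.temperature, softmix.temperature))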