# -*- coding: utf-8 -*-
r""" The epsilon-greedy random policies, with the naive one and some variants.
- At every time step, a fully uniform random exploration has probability :math:`\varepsilon(t)` to happen, otherwise an exploitation is done on accumulated rewards (not means).
- Ref: https://en.wikipedia.org/wiki/Multi-armed_bandit#Semi-uniform_strategies
.. warning:: Unless :math:`\varepsilon(t)` is optimally tuned for the specific problem at hand, none of these policies can hope to be efficient.
"""
from __future__ import division, print_function # Python 2 compatibility
__author__ = "Lilian Besson"
__version__ = "0.9"
from random import random
import numpy as np
import numpy.random as rn
try:
from .BasePolicy import BasePolicy
from .with_proba import with_proba
except ImportError:
from BasePolicy import BasePolicy
from with_proba import with_proba
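# Note: `with_proba(p)` draws True with probability p. If the helper module were
# unavailable, a minimal equivalent (an assumption, not the project's exact code)
# would be:  with_proba = lambda p: random() < p   (using the `random` imported above)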
#: Default value for epsilon for :class:`EpsilonGreedy`
EPSILON = 0.1
class EpsilonGreedy(BasePolicy):
r""" The epsilon-greedy random policy.
- At every time step, a fully uniform random exploration has probability :math:`\varepsilon(t)` to happen, otherwise an exploitation is done on accumulated rewards (not means).
- Ref: https://en.wikipedia.org/wiki/Multi-armed_bandit#Semi-uniform_strategies
"""
    def __init__(self, nbArms, epsilon=EPSILON, lower=0., amplitude=1.):
        super(EpsilonGreedy, self).__init__(nbArms, lower=lower, amplitude=amplitude)
        assert 0 <= epsilon <= 1, "Error: the 'epsilon' parameter for EpsilonGreedy class has to be in [0, 1]."  # DEBUG
        self._epsilon = epsilon
# This decorator @property makes this method an attribute, cf. https://docs.python.org/3/library/functions.html#property
@property
def epsilon(self): # Allow child classes to use time-dependent epsilon coef
return self._epsilon
    def __str__(self):
return r"EpsilonGreedy($\varepsilon={:.3g}$)".format(self.epsilon)
    def choice(self):
        """With a probability of epsilon, explore (uniform choice), otherwise exploit based on just accumulated *rewards* (not empirical mean rewards)."""
        if with_proba(self.epsilon):  # Proba epsilon : explore
            return rn.randint(0, self.nbArms)  # numpy's randint excludes the upper bound, so this covers all arms
        else:  # Proba 1 - epsilon : exploit
            # Uniform choice among the best arms, by maximal accumulated reward.
            # WARNING why max on rewards and not mean rewards? The empirical means
            # would be self.rewards / (1 + self.pulls).
            return rn.choice(np.nonzero(self.rewards == np.max(self.rewards))[0])
    def choiceWithRank(self, rank=1):
        """With a probability of epsilon, explore (uniform choice), otherwise exploit with the rank, based on just accumulated *rewards* (not empirical mean rewards)."""
        if rank == 1:
            return self.choice()
        else:
            if with_proba(self.epsilon):  # Proba epsilon : explore
                return rn.randint(0, self.nbArms)  # numpy's randint excludes the upper bound, so this covers all arms
            else:  # Proba 1 - epsilon : exploit
                sortedRewards = np.sort(self.rewards)
                chosenReward = sortedRewards[-rank]  # the rank-th largest accumulated reward
                # Uniform choice among the rank-th best arms
                return rn.choice(np.nonzero(self.rewards == chosenReward)[0])
    def choiceFromSubSet(self, availableArms='all'):
        """With a probability of epsilon, explore (uniform choice among the available arms), otherwise exploit among the available arms, based on accumulated rewards."""
        if (availableArms == 'all') or (len(availableArms) == self.nbArms):
            return self.choice()
        elif len(availableArms) == 0:
            print("WARNING: EpsilonGreedy.choiceFromSubSet({}): the argument availableArms of type {} should not be empty.".format(availableArms, type(availableArms)))  # DEBUG
            # WARNING if no arms are tagged as available, what to do ? choose an arm at random, or call choice() as if available == 'all'
            return self.choice()
            # return np.random.randint(self.nbArms)
        else:
            if with_proba(self.epsilon):  # Proba epsilon : explore
                return rn.choice(availableArms)
            else:  # Proba 1 - epsilon : exploit
                # Uniform choice among the best available arms
                # (map positions within the subset back to actual arm indices)
                availableArms = np.asarray(availableArms)
                bestInSubset = np.nonzero(self.rewards[availableArms] == np.max(self.rewards[availableArms]))[0]
                return rn.choice(availableArms[bestInSubset])
    def choiceMultiple(self, nb=1):
        """With a probability of epsilon, explore (uniform choice of nb distinct arms), otherwise exploit the nb best arms, based on accumulated rewards."""
        if nb == 1:
            return np.array([self.choice()])
        else:
            # FIXME the explore/exploit balance should be for each choice, right?
            if with_proba(self.epsilon):  # Proba epsilon : explore
                return rn.choice(self.nbArms, size=nb, replace=False)
            else:  # Proba 1 - epsilon : exploit
                sortedRewards = np.sort(self.rewards)
                # Uniform choice among the nb best arms
                return rn.choice(np.nonzero(self.rewards >= sortedRewards[-nb])[0], size=nb, replace=False)
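

# A minimal usage sketch of EpsilonGreedy, run only when this file is executed
# directly. It assumes, as elsewhere in this project, that BasePolicy provides
# startGame() and getReward(arm, reward); the Bernoulli means below are made up.
if __name__ == "__main__":
    means = [0.1, 0.5, 0.9]
    policy = EpsilonGreedy(nbArms=len(means), epsilon=0.1)
    policy.startGame()
    for _ in range(1000):
        arm = policy.choice()
        reward = float(rn.random_sample() < means[arm])  # simulated Bernoulli reward
        policy.getReward(arm, reward)
    # Most pulls should concentrate on the best arm (index 2)
    print("EpsilonGreedy pulls after 1000 steps:", policy.pulls)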
# --- Epsilon-Decreasing
#: Default value for epsilon for :class:`EpsilonDecreasing`
EPSILON = 0.1
class EpsilonDecreasing(EpsilonGreedy):
r""" The epsilon-decreasing random policy.
- :math:`\varepsilon(t) = \min(1, \varepsilon_0 / \max(1, t))`
- Ref: https://en.wikipedia.org/wiki/Multi-armed_bandit#Semi-uniform_strategies
"""
    def __init__(self, nbArms, epsilon=EPSILON, lower=0., amplitude=1.):
super(EpsilonDecreasing, self).__init__(nbArms, lower=lower, amplitude=amplitude)
# assert 0. <= epsilon <= 1., "Error: the 'epsilon' parameter for EpsilonDecreasing class has to be in [0, 1]." # DEBUG
self._epsilon = epsilon
    def __str__(self):
return r"EpsilonDecreasing($\varepsilon_0={:.3g}$)".format(self._epsilon)
# This decorator @property makes this method an attribute, cf. https://docs.python.org/3/library/functions.html#property
@property
def epsilon(self):
r"""Decreasing :math:`\varepsilon(t) = \min(1, \varepsilon_0 / \max(1, t))`."""
return min(1, self._epsilon / max(1, self.t))
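

# A quick sketch of the decay schedule. Assumption: `t` is the internal time
# attribute inherited from BasePolicy; it is set by hand here only to inspect epsilon.
if __name__ == "__main__":
    policy = EpsilonDecreasing(nbArms=3, epsilon=5.0)
    for t in [1, 5, 50, 500]:
        policy.t = t
        print("EpsilonDecreasing: epsilon({}) = {:.3g}".format(t, policy.epsilon))
    # Prints 1, 1, 0.1, 0.01 : capped at 1 early on, then ~ epsilon_0 / t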
# --- EpsilonDecreasingMEGA
C = 0.1 #: Constant C in the MEGA formula
D = 0.5  #: Constant D in the MEGA formula
def epsilon0(c, d, nbArms):
r"""MEGA heuristic:
.. math:: \varepsilon_0 = \frac{c K^2}{d^2 (K - 1)}.
"""
return (c * nbArms**2) / (d**2 * (nbArms - 1))
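
# Worked example: with the defaults c = 0.1, d = 0.5 and K = 10 arms,
# epsilon0 = (0.1 * 10**2) / (0.5**2 * (10 - 1)) = 10 / 2.25 ~= 4.44,
# so a min(1, epsilon0 / t) schedule forces pure exploration (epsilon = 1)
# for roughly the first 4 steps.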
class EpsilonDecreasingMEGA(EpsilonGreedy):
r""" The epsilon-decreasing random policy, using MEGA's heuristic for a good choice of epsilon0 value.
- :math:`\varepsilon(t) = \min(1, \varepsilon_0 / \max(1, t))`
- :math:`\varepsilon_0 = \frac{c K^2}{d^2 (K - 1)}`
- Ref: https://en.wikipedia.org/wiki/Multi-armed_bandit#Semi-uniform_strategies
"""
    def __init__(self, nbArms, c=C, d=D, lower=0., amplitude=1.):
super(EpsilonDecreasingMEGA, self).__init__(nbArms, lower=lower, amplitude=amplitude)
self._epsilon = epsilon0(c, d, nbArms)
    def __str__(self):
        return r"EpsilonDecreasingMEGA($\varepsilon_0={:.3g}$)".format(self._epsilon)
# This decorator @property makes this method an attribute, cf. https://docs.python.org/3/library/functions.html#property
@property
def epsilon(self):
r"""Decreasing :math:`\varepsilon(t) = \min(1, \varepsilon_0 / \max(1, t))`."""
return min(1, self._epsilon / max(1, self.t))
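

# Sketch: c and d only set the initial exploration budget epsilon0; the schedule
# itself is the same min(1, epsilon0 / t) decay as EpsilonDecreasing. The time
# attribute is set by hand only to inspect epsilon.
if __name__ == "__main__":
    policy = EpsilonDecreasingMEGA(nbArms=10, c=C, d=D)
    print("EpsilonDecreasingMEGA: epsilon0 = {:.3g}".format(policy._epsilon))  # ~= 4.44
    policy.t = 100
    print("EpsilonDecreasingMEGA: epsilon(100) = {:.3g}".format(policy.epsilon))  # ~= 0.0444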
# --- Epsilon-First
#: Default value for epsilon for :class:`EpsilonFirst`
EPSILON = 0.01
class EpsilonFirst(EpsilonGreedy):
    r""" The epsilon-first random policy: pure uniform exploration for the first :math:`\varepsilon_0 T` steps, then pure exploitation afterwards.
    - Ref: https://en.wikipedia.org/wiki/Multi-armed_bandit#Semi-uniform_strategies
"""
    def __init__(self, nbArms, horizon, epsilon=EPSILON, lower=0., amplitude=1.):
super(EpsilonFirst, self).__init__(nbArms, epsilon=epsilon, lower=lower, amplitude=amplitude)
assert horizon > 0, "Error: the 'horizon' parameter for EpsilonFirst class has to be > 0."
self.horizon = int(horizon) #: Parameter :math:`T` = known horizon of the experiment.
# assert 0 <= epsilon <= 1, "Error: the 'epsilon' parameter for EpsilonFirst class has to be in [0, 1]." # DEBUG
self._epsilon = epsilon
    def __str__(self):
return r"EpsilonFirst($T={}$, $\varepsilon={:.3g}$)".format(self.horizon, self._epsilon)
# This decorator @property makes this method an attribute, cf. https://docs.python.org/3/library/functions.html#property
@property
def epsilon(self):
r"""1 while :math:`t \leq \varepsilon_0 T`, 0 after."""
if self.t <= self._epsilon * self.horizon:
# First phase: randomly explore!
return 1
else:
# Second phase: just exploit!
return 0
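

# Sketch of the two phases: epsilon is exactly 1 (pure exploration) while
# t <= epsilon_0 * T, and exactly 0 (pure exploitation) afterwards. The time
# attribute is set by hand only to inspect epsilon.
if __name__ == "__main__":
    policy = EpsilonFirst(nbArms=3, horizon=1000, epsilon=0.01)
    for t in [5, 10, 11, 500]:
        policy.t = t
        print("EpsilonFirst: epsilon({}) = {}".format(t, policy.epsilon))
    # epsilon_0 * T = 10, so this prints 1, 1, 0, 0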
#: Default value for epsilon for :class:`EpsilonExpDecreasing`
EPSILON = 0.1
#: Default value for the constant for the decreasing rate
DECREASINGRATE = 1e-6
class EpsilonExpDecreasing(EpsilonGreedy):
    r""" The epsilon exp-decreasing random policy.
    - :math:`\varepsilon(t) = \min(1, \varepsilon_0 \exp(- t \tau))`, where :math:`\tau` is the ``decreasingRate`` parameter.
    - Ref: https://en.wikipedia.org/wiki/Multi-armed_bandit#Semi-uniform_strategies
"""
    def __init__(self, nbArms, epsilon=EPSILON, decreasingRate=DECREASINGRATE, lower=0., amplitude=1.):
super(EpsilonExpDecreasing, self).__init__(nbArms, lower=lower, amplitude=amplitude)
# assert 0 <= epsilon <= 1, "Error: the 'epsilon' parameter for EpsilonExpDecreasing class has to be in [0, 1]." # DEBUG
self._epsilon = epsilon
assert decreasingRate > 0, "Error: the 'decreasingRate' parameter for EpsilonExpDecreasing class has to be > 0."
self._decreasingRate = decreasingRate
    def __str__(self):
        return r"EpsilonExpDecreasing($\varepsilon_0={:.3g}$, $\tau={:.3g}$)".format(self._epsilon, self._decreasingRate)
# This decorator @property makes this method an attribute, cf. https://docs.python.org/3/library/functions.html#property
@property
def epsilon(self):
r"""Decreasing :math:`\varepsilon(t) = \min(1, \varepsilon_0 \exp(- t \tau))`."""
return min(1, self._epsilon * np.exp(- self.t * self._decreasingRate))
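

# Sketch of the exponential decay: with the default rate 1e-6 the schedule is
# almost flat for small t, and only decays noticeably once t ~ 1 / decreasingRate.
# The time attribute is set by hand only to inspect epsilon.
if __name__ == "__main__":
    policy = EpsilonExpDecreasing(nbArms=3, epsilon=0.5, decreasingRate=1e-6)
    for t in [0, 10**5, 10**6, 10**7]:
        policy.t = t
        print("EpsilonExpDecreasing: epsilon({}) = {:.3g}".format(t, policy.epsilon))
    # Prints 0.5, ~0.452, ~0.184, ~2.27e-05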