
# -*- coding: utf-8 -*-
r""" The epsilon-greedy random policies, with the naive one and some variants.

- At every time step, a fully uniform random exploration has probability :math:`\varepsilon(t)` to happen, otherwise an exploitation is done on accumulated rewards (not means).
- Ref: https://en.wikipedia.org/wiki/Multi-armed_bandit#Semi-uniform_strategies

.. warning:: Unless :math:`\varepsilon(t)` is optimally tuned for a specific problem, none of these policies can hope to be efficient.
"""
from __future__ import division, print_function  # Python 2 compatibility

__author__ = "Lilian Besson"
__version__ = "0.9"

import numpy as np
import numpy.random as rn

try:
    from .BasePolicy import BasePolicy
    from .with_proba import with_proba
except ImportError:
    from BasePolicy import BasePolicy
    from with_proba import with_proba


#: Default value for epsilon for :class:`EpsilonGreedy`
EPSILON = 0.1


class EpsilonGreedy(BasePolicy):
    r""" The epsilon-greedy random policy.

    - At every time step, a fully uniform random exploration has probability :math:`\varepsilon(t)` to happen, otherwise an exploitation is done on accumulated rewards (not means).
    - Ref: https://en.wikipedia.org/wiki/Multi-armed_bandit#Semi-uniform_strategies
    """

    def __init__(self, nbArms, epsilon=EPSILON, lower=0., amplitude=1.):
        super(EpsilonGreedy, self).__init__(nbArms, lower=lower, amplitude=amplitude)
        # assert 0 <= epsilon <= 1, "Error: the 'epsilon' parameter for EpsilonGreedy class has to be in [0, 1]."  # DEBUG
        self._epsilon = epsilon

    # This decorator @property makes this method an attribute, cf. https://docs.python.org/3/library/functions.html#property
    @property
    def epsilon(self):
        # Allow child classes to use a time-dependent epsilon coefficient
        return self._epsilon

    def __str__(self):
        return r"EpsilonGreedy($\varepsilon={:.3g}$)".format(self.epsilon)

    def choice(self):
        """With a probability of epsilon, explore (uniform choice), otherwise exploit based on just accumulated *rewards* (not empirical mean rewards)."""
        if with_proba(self.epsilon):  # Proba epsilon : explore
            return rn.randint(0, self.nbArms)  # WARNING numpy's randint excludes the upper bound, so using nbArms - 1 here would never pick the last arm
        else:  # Proba 1 - epsilon : exploit
            # Uniform choice among the best arms
            # biased_means = self.rewards / (1 + self.pulls)
            # return rn.choice(np.nonzero(biased_means == np.max(biased_means))[0])
            # WARNING why max on rewards and not mean rewards?
            return rn.choice(np.nonzero(self.rewards == np.max(self.rewards))[0])

    def choiceWithRank(self, rank=1):
        """With a probability of epsilon, explore (uniform choice), otherwise exploit with the rank, based on just accumulated *rewards* (not empirical mean rewards)."""
        if rank == 1:
            return self.choice()
        else:
            if with_proba(self.epsilon):  # Proba epsilon : explore
                return rn.randint(0, self.nbArms)
            else:  # Proba 1 - epsilon : exploit
                sortedRewards = np.sort(self.rewards)
                chosenReward = sortedRewards[-rank]  # value of the rank-th largest accumulated reward
                # Uniform choice among the rank-th best arms
                return rn.choice(np.nonzero(self.rewards == chosenReward)[0])

    def choiceFromSubSet(self, availableArms='all'):
        """Same as :meth:`choice`, but the exploration and exploitation are restricted to the given subset of available arms."""
        if (availableArms == 'all') or (len(availableArms) == self.nbArms):
            return self.choice()
        elif len(availableArms) == 0:
            print("WARNING: EpsilonGreedy.choiceFromSubSet({}): the argument availableArms of type {} should not be empty.".format(availableArms, type(availableArms)))  # DEBUG
            # WARNING if no arms are tagged as available, what to do? Choose an arm at random, or call choice() as if availableArms == 'all'
            return self.choice()
            # return np.random.randint(self.nbArms)
        else:
            if with_proba(self.epsilon):  # Proba epsilon : explore
                return rn.choice(availableArms)
            else:  # Proba 1 - epsilon : exploit
                # Uniform choice among the best available arms
                availableArms = np.asarray(availableArms)
                bestPositions = np.nonzero(self.rewards[availableArms] == np.max(self.rewards[availableArms]))[0]
                return rn.choice(availableArms[bestPositions])  # map positions in the subset back to actual arm indices

    def choiceMultiple(self, nb=1):
        """Like :meth:`choice`, but return ``nb`` distinct arms at once, as a numpy array."""
        if nb == 1:
            return np.array([self.choice()])
        else:
            # FIXME the explore/exploit balance should be for each choice, right?
            if with_proba(self.epsilon):  # Proba epsilon : explore
                return rn.choice(self.nbArms, size=nb, replace=False)
            else:  # Proba 1 - epsilon : exploit
                sortedRewards = np.sort(self.rewards)
                # Uniform choice among the nb best arms
                return rn.choice(np.nonzero(self.rewards >= sortedRewards[-nb])[0], size=nb, replace=False)
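
# A minimal usage sketch, not part of the original module. It assumes, like the
# rest of the project, that BasePolicy provides startGame() and
# getReward(arm, reward) to reset the policy and to feed observed rewards into
# self.pulls and self.rewards. Run this file directly to try the policy on a
# hypothetical 3-armed Bernoulli problem.
if __name__ == '__main__':
    means = [0.1, 0.5, 0.9]  # hypothetical Bernoulli means, arm 2 is optimal
    policy = EpsilonGreedy(nbArms=3, epsilon=0.1)
    policy.startGame()
    for _ in range(1000):
        arm = policy.choice()
        reward = float(rn.random() < means[arm])  # one Bernoulli(means[arm]) draw
        policy.getReward(arm, reward)
    print("Pulls after 1000 steps:", policy.pulls)  # most pulls should go to arm 2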

# --- Epsilon-Decreasing

#: Default value for epsilon for :class:`EpsilonDecreasing`
EPSILON = 0.1


class EpsilonDecreasing(EpsilonGreedy):
    r""" The epsilon-decreasing random policy.

    - :math:`\varepsilon(t) = \min(1, \varepsilon_0 / \max(1, t))`
    - Ref: https://en.wikipedia.org/wiki/Multi-armed_bandit#Semi-uniform_strategies
    """

    def __init__(self, nbArms, epsilon=EPSILON, lower=0., amplitude=1.):
        super(EpsilonDecreasing, self).__init__(nbArms, lower=lower, amplitude=amplitude)
        # assert 0. <= epsilon <= 1., "Error: the 'epsilon' parameter for EpsilonDecreasing class has to be in [0, 1]."  # DEBUG
        self._epsilon = epsilon

    def __str__(self):
        return r"EpsilonDecreasing($\varepsilon_0={:.3g}$)".format(self._epsilon)

    # This decorator @property makes this method an attribute, cf. https://docs.python.org/3/library/functions.html#property
    @property
    def epsilon(self):
        r"""Decreasing :math:`\varepsilon(t) = \min(1, \varepsilon_0 / \max(1, t))`."""
        return min(1, self._epsilon / max(1, self.t))
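
# A small numeric sketch, not in the original module, of how this schedule
# behaves: with the default epsilon_0 = 0.1 the policy is almost purely greedy
# from the very first step, since eps(1) = 0.1, eps(10) = 0.01, eps(100) = 0.001.
if __name__ == '__main__':
    policy = EpsilonDecreasing(nbArms=3, epsilon=0.1)
    for t in [1, 10, 100, 1000]:
        policy.t = t  # policy.t is normally advanced by getReward()
        print("eps({}) = {:.4g}".format(t, policy.epsilon))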

# --- EpsilonDecreasingMEGA

C = 0.1  #: Constant :math:`c` in the MEGA formula
D = 0.5  #: Constant :math:`d` in the MEGA formula


def epsilon0(c, d, nbArms):
    r"""MEGA heuristic:

    .. math:: \varepsilon_0 = \frac{c K^2}{d^2 (K - 1)}.
    """
    return (c * nbArms**2) / (d**2 * (nbArms - 1))
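
# A worked example, for illustration only: with the defaults c = 0.1, d = 0.5
# and K = 10 arms, epsilon0 = 0.1 * 10**2 / (0.5**2 * 9) = 10 / 2.25 ~= 4.44 > 1,
# so the min(1, .) cap in epsilon(t) keeps the exploration fully uniform as
# long as t <= 4, and only then does it start to decrease as epsilon_0 / t.
if __name__ == '__main__':
    print("epsilon0(0.1, 0.5, 10) = {:.3g}".format(epsilon0(0.1, 0.5, 10)))  # -> 4.44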

class EpsilonDecreasingMEGA(EpsilonGreedy):
    r""" The epsilon-decreasing random policy, using the MEGA heuristic for a good choice of the :math:`\varepsilon_0` value.

    - :math:`\varepsilon(t) = \min(1, \varepsilon_0 / \max(1, t))`
    - :math:`\varepsilon_0 = \frac{c K^2}{d^2 (K - 1)}`
    - Ref: https://en.wikipedia.org/wiki/Multi-armed_bandit#Semi-uniform_strategies
    """

    def __init__(self, nbArms, c=C, d=D, lower=0., amplitude=1.):
        super(EpsilonDecreasingMEGA, self).__init__(nbArms, lower=lower, amplitude=amplitude)
        self._epsilon = epsilon0(c, d, nbArms)

    def __str__(self):
        return r"EpsilonDecreasingMEGA($\varepsilon_0={:.3g}$)".format(self._epsilon)

    # This decorator @property makes this method an attribute, cf. https://docs.python.org/3/library/functions.html#property
    @property
    def epsilon(self):
        r"""Decreasing :math:`\varepsilon(t) = \min(1, \varepsilon_0 / \max(1, t))`."""
        return min(1, self._epsilon / max(1, self.t))

# --- Epsilon-First

#: Default value for epsilon for :class:`EpsilonFirst`
EPSILON = 0.01


class EpsilonFirst(EpsilonGreedy):
    r""" The epsilon-first random policy: pure uniform exploration during the first :math:`\varepsilon_0 T` steps, then pure exploitation.

    - Ref: https://en.wikipedia.org/wiki/Multi-armed_bandit#Semi-uniform_strategies
    """

    def __init__(self, nbArms, horizon, epsilon=EPSILON, lower=0., amplitude=1.):
        super(EpsilonFirst, self).__init__(nbArms, epsilon=epsilon, lower=lower, amplitude=amplitude)
        assert horizon > 0, "Error: the 'horizon' parameter for EpsilonFirst class has to be > 0."
        self.horizon = int(horizon)  #: Parameter :math:`T` = known horizon of the experiment.
        # assert 0 <= epsilon <= 1, "Error: the 'epsilon' parameter for EpsilonFirst class has to be in [0, 1]."  # DEBUG
        self._epsilon = epsilon

    def __str__(self):
        return r"EpsilonFirst($T={}$, $\varepsilon={:.3g}$)".format(self.horizon, self._epsilon)

    # This decorator @property makes this method an attribute, cf. https://docs.python.org/3/library/functions.html#property
    @property
    def epsilon(self):
        r"""1 while :math:`t \leq \varepsilon_0 T`, 0 after."""
        if self.t <= self._epsilon * self.horizon:
            # First phase: randomly explore!
            return 1
        else:
            # Second phase: just exploit!
            return 0
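
# A quick check of the two phases, for illustration only: with T = 1000 and
# epsilon_0 = 0.01, the policy explores uniformly while t <= 10 and is purely
# greedy afterwards.
if __name__ == '__main__':
    policy = EpsilonFirst(nbArms=3, horizon=1000, epsilon=0.01)
    for t in [1, 10, 11, 500]:
        policy.t = t  # policy.t is normally advanced by getReward()
        print("t = {:4d} -> epsilon = {}".format(t, policy.epsilon))  # 1, 1, 0, 0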

# --- Epsilon exp-decreasing

#: Default value for epsilon for :class:`EpsilonExpDecreasing`
EPSILON = 0.1
#: Default value for the constant for the decreasing rate
DECREASINGRATE = 1e-6


class EpsilonExpDecreasing(EpsilonGreedy):
    r""" The epsilon exp-decreasing random policy.

    - :math:`\varepsilon(t) = \min(1, \varepsilon_0 \exp(- t \cdot \mathrm{decreasingRate}))`
    - Ref: https://en.wikipedia.org/wiki/Multi-armed_bandit#Semi-uniform_strategies
    """

    def __init__(self, nbArms, epsilon=EPSILON, decreasingRate=DECREASINGRATE, lower=0., amplitude=1.):
        super(EpsilonExpDecreasing, self).__init__(nbArms, lower=lower, amplitude=amplitude)
        # assert 0 <= epsilon <= 1, "Error: the 'epsilon' parameter for EpsilonExpDecreasing class has to be in [0, 1]."  # DEBUG
        self._epsilon = epsilon
        assert decreasingRate > 0, "Error: the 'decreasingRate' parameter for EpsilonExpDecreasing class has to be > 0."
        self._decreasingRate = decreasingRate

    def __str__(self):
        return r"EpsilonExpDecreasing($\varepsilon_0={:.3g}$, $\tau={:.3g}$)".format(self._epsilon, self._decreasingRate)

    # This decorator @property makes this method an attribute, cf. https://docs.python.org/3/library/functions.html#property
    @property
    def epsilon(self):
        r"""Decreasing :math:`\varepsilon(t) = \min(1, \varepsilon_0 \exp(- t \tau))`, where :math:`\tau` is the decreasing rate."""
        return min(1, self._epsilon * np.exp(- self.t * self._decreasingRate))
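
# A small numeric sketch, not in the original module: with the defaults
# epsilon_0 = 0.1 and decreasingRate = 1e-6, the exponential decay is very
# slow, e.g. epsilon(10**6) = 0.1 * exp(-1) ~= 0.0368, so decreasingRate has
# to be tuned to the horizon of the experiment for the decay to matter.
if __name__ == '__main__':
    policy = EpsilonExpDecreasing(nbArms=3, epsilon=0.1, decreasingRate=1e-6)
    for t in [1, 10**3, 10**6, 10**7]:
        policy.t = t  # policy.t is normally advanced by getReward()
        print("eps({}) = {:.4g}".format(t, policy.epsilon))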