Source code for Policies.Exp3R

# -*- coding: utf-8 -*-
r""" The Drift-Detection algorithm for non-stationary bandits.

- Reference: [["EXP3 with Drift Detection for the Switching Bandit Problem", Robin Allesiardo & Raphael Feraud]](https://www.researchgate.net/profile/Allesiardo_Robin/publication/281028960_EXP3_with_Drift_Detection_for_the_Switching_Bandit_Problem/links/55d1927808aee19936fdac8e.pdf)
- It runs on top of a simple policy like :class:`Exp3`, and :class:`DriftDetection_IndexPolicy` is a wrapper:

    >>> policy = DriftDetection_IndexPolicy(nbArms, C=1, horizon=horizon)
    >>> # use policy as usual, with policy.startGame(), arm = policy.choice(), policy.getReward(arm, reward)

- It uses an additional :math:`\mathcal{O}(\tau_\max)` memory for a game of maximum stationary length :math:`\tau_\max`.

.. warning:: It is designed to run on top of :class:`Exp3` or variants of the Exp3 policy, e.g., :class:`Exp3PlusPlus`.
"""
from __future__ import division, print_function  # Python 2 compatibility

__author__ = "Lilian Besson"
__version__ = "0.9"


import numpy as np
from math import log, sqrt

try:
    from .CD_UCB import CD_IndexPolicy
    from .Exp3 import Exp3
    from .Exp3PlusPlus import Exp3PlusPlus
except ImportError:
    from CD_UCB import CD_IndexPolicy
    from Exp3 import Exp3
    from Exp3PlusPlus import Exp3PlusPlus


#: Whether to be verbose when testing for a drift.
VERBOSE = False


CONSTANT_C = 1.0  #: The constant :math:`C` used in Corollary 1 of paper [["EXP3 with Drift Detection for the Switching Bandit Problem", Robin Allesiardo & Raphael Feraud]](https://www.researchgate.net/profile/Allesiardo_Robin/publication/281028960_EXP3_with_Drift_Detection_for_the_Switching_Bandit_Problem/links/55d1927808aee19936fdac8e.pdf).
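
# As a quick numeric sanity check (not from the paper, just evaluating the
# default formula used below in ``DriftDetection_IndexPolicy.__init__``):
# with C = 1 and horizon T = 10000, H = ceil(sqrt(10000 * log(10000)))
# = ceil(303.48...) = 304.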


class DriftDetection_IndexPolicy(CD_IndexPolicy):
    r""" The Drift-Detection generic policy for non-stationary bandits, using a custom Drift-Detection test, for 1-dimensional exponential families.

    - From [["EXP3 with Drift Detection for the Switching Bandit Problem", Robin Allesiardo & Raphael Feraud]](https://www.researchgate.net/profile/Allesiardo_Robin/publication/281028960_EXP3_with_Drift_Detection_for_the_Switching_Bandit_Problem/links/55d1927808aee19936fdac8e.pdf).
    """

    def __init__(self, nbArms, H=None, delta=None, C=CONSTANT_C, horizon=None, policy=Exp3, *args, **kwargs):
        super(DriftDetection_IndexPolicy, self).__init__(nbArms, epsilon=1, policy=policy, *args, **kwargs)
        self.startGame()
        # New parameters
        self.horizon = horizon
        if H is None:
            H = int(np.ceil(C * np.sqrt(horizon * np.log(horizon))))
        assert H >= nbArms, "Error: for the Drift-Detection algorithm, the parameter H should be >= K = {}, but H = {}".format(nbArms, H)  # DEBUG
        self.H = H  #: Parameter :math:`H` for the Drift-Detection algorithm. Default value is :math:`\lceil C \sqrt{T \log(T)} \rceil`, for some constant :math:`C=` ``C`` (= :data:`CONSTANT_C` by default).
        if delta is None:
            delta = np.sqrt(np.log(horizon) / (nbArms * horizon))
        self.delta = delta  #: Parameter :math:`\delta` for the Drift-Detection algorithm. Default value is :math:`\sqrt{\frac{\log(T)}{K T}}` for :math:`K` arms and horizon :math:`T`.
        if 'gamma' not in kwargs:
            gamma = np.sqrt((nbArms * np.log(nbArms) * np.log(horizon)) / horizon)
            try:
                self.policy.gamma = gamma
            except AttributeError:
                print("Warning: the policy {} tried to use default value of gamma = {} but could not set attribute self.policy.gamma to gamma (maybe it's using an Exp3 with a non-constant value of gamma).".format(self, gamma))  # DEBUG

    # This decorator @property makes these methods behave like attributes, cf. https://docs.python.org/3/library/functions.html#property
    @property
    def proba_random_exploration(self):
        r"""Parameter :math:`\gamma` for the Exp3 algorithm."""
        return self.policy.gamma

    @property
    def threshold_h(self):
        r"""The threshold :math:`h = 2\varepsilon` of the Drift-Detection test, where

        .. math:: \varepsilon = \sqrt{\frac{K \log(\frac{1}{\delta})}{2 \gamma H}}.
        """
        return 2 * sqrt((self.nbArms * log(1.0 / self.delta)) / (2 * self.proba_random_exploration * self.H))

    @property
    def min_number_of_pulls_to_test_change(self):
        r"""Compute :math:`\Gamma_{\min}(I) := \frac{\gamma H}{K}`, the minimum number of samples we should have for all arms before testing for a change."""
        Gamma_min = self.proba_random_exploration * self.H / self.nbArms
        return int(np.ceil(Gamma_min))

    def __str__(self):
        return r"DriftDetection-{}($T={}$, $c={:.3g}$, $\alpha={:.3g}$)".format(self._policy.__name__, self.horizon, self.threshold_h, self.proba_random_exploration)

    def detect_change(self, arm, verbose=VERBOSE):
        r""" Detect a change in the current arm, using a Drift-Detection test (DD).

        .. math::

            k_{\max} &:= \arg\max_k \tilde{\rho}_k(t),\\
            DD_t(k) &= \hat{\mu}_k(I) - \hat{\mu}_{k_{\max}}(I).

        - The change is detected if there is an arm :math:`k` such that :math:`DD_t(k) \geq 2 \varepsilon = h`, where :attr:`threshold_h` is the threshold of the test, and :math:`I` is the (number of the) current interval since the last (global) restart,
        - where :math:`\tilde{\rho}_k(t)` is the trust probability of arm :math:`k` from the Exp3 algorithm,
        - and where :math:`\hat{\mu}_k(I)` is the empirical mean of arm :math:`k` from the data in the current interval.

        .. warning:: FIXME I know this implementation is not (yet) correct: the samples obtained from the Gibbs distribution (when Exp3 uses the trust vector) and those obtained from the uniform distribution should be counted separately, since :math:`\Gamma_{\min}(I)` is the minimum number of samples obtained from the uniform exploration (of probability :math:`\gamma`) only. It seems painful to code correctly, so it is left for later.
        """
        # Do we have enough samples?
        min_pulls = np.min(self.last_pulls)
        if min_pulls < self.min_number_of_pulls_to_test_change:  # no we don't
            return False, None
        # Yes we do have enough samples
        trusts = self.policy.trusts
        k_max = np.argmax(trusts)
        means = [np.mean(rewards) for rewards in self.all_rewards]
        meanOfTrustedArm = means[k_max]
        for otherArm in range(self.nbArms):
            difference_of_mean = means[otherArm] - meanOfTrustedArm
            if verbose:
                print("  - For the mean mu(k={}) = {} and mean of trusted arm mu(k_max={}) = {}, their difference is {}, compared to c = {}...".format(otherArm, means[otherArm], k_max, meanOfTrustedArm, difference_of_mean, self.threshold_h))
            if difference_of_mean >= self.threshold_h:
                return True, None
        return False, None
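
# A standalone sketch of the DD test implemented in detect_change above, with
# made-up numbers (trusts, means and h are hypothetical, chosen only for
# illustration): a drift is detected as soon as some arm beats the most
# trusted arm by at least h = threshold_h.
#
#     >>> trusts = np.array([0.7, 0.2, 0.1])  # hypothetical Exp3 trusts
#     >>> means = np.array([0.3, 0.9, 0.4])   # hypothetical empirical means
#     >>> k_max = np.argmax(trusts)           # most trusted arm, here k_max = 0
#     >>> h = 0.5                             # hypothetical threshold h = 2 * epsilon
#     >>> bool(np.any(means - means[k_max] >= h))  # arm 1 exceeds by 0.6 >= 0.5
#     True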
# --- Exp3R

class Exp3R(DriftDetection_IndexPolicy):
    r""" The Exp3.R policy for non-stationary bandits.
    """

    def __init__(self, nbArms, policy=Exp3, *args, **kwargs):
        super(Exp3R, self).__init__(nbArms, policy=policy, *args, **kwargs)

    def __str__(self):
        return r"Exp3R($T={}$, $c={:.3g}$, $\alpha={:.3g}$)".format(self.horizon, self.threshold_h, self.proba_random_exploration)
# --- Exp3R++

class Exp3RPlusPlus(DriftDetection_IndexPolicy):
    r""" The Exp3.R++ policy for non-stationary bandits.
    """

    def __init__(self, nbArms, policy=Exp3PlusPlus, *args, **kwargs):
        super(Exp3RPlusPlus, self).__init__(nbArms, policy=policy, *args, **kwargs)

    def __str__(self):
        return r"Exp3R++($T={}$, $c={:.3g}$, $\alpha={:.3g}$)".format(self.horizon, self.threshold_h, self.proba_random_exploration)
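
# --- Minimal usage sketch (added for illustration, not part of the original
# module): run Exp3R on a two-arm switching Bernoulli problem, following the
# startGame()/choice()/getReward() protocol from the module docstring above.
# The problem parameters (horizon, arm means, change point) are made up.
if __name__ == "__main__":
    rng = np.random.RandomState(42)
    horizon = 5000
    means_before, means_after = [0.2, 0.8], [0.8, 0.2]  # best arm swaps at T/2
    policy = Exp3R(nbArms=2, horizon=horizon)
    policy.startGame()
    total_reward = 0.0
    for t in range(horizon):
        means = means_before if t < horizon // 2 else means_after
        arm = policy.choice()  # index of the arm to pull at time t
        reward = float(rng.random_sample() < means[arm])  # Bernoulli reward
        policy.getReward(arm, reward)
        total_reward += reward
    print("Exp3R got a total reward of {:.0f} over T = {} steps.".format(total_reward, horizon))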