Source code for Policies.DMED

# -*- coding: utf-8 -*-
""" The DMED policy of [Honda & Takemura, COLT 2010] in the special case of Bernoulli rewards (can be used on any [0,1]-valued rewards, but warning: in the non-binary case, this is not the algorithm of [Honda & Takemura, COLT 2010]) (see note below on the variant).

- Reference: [Garivier & Cappé - COLT, 2011](https://arxiv.org/pdf/1102.2490.pdf).
"""
from __future__ import division, print_function  # Python 2 compatibility

__author__ = "Olivier Cappé, Aurélien Garivier, Lilian Besson"
__version__ = "0.6"

import numpy as np
np.seterr(divide='ignore')  # XXX dangerous in general, controlled here!

try:
    from .kullback import klBern
    from .BasePolicy import BasePolicy
except ImportError:
    from kullback import klBern
    from BasePolicy import BasePolicy


#: Variant flag: with this set to ``False`` (the default below), use the less aggressive list-pruning criterion corresponding to the version called DMED in [Garivier & Cappé, COLT 2011]; with ``True``, use the original proposal of [Honda & Takemura, COLT 2010] (called DMED+ in [Garivier & Cappé, COLT 2011]).
# GENUINE = True
GENUINE = False
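# For instance (illustrative numbers only, not taken from either paper): at time t = 1000, for
# an arm pulled N_k = 20 times, the quantity N_k * kl(mean_k, best_mean) is compared against
# log(1000) ≈ 6.91 with GENUINE = False (DMED), but against log(1000 / 20) ≈ 3.91 with
# GENUINE = True (DMED+), so the genuine variant drops arms from the list earlier.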


class DMED(BasePolicy):
    """ The DMED policy of [Honda & Takemura, COLT 2010], in the special case of Bernoulli rewards. It can be applied to any [0,1]-valued rewards, but beware that in the non-binary case this is no longer the algorithm of [Honda & Takemura, COLT 2010] (see the note above on the variant).

    - Reference: [Garivier & Cappé - COLT, 2011](https://arxiv.org/pdf/1102.2490.pdf).
    """

    def __init__(self, nbArms, genuine=GENUINE, tolerance=1e-4, kl=klBern, lower=0., amplitude=1.):
        super(DMED, self).__init__(nbArms, lower=lower, amplitude=amplitude)
        self.kl = np.vectorize(kl)  #: kl function to use
        self.kl.__name__ = kl.__name__
        self.tolerance = tolerance  #: Numerical tolerance
        self.genuine = genuine  #: Flag to know which variant is implemented, DMED or DMED+
        self.nextActions = list(range(nbArms))  #: List of next actions to play; every next step is playing ``nextActions.pop(0)``

    def __str__(self):
        return r"DMED{}({})".format("$^+$" if self.genuine else "", self.kl.__name__[2:])

    def startGame(self):
        """ Initialize the policy for a new game."""
        super(DMED, self).startGame()
        self.nextActions = list(range(self.nbArms))  # Force exploring the initial actions...
        np.random.shuffle(self.nextActions)  # ... in a random order

    def choice(self):
        r""" If there is still a next action to play, pop it and play it; otherwise build a new list of actions and play its first element.

        The list of actions is obtained as all the indexes :math:`k` satisfying the following inequality.

        - For the naive version (``genuine = False``), DMED:

        .. math:: \mathrm{kl}(\hat{\mu}_k(t), \hat{\mu}^*(t)) < \frac{\log(t)}{N_k(t)}.

        - For the original version (``genuine = True``), DMED+:

        .. math:: \mathrm{kl}(\hat{\mu}_k(t), \hat{\mu}^*(t)) < \frac{\log(\frac{t}{N_k(t)})}{N_k(t)}.

        Here :math:`X_k(t)` is the sum of rewards from arm :math:`k`, :math:`\hat{\mu}_k(t)` is its empirical mean, and :math:`\hat{\mu}^*(t)` is the best empirical mean:

        .. math::

           X_k(t) &= \sum_{\sigma=1}^{t} 1(A(\sigma) = k) r_k(\sigma), \\
           \hat{\mu}_k(t) &= \frac{X_k(t)}{N_k(t)}, \\
           \hat{\mu}^*(t) &= \max_{k=1}^{K} \hat{\mu}_k(t).
        """
        if len(self.nextActions) == 0:
            empiricalMeans = self.rewards / self.pulls
            bestEmpiricalMean = np.max(empiricalMeans)
            if self.genuine:
                self.nextActions = np.nonzero(self.pulls * self.kl(empiricalMeans, bestEmpiricalMean) < np.log(self.t / self.pulls))[0]
            else:
                self.nextActions = np.nonzero(self.pulls * self.kl(empiricalMeans, bestEmpiricalMean) < np.log(self.t))[0]
            self.nextActions = list(self.nextActions)
        # Play the next action in the list
        return self.nextActions.pop(0)
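
    # Worked example for the list construction (hypothetical numbers, not taken from the
    # module or the papers): with t = 100, pulls = [50, 30, 20] and
    # empiricalMeans = [0.62, 0.55, 0.40], the best empirical mean is 0.62 and
    # pulls * klBern(empiricalMeans, 0.62) ≈ [0.00, 0.31, 1.97]. The DMED test keeps every
    # arm below log(100) ≈ 4.61, hence all three arms; the DMED+ test compares against
    # log(t / pulls) ≈ [0.69, 1.20, 1.61] element-wise and keeps only arms 0 and 1.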

    def choiceMultiple(self, nb=1):
        """ If there are still enough actions to play, pop and play them; otherwise build a new list and play its ``nb`` first actions."""
        choices = [self.choice() for _ in range(nb)]
        assert len(set(choices)) == nb, "Error: choiceMultiple({}) for DMED policy does not work yet...".format(nb)  # DEBUG
        # while len(set(choices)) < nb:  # Not enough different actions? Try again.
        #     choices = [self.choice() for _ in range(nb)]
        return choices


class DMEDPlus(DMED):
    """ The DMED+ policy of [Honda & Takemura, COLT 2010], in the special case of Bernoulli rewards. It can be applied to any [0,1]-valued rewards, but beware that in the non-binary case this is no longer the algorithm of [Honda & Takemura, COLT 2010].

    - Reference: [Garivier & Cappé - COLT, 2011](https://arxiv.org/pdf/1102.2490.pdf).
    """

    def __init__(self, nbArms, tolerance=1e-4, kl=klBern, lower=0., amplitude=1.):
        super(DMEDPlus, self).__init__(nbArms, genuine=True, tolerance=tolerance, kl=kl, lower=lower, amplitude=amplitude)
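

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module: it assumes the usual
    # SMPyBandits ``BasePolicy.getReward(arm, reward)`` bookkeeping (updating t, pulls
    # and rewards), and the arm means below are arbitrary illustrative values.
    np.random.seed(0)
    means = [0.1, 0.5, 0.9]  # three hypothetical Bernoulli arms
    horizon = 1000
    for PolicyClass in (DMED, DMEDPlus):
        policy = PolicyClass(len(means))
        policy.startGame()
        for _ in range(horizon):
            arm = policy.choice()
            reward = float(np.random.rand() < means[arm])  # simulated Bernoulli reward
            policy.getReward(arm, reward)
        print("{}: pulls = {}".format(policy, policy.pulls))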