# -*- coding: utf-8 -*-
""" The DMED policy of [Honda & Takemura, COLT 2010] in the special case of Bernoulli rewards (can be used on any [0,1]-valued rewards, but warning: in the non-binary case, this is not the algorithm of [Honda & Takemura, COLT 2010]) (see note below on the variant).
- Reference: [Garivier & Cappé - COLT, 2011](https://arxiv.org/pdf/1102.2490.pdf).
"""
from __future__ import division, print_function  # Python 2 compatibility

__author__ = "Olivier Cappé, Aurélien Garivier, Lilian Besson"
__version__ = "0.6"

import numpy as np
np.seterr(divide='ignore')  # XXX dangerous in general, controlled here!

try:
    from .kullback import klBern
    from .BasePolicy import BasePolicy
except ImportError:
    from kullback import klBern
    from BasePolicy import BasePolicy

#: Variant: set this to ``True`` to use the original proposal of [Honda & Takemura, COLT 2010] (called DMED+ in [Garivier & Cappé, COLT 2011]); the default ``False`` uses a less aggressive list-pruning criterion, corresponding to the version called DMED in [Garivier & Cappé, COLT 2011].
# GENUINE = True
GENUINE = False


class DMED(BasePolicy):
    """ The DMED policy of [Honda & Takemura, COLT 2010], in the special case of Bernoulli rewards. It can be used on any [0,1]-valued rewards, but in the non-binary case it is no longer the algorithm of [Honda & Takemura, COLT 2010] (see the note above on the DMED/DMED+ variant).

    - Reference: [Garivier & Cappé - COLT, 2011](https://arxiv.org/pdf/1102.2490.pdf).
    """

    def __init__(self, nbArms, genuine=GENUINE, tolerance=1e-4, kl=klBern, lower=0., amplitude=1.):
        super(DMED, self).__init__(nbArms, lower=lower, amplitude=amplitude)
        self.kl = np.vectorize(kl)  #: kl function to use
        self.kl.__name__ = kl.__name__
        self.tolerance = tolerance  #: Numerical tolerance
        self.genuine = genuine  #: Flag to know which variant is implemented, DMED or DMED+
        self.nextActions = list(range(nbArms))  #: List of next actions to play, every next step is playing ``nextActions.pop(0)``

    def __str__(self):
        return r"DMED{}({})".format("$^+$" if self.genuine else "", self.kl.__name__[2:])

    def startGame(self):
        """ Initialize the policy for a new game."""
        super(DMED, self).startGame()
        self.nextActions = list(range(self.nbArms))  # Force exploring the initial actions
        np.random.shuffle(self.nextActions)  # ... in a random order

    def choice(self):
        r""" If there is still a next action to play, pop it and play it; otherwise build a new list of actions and play its first element.

        The list of actions is obtained as all the indexes :math:`k` satisfying the following inequality.

        - For the naive version (``genuine = False``), DMED:

          .. math:: \mathrm{kl}(\hat{\mu}_k(t), \hat{\mu}^*(t)) < \frac{\log(t)}{N_k(t)}.

        - For the original version (``genuine = True``), DMED+:

          .. math:: \mathrm{kl}(\hat{\mu}_k(t), \hat{\mu}^*(t)) < \frac{\log\left(\frac{t}{N_k(t)}\right)}{N_k(t)}.

        Here :math:`N_k(t)` is the number of pulls of arm :math:`k`, :math:`X_k(t)` is its sum of rewards, :math:`\hat{\mu}_k(t)` is its empirical mean, and :math:`\hat{\mu}^*(t)` is the best empirical mean:

        .. math::

            X_k(t) &= \sum_{\sigma=1}^{t} \mathbb{1}(A(\sigma) = k) r_k(\sigma), \\
            \hat{\mu}_k(t) &= \frac{X_k(t)}{N_k(t)}, \\
            \hat{\mu}^*(t) &= \max_{k=1}^{K} \hat{\mu}_k(t).
        """
        if len(self.nextActions) == 0:  # The list of actions is empty: rebuild it
            empiricalMeans = self.rewards / self.pulls
            bestEmpiricalMean = np.max(empiricalMeans)
            if self.genuine:  # DMED+ pruning: threshold log(t / N_k(t))
                self.nextActions = np.nonzero(self.pulls * self.kl(empiricalMeans, bestEmpiricalMean) < np.log(self.t / self.pulls))[0]
            else:  # DMED pruning: threshold log(t)
                self.nextActions = np.nonzero(self.pulls * self.kl(empiricalMeans, bestEmpiricalMean) < np.log(self.t))[0]
            self.nextActions = list(self.nextActions)
        # Play the next action in the list
        return self.nextActions.pop(0)

    def choiceMultiple(self, nb=1):
        """ If there are still enough actions to play, pop and play them; otherwise build a new list and play its ``nb`` first actions."""
        choices = [self.choice() for _ in range(nb)]
        assert len(set(choices)) == nb, "Error: choiceMultiple({}) for DMED policy does not work yet...".format(nb)  # DEBUG
        # while len(set(choices)) < nb:  # Not enough different actions? Try again.
        #     choices = [self.choice() for _ in range(nb)]
        return choices
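

# --- Illustrative sketch, not part of the original module --------------------
# The list built in DMED.choice() can be recomputed with a standalone helper,
# to make the two pruning criteria above explicit. The function below is a
# hypothetical helper, given only as an illustration; it mirrors DMED.choice().

def _dmed_candidate_arms(pulls, rewards, t, genuine=GENUINE, kl=klBern):
    """ Indexes k with N_k(t) * kl(mu_k(t), mu^*(t)) < log(t) (DMED, ``genuine=False``)
    or < log(t / N_k(t)) (DMED+, ``genuine=True``), as in :meth:`DMED.choice`.

    For instance, with ``pulls = [10, 20, 70]``, ``rewards = [3, 8, 42]`` and ``t = 100``
    (empirical means 0.3, 0.4, 0.6), DMED keeps all three arms, while DMED+ prunes
    arm 1, since ``20 * klBern(0.4, 0.6) ~ 1.62 > log(100 / 20) ~ 1.61``.
    """
    pulls = np.asarray(pulls, dtype=float)
    empiricalMeans = np.asarray(rewards, dtype=float) / pulls
    bestEmpiricalMean = np.max(empiricalMeans)
    weightedDivergences = pulls * np.vectorize(kl)(empiricalMeans, bestEmpiricalMean)
    thresholds = np.log(t / pulls) if genuine else np.log(t)
    return list(np.nonzero(weightedDivergences < thresholds)[0])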


class DMEDPlus(DMED):
    """ The DMED+ policy of [Honda & Takemura, COLT 2010], in the special case of Bernoulli rewards. It can be used on any [0,1]-valued rewards, but in the non-binary case it is no longer the algorithm of [Honda & Takemura, COLT 2010].

    - Reference: [Garivier & Cappé - COLT, 2011](https://arxiv.org/pdf/1102.2490.pdf).
    """

    def __init__(self, nbArms, tolerance=1e-4, kl=klBern, lower=0., amplitude=1.):
        super(DMEDPlus, self).__init__(nbArms, genuine=True, tolerance=tolerance, kl=kl, lower=lower, amplitude=amplitude)
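

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module. It assumes the
    # SMPyBandits-style interface of BasePolicy used above: startGame(), choice(),
    # and getReward(arm, reward) updating self.t, self.pulls and self.rewards.
    np.random.seed(1234)  # Reproducibility of this illustration only
    means = [0.1, 0.5, 0.9]  # Hypothetical Bernoulli arm means
    horizon = 1000
    for policy in [DMED(len(means)), DMEDPlus(len(means))]:
        policy.startGame()
        for _ in range(horizon):
            arm = policy.choice()  # Pop the next action from the DMED list
            reward = float(np.random.rand() < means[arm])  # Bernoulli draw
            policy.getReward(arm, reward)  # Update t, pulls[arm], rewards[arm]
        print("{}: pulls = {} (best arm is #{})".format(policy, list(policy.pulls), np.argmax(means)))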