Source code for Policies.PHE

# -*- coding: utf-8 -*-
""" The PHE, Perturbed-History Exploration, policy for bounded bandits.

- Reference: [[Perturbed-History Exploration in Stochastic Multi-Armed Bandits, by Branislav Kveton, Csaba Szepesvari, Mohammad Ghavamzadeh, Craig Boutilier, 26 Feb 2019, arXiv:1902.10089]](https://arxiv.org/abs/1902.10089)
"""
from __future__ import division, print_function  # Python 2 compatibility

__author__ = "Lilian Besson"
__version__ = "0.9"

try:
    from .IndexPolicy import IndexPolicy
except ImportError:
    from IndexPolicy import IndexPolicy

from math import ceil
import numpy as np

#: By default, the perturbation scale :math:`a` in PHE is 1, that is, at the current time step :math:`t`, if arm :math:`i` has been sampled :math:`s = T_{i,t-1}` times, PHE generates :math:`\lceil a s \rceil = s` pseudo-rewards (of mean :math:`1/2`).
DEFAULT_PERTURBATION_SCALE = 1.0
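# For example (illustrative numbers only): with the default scale :math:`a = 1` and :math:`s = 4` observed samples
# of total reward :math:`V_{i,s} = 3`, PHE draws the sum of :math:`\lceil a s \rceil = 4` Bernoulli(:math:`1/2`)
# pseudo-rewards, :math:`U_{i,s} \sim \mathrm{Bin}(4, 1/2)`, and uses the perturbed mean
# :math:`(V_{i,s} + U_{i,s}) / (s + \lceil a s \rceil) = (3 + U_{i,s}) / 8` as the index of arm :math:`i`
# (see :meth:`PHE.computeIndex` below).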


class PHE(IndexPolicy):
    r""" The PHE, Perturbed-History Exploration, policy for bounded bandits.

    - Reference: [[Perturbed-History Exploration in Stochastic Multi-Armed Bandits, by Branislav Kveton, Csaba Szepesvari, Mohammad Ghavamzadeh, Craig Boutilier, 26 Feb 2019, arXiv:1902.10089]](https://arxiv.org/abs/1902.10089)

    - The authors prove that PHE achieves :math:`\mathcal{O}(K \Delta^{-1} \log(T))` regret for horizon :math:`T`, where :math:`\Delta` is the minimum gap between the expected rewards of the optimal and the suboptimal arms, as soon as :math:`a > 1`.

    - Note that the limit case :math:`a = 0` gives the Follow-the-Leader algorithm (FTL), which is known to fail.
    """

    def __init__(self, nbArms, perturbation_scale=DEFAULT_PERTURBATION_SCALE, lower=0., amplitude=1.):
        assert perturbation_scale > 0, "Error: for the PHE class, the parameter perturbation_scale should be > 0, it was {}.".format(perturbation_scale)  # DEBUG
        self.perturbation_scale = perturbation_scale  #: Perturbation scale, denoted :math:`a` in their paper. Should be a float or an int. With :math:`s` current samples, :math:`\lceil a s \rceil` additional pseudo-rewards are generated.
        super(PHE, self).__init__(nbArms, lower=lower, amplitude=amplitude)

    def __str__(self):
        return r"PHE($a={:.3g}$)".format(self.perturbation_scale)

    def computeIndex(self, arm):
        r""" Compute a randomized index by adding :math:`\lceil a s \rceil` pseudo-rewards (of mean :math:`1/2`) to the :math:`s` current observations of this arm."""
        s = self.pulls[arm]
        if s <= 0:
            return float('+inf')
        V_is = self.rewards[arm]
        number_of_perturbation = ceil(self.perturbation_scale * s)
        U_is = np.random.binomial(number_of_perturbation, 0.5)
        perturbed_mean = (V_is + U_is) / (s + number_of_perturbation)
        return perturbed_mean
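

# A minimal, hypothetical usage sketch: it assumes the standard SMPyBandits policy
# interface inherited from IndexPolicy/BasePolicy (startGame(), choice(),
# getReward(arm, reward)), and uses made-up Bernoulli arms purely for illustration.
if __name__ == "__main__":
    np.random.seed(42)  # make this small demo reproducible
    means = [0.1, 0.5, 0.9]  # hypothetical Bernoulli arm means
    policy = PHE(nbArms=len(means), perturbation_scale=1.1)
    policy.startGame()
    horizon = 1000
    for _ in range(horizon):
        arm = policy.choice()  # arm with the largest perturbed-history index
        reward = float(np.random.rand() < means[arm])  # Bernoulli reward in [0, 1]
        policy.getReward(arm, reward)
    print("After {} steps, pulls = {}".format(horizon, policy.pulls))
    print("Empirical means = {}".format(policy.rewards / np.maximum(1, policy.pulls)))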