Source code for Policies.RandomizedIndexPolicy

# -*- coding: utf-8 -*-
""" Generic randomized index policy.

- Reference: [["On the Optimality of Perturbations in Stochastic and Adversarial Multi-armed Bandit Problems", by Baekjin Kim, Ambuj Tewari, arXiv:1902.00610]](https://arxiv.org/pdf/1902.00610.pdf)
"""
from __future__ import division, print_function  # Python 2 compatibility

__author__ = "Lilian Besson"
__version__ = "0.9"

import numpy as np
np.seterr(divide='ignore')  # XXX dangerous in general, controlled here!

try:
    from .IndexPolicy import IndexPolicy
except (ImportError, SystemError):
    from IndexPolicy import IndexPolicy


#: True to debug information about the perturbations
VERBOSE = True
VERBOSE = False


[docs]def uniform_perturbation(size=1, low=-1.0, high=1.0): r""" Uniform random perturbation, not from :math:`[0, 1]` but from :math:`[-1, 1]`, that is :math:`\mathcal{U}niform([-1, 1])`. - Reference: see Corollary 6 from [["On the Optimality of Perturbations in Stochastic and Adversarial Multi-armed Bandit Problems", by Baekjin Kim, Ambuj Tewari, arXiv:1902.00610]](https://arxiv.org/pdf/1902.00610.pdf) """ return np.random.uniform(low=low, high=high, size=size)
[docs]def normal_perturbation(size=1, loc=0.0, scale=0.25): r""" Normal (Gaussian) random perturbation, with mean ``loc=0`` and scale (sigma2) ``scale=0.25`` (by default), that is :math:`\mathcal{N}ormal(loc, scale)`. - Reference: see Corollary 6 from [["On the Optimality of Perturbations in Stochastic and Adversarial Multi-armed Bandit Problems", by Baekjin Kim, Ambuj Tewari, arXiv:1902.00610]](https://arxiv.org/pdf/1902.00610.pdf) """ return np.random.normal(loc=loc, scale=scale, size=size)
gaussian_perturbation = normal_perturbation
[docs]def exponential_perturbation(size=1, scale=0.25): r""" Exponential random perturbation, with parameter (:math:`\lambda`) ``scale=0.25`` (by default), that is :math:`\mathcal{E}xponential(\lambda)`. - Reference: see Corollary 7 from [["On the Optimality of Perturbations in Stochastic and Adversarial Multi-armed Bandit Problems", by Baekjin Kim, Ambuj Tewari, arXiv:1902.00610]](https://arxiv.org/pdf/1902.00610.pdf) """ return np.random.exponential(scale=scale, size=size)
[docs]def gumbel_perturbation(size=1, loc=0.0, scale=0.25): r""" Gumbel random perturbation, with mean ``loc=0`` and scale ``scale=0.25`` (by default), that is :math:`\mathcal{G}umbel(loc, scale)`. - Reference: see Corollary 7 from [["On the Optimality of Perturbations in Stochastic and Adversarial Multi-armed Bandit Problems", by Baekjin Kim, Ambuj Tewari, arXiv:1902.00610]](https://arxiv.org/pdf/1902.00610.pdf) """ return np.random.gumbel(loc=loc, scale=scale, size=size)
#: Map perturbation names (like ``"uniform"``) to perturbation functions (like :func:`uniform_perturbation`). map_perturbation_str_to_function = { "uniform": uniform_perturbation, "normal": normal_perturbation, "gaussian": gaussian_perturbation, "exponential": exponential_perturbation, "gumbel": gumbel_perturbation, }
[docs]class RandomizedIndexPolicy(IndexPolicy): """ Class that implements a generic randomized index policy."""
[docs] def __init__(self, nbArms, perturbation="uniform", lower=0., amplitude=1., *args, **kwargs): """ New generic index policy. - nbArms: the number of arms, - perturbation: ["uniform", "normal", "exponential", "gaussian"] or a function like :func:`numpy.random.uniform`, - lower, amplitude: lower value and known amplitude of the rewards. """ super(RandomizedIndexPolicy, self).__init__(nbArms, lower=lower, amplitude=amplitude, *args, **kwargs) if isinstance(perturbation, str): perturbation_name = perturbation perturbation = map_perturbation_str_to_function.get(perturbation_name, uniform_perturbation) else: perturbation_name = perturbation.__name__ self.perturbation_name = perturbation_name #: Name of the function to generate the random perturbation. self.perturbation = perturbation #: Function to generate the random perturbation.
[docs] def __str__(self): """ -> str""" return "{}({})".format(self.__class__.__name__, self.perturbation_name)
# --- Basic choice() method
[docs] def computeIndex(self, arm): r""" In a randomized index policy, with distribution :math:`\mathrm{Distribution}` generating perturbations :math:`Z_k(t)`, with index :math:`I_k(t)` and mean :math:`\hat{\mu}_k(t)` for each arm :math:`k`, it chooses an arm with maximal perturbated index (uniformly at random): .. math:: \hat{\mu}_k(t) &= \frac{X_k(t)}{N_k(t)}, \\ Z_k(t) &\sim \mathrm{Distribution}, \\ \mathrm{UCB}_k(t) &= I_k(t) - \hat{\mu}_k(t),\\ A(t) &\sim U(\arg\max_{1 \leq k \leq K} \hat{\mu}_k(t) + \mathrm{UCB}_k(t) \cdot Z_k(t)). """ if self.pulls[arm] < 1: return float('+inf') index = super(RandomizedIndexPolicy, self).computeIndex(arm) mean = self.rewards[arm] / self.pulls[arm] ucb = index - mean random_perturbation = self.perturbation() perturbated_index = mean + ucb * random_perturbation if VERBOSE: print(" - at time t = {}, policy {} would have used index = {} and mean = {}, but using its perturbation distribution ({}), it sampled a perturbation = {}, and the perturbated index was {} instead...".format(self.t, self, index, mean, self.perturbation_name, random_perturbation, perturbated_index)) # DEBUG self.index = perturbated_index
[docs] def computeAllIndex(self): r""" In a randomized index policy, with distribution :math:`\mathrm{Distribution}` generating perturbations :math:`Z_k(t)`, with index :math:`I_k(t)` and mean :math:`\hat{\mu}_k(t)` for each arm :math:`k`, it chooses an arm with maximal perturbated index (uniformly at random): .. math:: \hat{\mu}_k(t) &= \frac{X_k(t)}{N_k(t)}, \\ Z_k(t) &\sim \mathrm{Distribution}, \\ \mathrm{UCB}_k(t) &= I_k(t) - \hat{\mu}_k(t),\\ A(t) &\sim U(\arg\max_{1 \leq k \leq K} \hat{\mu}_k(t) + \mathrm{UCB}_k(t) \cdot Z_k(t)). """ super(RandomizedIndexPolicy, self).computeAllIndex() index = self.index means = self.rewards / self.pulls ucb = index - means random_perturbations = self.perturbation(size=self.nbArms) for arm in range(self.nbArms): perturbated_index = means[arm] + ucb[arm] * random_perturbations[arm] self.index[arm] = perturbated_index if self.pulls[arm] < 1: self.index[arm] = float('+inf') if VERBOSE: print(" - at time t = {}, policy {} would have used indexes = {} and means = {}, but using its perturbation distribution ({}), it sampled perturbations = {}, and the perturbated indexes was {} instead...".format(self.t, self, index, means, self.perturbation_name, random_perturbations, self.index)) # DEBUG