Source code for Policies.OCUCBH

# -*- coding: utf-8 -*-
""" The Optimally Confident UCB (OC-UCB) policy for bounded stochastic bandits. Initial version (horizon-dependent).

- Reference: [Lattimore, 2015](https://arxiv.org/pdf/1507.07880.pdf)
- There is also a horizon-independent version, :class:`OCUCB.OCUCB`, from  [Lattimore, 2016](https://arxiv.org/pdf/1603.08661.pdf).
"""
from __future__ import division, print_function  # Python 2 compatibility

__author__ = "Lilian Besson"
__version__ = "0.9"

from math import exp, sqrt, log
import numpy as np
np.seterr(divide='ignore')  # XXX dangerous in general, controlled here!

try:
    from .OCUCB import OCUCB
except ImportError:
    from OCUCB import OCUCB

#: Default value for parameter :math:`\psi \geq 2` for OCUCBH.
#: In :meth:`OCUCBH.computeIndex` it multiplies the horizon :math:`T` inside the logarithm, :math:`\log(\psi T / t)`.
#: :meth:`OCUCBH.__init__` asserts that the value it receives is :math:`\geq 2`.
PSI = 2

#: Default value for parameter :math:`\alpha \geq 2` for OCUCBH.
#: In :meth:`OCUCBH.computeIndex` it scales the exploration bonus, :math:`\sqrt{(\alpha / N_k(t)) \log(\psi T / t)}`.
#: :meth:`OCUCBH.__init__` asserts that the value it receives is :math:`\geq 2`.
ALPHA = 4


# --- OCUCBH


class OCUCBH(OCUCB):
    """ The Optimally Confident UCB (OC-UCB) policy for bounded stochastic bandits. Initial version (horizon-dependent).

    - Reference: [Lattimore, 2015](https://arxiv.org/pdf/1507.07880.pdf)
    """

    def __init__(self, nbArms, horizon=None, psi=PSI, alpha=ALPHA, lower=0., amplitude=1.):
        """ Create a new OC-UCB-H policy.

        - ``nbArms``, ``lower``, ``amplitude``: forwarded unchanged to the parent constructor.
        - ``horizon``: horizon T, has to be given and > 1. It is stored as ``int(horizon)``.
        - ``psi``: parameter psi of the index, has to be >= 2.
        - ``alpha``: parameter alpha of the index, has to be >= 2.

        The three conditions are checked with ``assert`` (an ``AssertionError`` is raised if one fails),
        so they are skipped when Python runs with the ``-O`` flag.
        """
        super(OCUCBH, self).__init__(nbArms, lower=lower, amplitude=amplitude)
        assert psi >= 2, "Error: parameter 'psi' for OCUCBH algorithm has to be >= 2."  # DEBUG
        self.psi = psi  #: Parameter :math:`\psi \geq 2`.
        assert alpha >= 2, "Error: parameter 'alpha' for OCUCBH algorithm has to be >= 2."  # DEBUG
        self.alpha = alpha  #: Parameter :math:`\alpha \geq 2`.
        # The default horizon=None is rejected explicitly: on Python 3, 'None > 1' raises a TypeError
        # instead of reporting which parameter is missing.
        assert horizon is not None and horizon > 1, "Error: parameter 'horizon' for OCUCBH algorithm has to be > 1."  # DEBUG
        self.horizon = int(horizon)  #: Horizon T.

    def __str__(self):
        """ Name of the policy with its parameters, using LaTeX markup."""
        return r"OC-UCB-H($\alpha={:.3g}$, $\psi={:.3g}$, $T={}$)".format(self.alpha, self.psi, self.horizon)

    def computeIndex(self, arm):
        r""" Compute the current index, at time t and after :math:`N_k(t)` pulls of arm k:

        .. math:: I_k(t) = \frac{X_k(t)}{N_k(t)} + \sqrt{\frac{\alpha}{N_k(t)} \log(\frac{\psi T}{t})}.

        - Where :math:`\alpha` and :math:`\psi` are two parameters of the algorithm.
        - An arm with less than one pull gets the index :math:`+\infty`.
        - The logarithm is clamped at 0: it is unchanged as long as :math:`t \leq \psi T`, and past that point
          the index is simply the empirical mean (instead of failing on the square root of a negative number).
        - ``self.rewards``, ``self.pulls`` and ``self.t`` are read here but not set by this class
          (presumably they are maintained by the parent policy).
        """
        if self.pulls[arm] < 1:
            return float('+inf')
        empirical_mean = self.rewards[arm] / self.pulls[arm]
        # log(psi * T / t) < 0 as soon as t > psi * T, and sqrt() of a negative number raises ValueError
        log_term = max(0., log(self.psi * self.horizon / self.t))
        return empirical_mean + sqrt((self.alpha / self.pulls[arm]) * log_term)

    # XXX Error : division by zero ?
    # def computeAllIndex(self):
    #     """ Compute the current indexes for all arms, in a vectorized manner."""
    #     indexes = (self.rewards / self.pulls) + np.sqrt((self.alpha / self.pulls) * np.log(self.psi * self.horizon / self.t) )
    #     indexes[self.pulls < 1] = float('+inf')
    #     self.index[:] = indexes


# --- AOCUCBH
class AOCUCBH(OCUCBH):
    """ The Almost Optimally Confident UCB (OC-UCB) policy for bounded stochastic bandits. Initial version (horizon-dependent).

    - Reference: [Lattimore, 2015](https://arxiv.org/pdf/1507.07880.pdf)
    """

    def __init__(self, nbArms, horizon=None, lower=0., amplitude=1.):
        """ Create a new AOC-UCB-H policy.

        - All the arguments are forwarded to the constructor of :class:`OCUCBH`, with ``psi=2`` and ``alpha=2`` fixed.
        """
        super(AOCUCBH, self).__init__(nbArms, horizon=horizon, psi=2, alpha=2, lower=lower, amplitude=amplitude)

    def __str__(self):
        """ Name of the policy with its horizon, using LaTeX markup."""
        return r"AOC-UCB-H($T={}$)".format(self.horizon)

    def computeIndex(self, arm):
        r""" Compute the current index, at time t and after :math:`N_k(t)` pulls of arm k:

        .. math:: I_k(t) = \frac{X_k(t)}{N_k(t)} + \sqrt{\frac{2}{N_k(t)} \log(\frac{T}{N_k(t)})}.

        - An arm with less than one pull gets the index :math:`+\infty`.
        - The logarithm is clamped at 0: it is unchanged as long as :math:`N_k(t) \leq T`, and past that point
          the index is simply the empirical mean (instead of failing on the square root of a negative number).
        - ``self.rewards`` and ``self.pulls`` are read here but not set by this class
          (presumably they are maintained by a parent policy).
        """
        if self.pulls[arm] < 1:
            return float('+inf')
        empirical_mean = self.rewards[arm] / self.pulls[arm]
        # log(T / N_k) < 0 as soon as N_k > T, and sqrt() of a negative number raises ValueError
        log_term = max(0., log(self.horizon / self.pulls[arm]))
        return empirical_mean + sqrt((2 / self.pulls[arm]) * log_term)

    # XXX Error : division by zero ?
    # def computeAllIndex(self):
    #     """ Compute the current indexes for all arms, in a vectorized manner."""
    #     indexes = (self.rewards / self.pulls) + np.sqrt((2 / self.pulls) * np.log(self.horizon / self.pulls) )
    #     indexes[self.pulls < 1] = float('+inf')
    #     self.index[:] = indexes