Source code for Policies.OCUCB

# -*- coding: utf-8 -*-
""" The Optimally Confident UCB (OC-UCB) policy for bounded stochastic bandits, with sub-Gaussian noise.

- Reference: [Lattimore, 2016](https://arxiv.org/pdf/1603.08661.pdf).
- There is also a horizon-dependent version, :class:`OCUCBH.OCUCBH`, from [Lattimore, 2015](https://arxiv.org/pdf/1507.07880.pdf).
"""
from __future__ import division, print_function  # Python 2 compatibility

__author__ = "Lilian Besson"
__version__ = "0.9"

from math import exp, sqrt, log
import numpy as np
np.seterr(divide='ignore')  # XXX dangerous in general, controlled here!

try:
    from .UCB import UCB
except ImportError:
    from UCB import UCB

#: Default value for parameter :math:`\eta > 1` for OCUCB.
ETA = 2

#: Default value for parameter :math:`\rho \in (1/2, 1]` for OCUCB.
RHO = 1


[docs]class OCUCB(UCB): """ The Optimally Confident UCB (OC-UCB) policy for bounded stochastic bandits, with sub-Gaussian noise. - Reference: [Lattimore, 2016](https://arxiv.org/pdf/1603.08661.pdf). """
[docs] def __init__(self, nbArms, eta=ETA, rho=RHO, lower=0., amplitude=1.): super(OCUCB, self).__init__(nbArms, lower=lower, amplitude=amplitude) assert eta > 1, "Error: parameter 'eta' for OCUCB algorithm has to be > 1." # DEBUG self.eta = eta #: Parameter :math:`\eta > 1`. assert 0.5 < rho <= 1, "Error: parameter 'rho' for OCUCB algorithm has to be in (1/2, 1]." # DEBUG self.rho = rho #: Parameter :math:`\rho \in (1/2, 1]`.
[docs] def __str__(self): return r"OC-UCB($\eta={:.3g}$, $\rho={:.3g}$)".format(self.eta, self.rho)
[docs] def _Bterm(self, k): r""" Compute the extra term :math:`B_k(t)` as follows: .. math:: B_k(t) &= \max\Big\{ \exp(1), \log(t), t \log(t) / C_k(t) \Big\},\\ \text{where}\; C_k(t) &= \sum_{j=1}^{K} \min\left\{ T_k(t), T_j(t)^{\rho} T_k(t)^{1 - \rho} \right\} """ t = self.t T_ = self.pulls C_kt = sum(min(T_[k], (T_[j] ** self.rho) * (T_[k] ** (1. - self.rho))) for j in range(self.nbArms)) return max([exp(1), log(t), t * log(t) / C_kt])
[docs] def _Bterms(self): r""" Compute all the extra terms, :math:`B_k(t)` for each arm k, in a naive manner, not optimized to be vectorial, but it works.""" return np.array([self._Bterm(k) for k in range(self.nbArms)])
[docs] def computeIndex(self, arm): r""" Compute the current index, at time t and after :math:`N_k(t)` pulls of arm k: .. math:: I_k(t) = \frac{X_k(t)}{N_k(t)} + \sqrt{\frac{2 \eta \log(B_k(t))}{N_k(t)}}. - Where :math:`\eta` is a parameter of the algorithm, - And :math:`B_k(t)` is the additional term defined above. """ if self.pulls[arm] < 1: return float('+inf') else: return (self.rewards[arm] / self.pulls[arm]) + sqrt(2 * self.eta * log(self._Bterm(arm)) / self.pulls[arm])