Source code for Policies.OCUCB

# -*- coding: utf-8 -*-
""" The Optimally Confident UCB (OC-UCB) policy for bounded stochastic bandits, with sub-Gaussian noise.

- Reference: [Lattimore, 2016](https://arxiv.org/pdf/1603.08661.pdf).
- There is also a horizon-dependent version, :class:`OCUCBH.OCUCBH`, from [Lattimore, 2015](https://arxiv.org/pdf/1507.07880.pdf).
"""
from __future__ import division, print_function  # Python 2 compatibility

__author__ = "Lilian Besson"
__version__ = "0.9"

from math import exp, sqrt, log
import numpy as np
np.seterr(divide='ignore')  # XXX dangerous in general, controlled here!

try:
    from .UCB import UCB
except ImportError:
    from UCB import UCB

#: Default value for parameter :math:`\eta > 1` for OCUCB.
ETA = 2

#: Default value for parameter :math:`\rho \in (1/2, 1]` for OCUCB.
RHO = 1


[docs]class OCUCB(UCB):
    """ The Optimally Confident UCB (OC-UCB) policy for bounded stochastic bandits, with sub-Gaussian noise.

    - Reference: [Lattimore, 2016](https://arxiv.org/pdf/1603.08661.pdf).
    """

[docs]    def __init__(self, nbArms, eta=ETA, rho=RHO, lower=0., amplitude=1.):
        super(OCUCB, self).__init__(nbArms, lower=lower, amplitude=amplitude)
        assert eta > 1, "Error: parameter 'eta' for OCUCB algorithm has to be > 1."  # DEBUG
        self.eta = eta  #: Parameter :math:`\eta > 1`.
        assert 0.5 < rho <= 1, "Error: parameter 'rho' for OCUCB algorithm has to be in (1/2, 1]."  # DEBUG
        self.rho = rho  #: Parameter :math:`\rho \in (1/2, 1]`.

[docs]    def __str__(self):
        return r"OC-UCB($\eta={:.3g}$, $\rho={:.3g}$)".format(self.eta, self.rho)

[docs]    def _Bterm(self, k):
        r""" Compute the extra term :math:`B_k(t)` as follows:

        .. math::

            B_k(t) &= \max\Big\{ \exp(1), \log(t), t \log(t) / C_k(t) \Big\},\\
            \text{where}\; C_k(t) &= \sum_{j=1}^{K} \min\left\{ T_k(t), T_j(t)^{\rho} T_k(t)^{1 - \rho} \right\}
        """
        t = self.t
        T_ = self.pulls
        C_kt = sum(min(T_[k], (T_[j] ** self.rho) * (T_[k] ** (1. - self.rho))) for j in range(self.nbArms))
        return max([exp(1), log(t), t * log(t) / C_kt])

[docs]    def _Bterms(self):
        r""" Compute all the extra terms, :math:`B_k(t)` for each arm k, in a naive manner, not optimized to be vectorial, but it works."""
        return np.array([self._Bterm(k) for k in range(self.nbArms)])

[docs]    def computeIndex(self, arm):
        r""" Compute the current index, at time t and after :math:`N_k(t)` pulls of arm k:

        .. math:: I_k(t) = \frac{X_k(t)}{N_k(t)} + \sqrt{\frac{2 \eta \log(B_k(t))}{N_k(t)}}.

        - Where :math:`\eta` is a parameter of the algorithm,
        - And :math:`B_k(t)` is the additional term defined above.
        """
        if self.pulls[arm] < 1:
            return float('+inf')
        else:
            return (self.rewards[arm] / self.pulls[arm]) + sqrt(2 * self.eta * log(self._Bterm(arm)) / self.pulls[arm])