Source code for Policies.UCBdagger

# -*- coding: utf-8 -*-
r""" The UCB-dagger (:math:`\mathrm{UCB}{\dagger}`, UCB†) policy, a significant improvement over UCB by auto-tuning the confidence level.

- Reference: [[Auto-tuning the Confidence Level for Optimistic Bandit Strategies, Lattimore, unpublished, 2017]](http://tor-lattimore.com/)
"""
from __future__ import division, print_function  # Python 2 compatibility

__author__ = "Lilian Besson"
__version__ = "0.9"

import numpy as np
np.seterr(divide='ignore')  # XXX dangerous in general, controlled here!
_e = np.e

try:
    from .IndexPolicy import IndexPolicy
except ImportError:
    from IndexPolicy import IndexPolicy

#: Default value for the parameter :math:`\alpha > 0` for UCBdagger.
ALPHA = 1


def log_bar(x):
    r"""The function defined as :math:`\mathrm{l\overline{og}}` by Lattimore:

    .. math:: \mathrm{l\overline{og}}(x) := \log\left((x+e)\sqrt{\log(x+e)}\right)

    Some values:

    >>> for x in np.logspace(0, 7, 8):
    ...     print("x = {:<5.3g} gives log_bar(x) = {:<5.3g}".format(x, log_bar(x)))
    x = 1     gives log_bar(x) = 1.45
    x = 10    gives log_bar(x) = 3.01
    x = 100   gives log_bar(x) = 5.4
    x = 1e+03 gives log_bar(x) = 7.88
    x = 1e+04 gives log_bar(x) = 10.3
    x = 1e+05 gives log_bar(x) = 12.7
    x = 1e+06 gives log_bar(x) = 15.1
    x = 1e+07 gives log_bar(x) = 17.5

    Illustration:

    >>> import matplotlib.pyplot as plt
    >>> X = np.linspace(0, 1000, 2000)
    >>> Y = log_bar(X)
    >>> plt.plot(X, Y)
    >>> plt.title(r"The $\mathrm{l\overline{og}}$ function")
    >>> plt.show()
    """
    return np.log((x + _e) * np.sqrt(np.log(x + _e)))

def Ki_function(pulls, i):
    r"""Compute the :math:`K_i(t)` index as defined in the article, for one arm i."""
    if pulls[i] <= 0:
        return len(pulls)
    else:
        return np.sum(np.minimum(1., np.sqrt(pulls / pulls[i])))

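# A quick sanity check of Ki_function (illustration only, these numbers are not from the article):
# with pulls = np.array([1., 4., 9.]),
#   Ki_function(pulls, 2) = min(1, sqrt(1/9)) + min(1, sqrt(4/9)) + min(1, sqrt(9/9))
#                         = 1/3 + 2/3 + 1 = 2.0,
# while Ki_function(pulls, 0) = 1 + 1 + 1 = 3, since every ratio is clipped at 1.
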
def Ki_vectorized(pulls):
    r"""Compute the :math:`K_i(t)` index as defined in the article, for all arms (in a vectorized manner).

    .. warning:: I didn't find a fast vectorized formula, so don't use this one.
    """
    return np.array([Ki_function(pulls, i) for i in range(len(pulls))])
    # ratios = np.zeros_like(pulls)
    # for i, p in enumerate(pulls):
    #     ratios[i] = np.sum(np.minimum(1., np.sqrt(pulls / p)))
    # return ratios


# --- The UCBdagger policy

class UCBdagger(IndexPolicy):
    r""" The UCB-dagger (:math:`\mathrm{UCB}{\dagger}`, UCB†) policy, a significant improvement over UCB by auto-tuning the confidence level.

    - Reference: [[Auto-tuning the Confidence Level for Optimistic Bandit Strategies, Lattimore, unpublished, 2017]](http://downloads.tor-lattimore.com/papers/XXX)
    """
    def __init__(self, nbArms, horizon=None, alpha=ALPHA, lower=0., amplitude=1.):
        super(UCBdagger, self).__init__(nbArms, lower=lower, amplitude=amplitude)
        assert alpha > 0, "Error: parameter 'alpha' for UCBdagger should be > 0."  # DEBUG
        self.alpha = alpha  #: Parameter :math:`\alpha > 0`.
        assert horizon is not None and horizon > 0, "Error: parameter 'horizon' for UCBdagger should be > 0."  # DEBUG
        self.horizon = int(horizon)  #: Parameter :math:`T > 0`.
        # # Internal memory
        # self.indexes_K = nbArms * np.ones(nbArms)  #: Keep in memory the indexes :math:`K_i(t)`, to speed up their computation from :math:`\mathcal{O}(K^2)` to :math:`\mathcal{O}(K)` (where :math:`K` is the number of arms).
    def __str__(self):
        if self.alpha == ALPHA:
            return r"UCB$\dagger$($T={}$)".format(self.horizon)
        else:
            return r"UCB$\dagger$($T={}$, $\alpha={:.3g}$)".format(self.horizon, self.alpha)
    def getReward(self, arm, reward):
        """ Give a reward: increase t, pulls, and update cumulated sum of rewards for that arm (normalized in [0, 1])."""
        # Do this in less than O(K) time?
        # I don't see the point: maintaining a list of arms sorted by their N_k(t) actually takes more time than just looping.
        # # XXX old fashion
        # # for other_arm in range(self.nbArms):
        # #     if self.pulls[other_arm] > self.pulls[arm]:
        # # Cf. Appendix D. Computation of UCB†
        # for other_arm in np.where(self.pulls > self.pulls[arm])[0]:
        #     self.indexes_K[other_arm] += np.sqrt((self.pulls[arm] + 1) / self.pulls[other_arm]) - np.sqrt(self.pulls[arm] / self.pulls[other_arm])
        # # Just debugging
        # assert np.all(self.indexes_K >= 0), "Error: an index K was found < 0..."  # DEBUG
        # _K = Ki_vectorized(self.pulls)
        # for other_arm in range(self.nbArms):
        #     # assert self.indexes_K[other_arm] == _K[other_arm], "Error: an index K = {:.3g} was found wrong for arm {} (the reference value was {:.3g})...".format(self.indexes_K[other_arm], other_arm, _K[other_arm])  # DEBUG
        #     if not (self.indexes_K[other_arm] == _K[other_arm]):
        #         print("Error: an index K = {:.3g} was found wrong for arm {} (the reference value was {:.3g})...".format(self.indexes_K[other_arm], other_arm, _K[other_arm]))  # DEBUG
        # Now actually store the reward and pull
        super(UCBdagger, self).getReward(arm, reward)
    # --- Computation
    def computeIndex(self, arm):
        r""" Compute the current index, at time t and after :math:`N_k(t)` pulls of arm k:

        .. math::

            I_k(t) &= \frac{X_k(t)}{N_k(t)} + \sqrt{\frac{2 \alpha}{N_k(t)} \mathrm{l\overline{og}}\left( \frac{T}{H_k(t)} \right)}, \\
            \text{where}\;\; & H_k(t) := N_k(t) K_k(t), \\
            \text{and}\;\; & K_k(t) := \sum_{j=1}^{K} \min\left(1, \sqrt{\frac{N_j(t)}{N_k(t)}}\right).
        """
        if self.pulls[arm] < 1:
            return float('+inf')
        else:
            # XXX Use the internal indexes (faster?) if they work
            # H_arm = self.pulls[arm] * self.indexes_K[arm]
            H_arm = self.pulls[arm] * Ki_function(self.pulls, arm)
            return (self.rewards[arm] / self.pulls[arm]) + np.sqrt(((2. * self.alpha) / self.pulls[arm]) * log_bar(self.horizon / H_arm))
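    # Worked example (illustration only; the numbers below are assumptions, not from the article):
    # with T = 1000, alpha = 1, N_k(t) = 10 pulls, X_k(t) = 7 cumulated reward and K_k(t) = 2.0,
    # we get H_k(t) = 10 * 2.0 = 20, and the index is
    #   7/10 + sqrt((2 * 1 / 10) * log_bar(1000 / 20)) = 0.7 + sqrt(0.2 * log_bar(50)) ~= 0.7 + 0.96 ~= 1.66.
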
    # def computeAllIndex(self):
    #     """ Compute the current indexes for all arms, in a vectorized manner."""
    #     # H_arms = self.pulls * Ki_vectorized(self.pulls)
    #     H_arms = self.pulls * self.indexes_K
    #     indexes = (self.rewards / self.pulls) + np.sqrt(((2. * self.alpha) / self.pulls) * log_bar(self.horizon / H_arms))
    #     indexes[self.pulls < 1] = float('+inf')
    #     self.index[:] = indexes
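
# --- Small demo (not part of the original module)

if __name__ == "__main__":
    # A minimal sketch of how the policy is typically used, assuming the usual
    # startGame() / choice() / getReward() interface of IndexPolicy in this project.
    # The Bernoulli means below are arbitrary illustration values.
    np.random.seed(0)  # only for reproducibility of this demo
    means = [0.1, 0.5, 0.9]
    horizon = 1000
    policy = UCBdagger(nbArms=len(means), horizon=horizon)
    policy.startGame()
    for _ in range(horizon):
        arm = policy.choice()
        reward = float(np.random.random() < means[arm])  # Bernoulli reward for the chosen arm
        policy.getReward(arm, reward)
    print("After {} steps, pulls = {} (the best arm is #{})".format(horizon, policy.pulls, np.argmax(means)))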