Source code for Policies.ApproximatedFHGittins

# -*- coding: utf-8 -*-
r""" The approximated Finite-Horizon Gittins index policy for bounded bandits.

- This is not the computationally costly Gittins index, but a simple approximation, using the knowledge of the horizon T.
- Reference: [Lattimore - COLT, 2016](http://www.jmlr.org/proceedings/papers/v49/lattimore16.pdf), and [his COLT presentation](https://youtu.be/p8AwKiudhZ4?t=276)
"""
from __future__ import division, print_function  # Python 2 compatibility

__author__ = "Lilian Besson"
__version__ = "0.9"

import numpy as np
np.seterr(divide='ignore')  # XXX dangerous in general, controlled here!

try:
    from .IndexPolicy import IndexPolicy
except ImportError:
    from IndexPolicy import IndexPolicy

#: Default value for the parameter :math:`\alpha > 0` for ApproximatedFHGittins.
# ALPHA = 0.125  # previous value, overridden just below
ALPHA = 0.5

#: Default value for the parameter :math:`\tau \geq 1` that is used to artificially increase the horizon, from :math:`T` to :math:`\tau T`.
DISTORTION_HORIZON = 1.01
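
# --- Illustrative sketch (not part of the original module) ---
# A minimal standalone evaluation of the approximated index formula used by the
# class below, for a single arm with bounded rewards; the helper name and its
# arguments are hypothetical and only serve to make the formula concrete.
def _approx_fh_gittins_index(sum_rewards, pulls, m, alpha=ALPHA):
    """ Index = mean + sqrt((2*alpha/N) * log((m/N) / sqrt(max(0, log(m/N))))). """
    if pulls < 1:
        return float('+inf')
    m_by_Nk = float(m) / pulls
    return (sum_rewards / pulls) + np.sqrt((2. * alpha) / pulls * np.log(m_by_Nk / np.sqrt(max(0., np.log(m_by_Nk)))))

# For instance, 7 successes in 10 pulls with m = 100 remaining steps gives
# _approx_fh_gittins_index(7., 10, 100) = 0.7 + 0.43 = 1.13 approximately.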


class ApproximatedFHGittins(IndexPolicy):
    r""" The approximated Finite-Horizon Gittins index policy for bounded bandits.

    - This is not the computationally costly Gittins index, but a simple approximation, using the knowledge of the horizon T.
    - Reference: [Lattimore - COLT, 2016](http://www.jmlr.org/proceedings/papers/v49/lattimore16.pdf), and [his COLT presentation](https://youtu.be/p8AwKiudhZ4?t=276)
    """

    def __init__(self, nbArms, horizon=None, alpha=ALPHA, distortion_horizon=DISTORTION_HORIZON, lower=0., amplitude=1.):
        super(ApproximatedFHGittins, self).__init__(nbArms, lower=lower, amplitude=amplitude)
        assert alpha > 0, "Error: parameter 'alpha' for ApproximatedFHGittins should be > 0."  # DEBUG
        self.alpha = alpha  #: Parameter :math:`\alpha > 0`.
        assert distortion_horizon >= 1, "Error: parameter 'distortion_horizon' for ApproximatedFHGittins should be >= 1."  # DEBUG
        self.distortion_horizon = distortion_horizon  #: Parameter :math:`\tau \geq 1`.
        self.horizon = int(horizon) if horizon is not None else None  #: Parameter :math:`T` = known horizon of the experiment.

    def __str__(self):
        if self.alpha == ALPHA:
            return r"ApprFHG($T={}$)".format(self.horizon)
        else:
            return r"ApprFHG($T={}$, $\alpha={:.3g}$)".format(self.horizon, self.alpha)

    @property
    def m(self):
        r""":math:`m = T - t + 1` is the number of steps to be played until the end of the game.

        .. note:: The article does not explain how to deal with an unknown horizon; if :math:`T` is under-estimated, this :math:`m` eventually becomes negative. Empirically, it is forced to be :math:`\geq 1`, to not mess up the :math:`\log(m)` term used below, by using :math:`\tau T` instead of :math:`T` (e.g., :math:`\tau = 1.01` is enough to not ruin the performance in the last steps of the experiment).
        """
        return max((self.distortion_horizon * self.horizon) - self.t + 1, 1)
        # return max(self.horizon - self.t + 1, 1)

    # --- Computation

    def computeIndex(self, arm):
        r""" Compute the current index, at time :math:`t` and after :math:`N_k(t)` pulls of arm :math:`k`:

        .. math::

            I_k(t) &= \frac{X_k(t)}{N_k(t)} + \sqrt{\frac{2 \alpha}{N_k(t)} \log\left( \frac{m}{N_k(t) \log^{1/2}\left( \frac{m}{N_k(t)} \right)} \right)}, \\
            \text{where}\;\; & m = T - t + 1.

        .. note:: The :math:`\log^{1/2}(\dots) = \sqrt{\log(\dots)}` term can be *undefined*, as soon as :math:`m < N_k(t)`, so empirically :math:`\sqrt{\max(0, \log(\dots))}` is used instead, or a larger horizon can be used to make :math:`m` artificially larger (e.g., :math:`T' = 1.1 T`).
        """
        if self.pulls[arm] < 1:
            return float('+inf')
        else:
            m_by_Nk = float(self.m) / self.pulls[arm]
            return (self.rewards[arm] / self.pulls[arm]) + np.sqrt((2. * self.alpha) / self.pulls[arm] * np.log(m_by_Nk / np.sqrt(max(0., np.log(m_by_Nk)))))

    def computeAllIndex(self):
        """ Compute the current indexes for all arms, in a vectorized manner."""
        m_by_Nk = float(self.m) / self.pulls
        indexes = (self.rewards / self.pulls) + np.sqrt((2. * self.alpha) / self.pulls * np.log(m_by_Nk / np.sqrt(np.maximum(0, np.log(m_by_Nk)))))
        indexes[self.pulls < 1] = float('+inf')
        self.index[:] = indexes
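
# --- Example usage (not part of the original module) ---
# A minimal sketch of how this policy is typically driven, assuming the usual
# startGame() / choice() / getReward(arm, reward) interface inherited from
# IndexPolicy; the Bernoulli arms and their means below are purely illustrative.
if __name__ == '__main__':
    rng = np.random.RandomState(42)
    means = [0.1, 0.5, 0.9]  # hypothetical Bernoulli arms
    horizon = 1000
    policy = ApproximatedFHGittins(nbArms=len(means), horizon=horizon)
    policy.startGame()
    for _ in range(horizon):
        arm = policy.choice()  # pull the arm with the largest index
        reward = float(rng.random_sample() < means[arm])  # Bernoulli reward
        policy.getReward(arm, reward)  # update sums of rewards and pull counts
    print("Number of pulls per arm:", policy.pulls)  # the best arm should dominate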