Source code for Policies.IMED

# -*- coding: utf-8 -*-
""" The IMED policy of [Honda & Takemura, JMLR 2015].

- Reference: [["Non-asymptotic analysis of a new bandit algorithm for semi-bounded rewards", J. Honda and A. Takemura, JMLR, 2015](http://jmlr.csail.mit.edu/papers/volume16/honda15a/honda15a.pdf)].
"""
from __future__ import division, print_function  # Python 2 compatibility

__author__ = "Lilian Besson"
__version__ = "0.9"

import numpy as np
np.seterr(divide='ignore')  # XXX dangerous in general, controlled here!

try:
    from .DMED import DMED
    from .usenumba import jit
    from .kullback import klBern
except ImportError:
    from DMED import DMED
    from usenumba import jit
    from kullback import klBern


# --- Utility functions Dinf

from scipy.optimize import minimize_scalar
from warnings import filterwarnings
filterwarnings("ignore", message="Method 'bounded' does not support relative tolerance in x; defaulting to absolute tolerance", category=RuntimeWarning)


def Dinf(x=None, mu=None, kl=klBern, lowerbound=0, upperbound=1, precision=1e-6, max_iterations=50):
    r""" The generic :math:`D_{\inf}` index computation.

    - ``x``: value of the cumulated reward,
    - ``mu``: upper bound on the mean ``y``,
    - ``kl``: the KL divergence to be used (:func:`klBern`, :func:`klGauss`, etc),
    - ``lowerbound``, ``upperbound=1``: the known bounds of the values ``y`` and ``x``,
    - ``precision=1e-6``: the threshold from where to stop the search,
    - ``max_iterations``: max number of iterations of the loop (safer to bound it to reduce time complexity).

    .. math:: D_{\inf}(x, \mu) \simeq \inf_{\max(\mu, \mathrm{lowerbound}) \leq y \leq \mathrm{upperbound}} \mathrm{kl}(x, y).

    .. note:: It uses one call to :func:`scipy.optimize.minimize_scalar`. If this fails, it falls back to a **bisection search**, with one call to ``kl`` for each step of the bisection.
    """
    # lower and upper bounds of the optimization interval
    l = max(lowerbound, mu)
    u = upperbound

    def f(i):
        return kl(x, i)  # objective function!

    res = minimize_scalar(f, bounds=[l, u], method="bounded", tol=precision, options={"maxiter": max_iterations, "disp": False})
    if hasattr(res, "x"):
        # return the minimal divergence value (not the minimizer), consistently with the bisection fallback below
        return res.fun
    else:
        print("Warning: the call to scipy.optimize.minimize_scalar failed, using hand-written bisection search instead...")  # DEBUG
        # start in the middle of the interval
        value = (l + u) / 2.
        current_kl = kl(x, value)
        _count_iteration = 0
        while _count_iteration < max_iterations and u - value > precision:
            _count_iteration += 1
            # evaluate the midpoint of the current interval
            value = (l + u) / 2.
            new_kl = kl(x, value)
            if new_kl > current_kl:
                # need to go left
                u = value
            else:
                # need to go right
                current_kl = new_kl
                l = value
        return current_kl
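
# Illustrative example (not part of the original module): for Bernoulli arms,
# y -> klBern(x, y) is increasing on [x, 1], so when mu > x the infimum over
# [max(mu, lowerbound), upperbound] is attained at y = mu, and Dinf(x, mu)
# should be close to klBern(x, mu).  Values below are indicative only:
#
#     >>> Dinf(x=0.1, mu=0.9, kl=klBern)   # doctest: +SKIP
#     1.7577...                            # ~= klBern(0.1, 0.9)
#     >>> Dinf(x=0.9, mu=0.1, kl=klBern)   # doctest: +SKIP
#     0.0                                  # infimum attained at y = x = 0.9
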
# --- IMED

class IMED(DMED):
    """ The IMED policy of [Honda & Takemura, JMLR 2015].

    - Reference: [["Non-asymptotic analysis of a new bandit algorithm for semi-bounded rewards", J. Honda and A. Takemura, JMLR, 2015](http://jmlr.csail.mit.edu/papers/volume16/honda15a/honda15a.pdf)].
    """

    def __init__(self, nbArms, tolerance=1e-4, kl=klBern, lower=0., amplitude=1.):
        super(IMED, self).__init__(nbArms, tolerance=tolerance, kl=kl, lower=lower, amplitude=amplitude)

    def __str__(self):
        return r"IMED({})".format(self.kl.__name__[2:])

    def one_Dinf(self, x, mu):
        r""" Compute the :math:`D_{\inf}` solution, for one value of ``x`` and one value of ``mu``."""
        return Dinf(x=x, mu=mu, kl=self.kl, lowerbound=self.lower, upperbound=self.lower + self.amplitude, precision=self.tolerance)

    # XXX Use this hack to vectorize one_Dinf?
    # Dinf = np.vectorize(one_Dinf)

    def Dinf(self, xs, mu):
        r""" Compute the :math:`D_{\inf}` solution, for a vector of values ``xs`` and one value of ``mu``."""
        return np.array([self.one_Dinf(x, mu) for x in xs])

    def choice(self):
        r""" Choose an arm with **minimal** index (uniformly at random among the minimizers):

        .. math:: A(t) \sim U(\arg\min_{1 \leq k \leq K} I_k(t)).

        Where the indexes are:

        .. math:: I_k(t) = N_k(t) D_{\inf}(\hat{\mu}_k(t), \max_{k'} \hat{\mu}_{k'}(t)) + \log(N_k(t)).
        """
        empiricalMeans = self.rewards / self.pulls
        bestEmpiricalMean = np.max(empiricalMeans)
        values_Dinf = self.Dinf(empiricalMeans, bestEmpiricalMean)
        # now compute the indexes
        indexes_to_minimize = self.pulls * values_Dinf + np.log(self.pulls)
        indexes_to_minimize[self.pulls < 1] = float('-inf')
        # then do as IndexPolicy but with a min instead
        # try:
        return np.random.choice(np.nonzero(indexes_to_minimize == np.min(indexes_to_minimize))[0])
        # except ValueError:
        #     if not np.all(np.isnan(indexes_to_minimize)):
        #         raise ValueError("Error: unknown error in IMED.choice(): the indexes were {} but couldn't be used to select an arm.".format(indexes_to_minimize))
        #     return np.random.randint(self.nbArms)
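
# Minimal usage sketch (illustrative only, not part of the original module): it
# assumes the standard SMPyBandits policy interface, i.e. that IMED inherits
# startGame(), getReward(arm, reward) and choice() from DMED / BasePolicy.
#
#     import numpy as np
#     from SMPyBandits.Policies import IMED
#
#     means = [0.1, 0.5, 0.9]        # hypothetical Bernoulli arms
#     horizon = 1000
#     policy = IMED(nbArms=len(means))
#     policy.startGame()
#     rng = np.random.default_rng(42)
#     for t in range(horizon):
#         arm = policy.choice()                      # arm with minimal IMED index
#         reward = float(rng.random() < means[arm])  # draw a Bernoulli reward
#         policy.getReward(arm, reward)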