Source code for Policies.SlidingWindowUCB

# -*- coding: utf-8 -*-
r""" An experimental policy, using only a sliding window (of for instance :math:`\tau=1000` *steps*, not counting draws of each arms) instead of using the full-size history.

- Reference: [On Upper-Confidence Bound Policies for Non-Stationary Bandit Problems, by A.Garivier & E.Moulines, ALT 2011](https://arxiv.org/pdf/0805.3415.pdf)

- It uses an additional :math:`\mathcal{O}(\tau)` memory but does not cost anything more in terms of time complexity (the average is maintained over the sliding window and costs :math:`\mathcal{O}(1)` at every time step).

.. warning:: This is very experimental!

.. note:: This is similar to :class:`SlidingWindowRestart.SWR_UCB` but slightly different: :class:`SlidingWindowRestart.SWR_UCB` uses a window of size :math:`T_0=100` to keep in memory the last 100 *draws* of *each* arm, and restarts the index if the small-window mean is too far away from the full-history mean, whereas this :class:`SWUCB` uses a fixed-size window of size :math:`\tau=1000` to keep in memory the last 1000 *steps* overall.
"""
from __future__ import division, print_function  # Python 2 compatibility

__author__ = "Lilian Besson"
__version__ = "0.9"

from math import log, sqrt
import numpy as np
np.seterr(divide='ignore')  # XXX dangerous in general, controlled here!

try:
    from .IndexPolicy import IndexPolicy
except ImportError:
    from IndexPolicy import IndexPolicy


#: Size of the sliding window.
TAU = 1000

#: Default value for the constant :math:`\alpha`.
ALPHA = 1.0


# --- Manually written


class SWUCB(IndexPolicy):
    r""" An experimental policy, using only a sliding window (of, for instance, :math:`\tau=1000` *steps*, not counting draws of each arm) instead of using the full-size history. """

    def __init__(self, nbArms, tau=TAU, alpha=ALPHA, *args, **kwargs):
        super(SWUCB, self).__init__(nbArms, *args, **kwargs)
        # New parameters
        assert 1 <= tau, "Error: parameter 'tau' for class SWUCB has to be >= 1, but was {}.".format(tau)  # DEBUG
        self.tau = int(tau)  #: Size :math:`\tau` of the sliding window.
        assert alpha > 0, "Error: parameter 'alpha' for class SWUCB has to be > 0, but was {}.".format(alpha)  # DEBUG
        self.alpha = alpha  #: Constant :math:`\alpha` in the square-root term of the index.
        # Internal memory
        self.last_rewards = np.zeros(tau)  #: Keep in memory all the rewards obtained in the last :math:`\tau` steps.
        self.last_choices = np.full(tau, -1)  #: Keep in memory the arms chosen in the last :math:`\tau` steps.

    def __str__(self):
        return r"SW-UCB($\tau={}${})".format(
            self.tau,
            r", $\alpha={:.3g}$".format(self.alpha) if self.alpha != ALPHA else "",
        )

    def getReward(self, arm, reward):
        """Give a reward: increase the time t and store the arm that was played and its reward (normalized to [0, 1]) in the small history (sliding window)."""
        now = self.t % self.tau
        # Get reward, normalized to [0, 1]
        reward = (reward - self.lower) / self.amplitude
        # We saw this arm one more time
        self.last_choices[now] = arm
        # Store it in place for the empirical average of that arm
        self.last_rewards[now] = reward
        self.t += 1

    def computeIndex(self, arm):
        r""" Compute the current index, at time :math:`t` and after :math:`N_{k,\tau}(t)` pulls of arm :math:`k`:

        .. math::

            I_k(t) &= \frac{X_{k,\tau}(t)}{N_{k,\tau}(t)} + c_{k,\tau}(t),\\
            \text{where}\;\; c_{k,\tau}(t) &:= \sqrt{\alpha \frac{\log(\min(t,\tau))}{N_{k,\tau}(t)}},\\
            \text{and}\;\; X_{k,\tau}(t) &:= \sum_{s=t-\tau+1}^{t} X_k(s) \mathbb{1}(A(s) = k),\\
            \text{and}\;\; N_{k,\tau}(t) &:= \sum_{s=t-\tau+1}^{t} \mathbb{1}(A(s) = k).
        """
        last_pulls_of_this_arm = np.count_nonzero(self.last_choices == arm)
        if last_pulls_of_this_arm < 1:
            return float('+inf')
        else:
            return (
                (np.sum(self.last_rewards[self.last_choices == arm]) / last_pulls_of_this_arm)
                + sqrt((self.alpha * log(min(self.t, self.tau))) / last_pulls_of_this_arm)
            )

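
# --- Illustrative sketch (not part of the original module): how the sliding
# window drives the index above.  It assumes the usual SMPyBandits interface
# inherited from IndexPolicy (``startGame``, ``getReward``); the helper name
# ``_demo_swucb_window`` is hypothetical and the function is never called here.

def _demo_swucb_window():
    """Feed a small non-stationary reward stream to a SWUCB with a tiny window."""
    policy = SWUCB(nbArms=2, tau=8, alpha=1.0)
    policy.startGame()
    # Arm 0 is good for the first 20 steps, then arm 1 becomes the good one.
    for t in range(40):
        arm = t % 2
        good_arm = 0 if t < 20 else 1
        policy.getReward(arm, 1.0 if arm == good_arm else 0.0)
    # Only the last tau = 8 steps are remembered, so the indexes now favor arm 1.
    print([policy.computeIndex(arm) for arm in range(2)])
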
# --- Horizon dependent version

class SWUCBPlus(SWUCB):
    r""" An experimental policy, using only a sliding window (of :math:`\tau` *steps*, not counting draws of each arm) instead of using the full-size history.

    - Uses :math:`\tau = 4 \sqrt{T \log(T)}` if the horizon :math:`T` is given, otherwise uses the default value.
    """

    def __init__(self, nbArms, horizon=None, *args, **kwargs):
        # New parameter
        if horizon is not None:
            T = int(horizon)
            tau = int(4 * sqrt(T * log(T)))
        else:
            tau = TAU
        super(SWUCBPlus, self).__init__(nbArms, tau=tau, *args, **kwargs)

    def __str__(self):
        return r"SW-UCB+($\tau={}$, $\alpha={:.3g}$)".format(self.tau, self.alpha)

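
# --- Illustrative sketch (not part of the original module): the window sizes
# that the rule tau = 4 * sqrt(T * log(T)) above produces for a few horizons.
# The helper name ``_demo_swucbplus_window_sizes`` is hypothetical and the
# function is never called here.

def _demo_swucbplus_window_sizes():
    """Print the horizon-dependent window size used by SWUCBPlus for a few horizons T."""
    for T in (1000, 10000, 100000):
        tau = int(4 * sqrt(T * log(T)))
        print("T = {:>6} -> tau = {}".format(T, tau))
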

# --- SW-klUCB

try:
    from .kullback import klucbBern
except (ImportError, SystemError):
    from kullback import klucbBern

#: Default value for the constant c used in the computation of KL-UCB index.
constant_c = 1.  #: default value, as it was in pymaBandits v1.0
# c = 1.  #: as suggested in the Theorem 1 in https://arxiv.org/pdf/1102.2490.pdf

#: Default value for the tolerance for computing numerical approximations of the kl-UCB indexes.
tolerance = 1e-4

class SWklUCB(SWUCB):
    r""" An experimental policy, using only a sliding window (of :math:`\tau` *steps*, not counting draws of each arm) instead of using the full-size history, and using klUCB indexes (see :class:`Policy.klUCB`) instead of UCB indexes. """

    def __init__(self, nbArms, tau=TAU, klucb=klucbBern, *args, **kwargs):
        super(SWklUCB, self).__init__(nbArms, tau=tau, *args, **kwargs)
        self.klucb = klucb  #: kl function to use

    def __str__(self):
        name = self.klucb.__name__[5:]
        if name == "Bern":
            name = ""
        if name != "":
            name = "({})".format(name)
        return r"SW-klUCB{}($\tau={}$)".format(name, self.tau)

    def computeIndex(self, arm):
        r""" Compute the current index, at time :math:`t` and after :math:`N_{k,\tau}(t)` pulls of arm :math:`k`:

        .. math::

            \hat{\mu'}_k(t) &= \frac{X_{k,\tau}(t)}{N_{k,\tau}(t)},\\
            U_k(t) &= \sup\limits_{q \in [a, b]} \left\{ q : \mathrm{kl}(\hat{\mu'}_k(t), q) \leq \frac{c \log(\min(t,\tau))}{N_{k,\tau}(t)} \right\},\\
            I_k(t) &= U_k(t),\\
            \text{where}\;\; X_{k,\tau}(t) &:= \sum_{s=t-\tau+1}^{t} X_k(s) \mathbb{1}(A(s) = k),\\
            \text{and}\;\; N_{k,\tau}(t) &:= \sum_{s=t-\tau+1}^{t} \mathbb{1}(A(s) = k).

        Here the rewards are assumed to lie in :math:`[a, b]` (default :math:`[0, 1]`), :math:`\mathrm{kl}(x, y)` is the Kullback-Leibler divergence between two distributions of means :math:`x` and :math:`y` (see :mod:`Arms.kullback`), and :math:`c` is the exploration parameter (default 1).
        """
        last_pulls_of_this_arm = np.count_nonzero(self.last_choices == arm)
        if last_pulls_of_this_arm < 1:
            return float('+inf')
        else:
            mean = np.sum(self.last_rewards[self.last_choices == arm]) / last_pulls_of_this_arm
            level = constant_c * log(min(self.t, self.tau)) / last_pulls_of_this_arm
            return self.klucb(mean, level, tolerance)

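
# --- Illustrative sketch (not part of the original module): how the kl-UCB
# upper bound computed by ``klucbBern`` compares with the plain UCB bonus of
# SWUCB, for one empirical mean and one exploration level.  The helper name
# ``_demo_klucb_vs_ucb_bound`` is hypothetical and the function is never
# called here.

def _demo_klucb_vs_ucb_bound():
    """Compare the kl-UCB upper bound with the UCB-style bound mean + sqrt(level)."""
    mean, level = 0.8, 0.05  # empirical mean in [0, 1] and exploration level
    kl_bound = klucbBern(mean, level, tolerance)  # sup{q : kl(mean, q) <= level}
    ucb_bound = mean + sqrt(level)  # SWUCB-style bound: its bonus equals sqrt(level) when alpha = c
    # The kl-UCB bound always stays below 1, while the UCB bound can exceed it.
    print("kl-UCB bound = {:.4f}, UCB bound = {:.4f}".format(kl_bound, ucb_bound))
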

class SWklUCBPlus(SWklUCB, SWUCBPlus):
    r""" An experimental policy, using only a sliding window (of :math:`\tau` *steps*, not counting draws of each arm) instead of using the full-size history, and using klUCB indexes (see :class:`Policy.klUCB`) instead of UCB indexes.

    - Uses :math:`\tau = 4 \sqrt{T \log(T)}` if the horizon :math:`T` is given, otherwise uses the default value.
    """

    def __str__(self):
        name = self.klucb.__name__[5:]
        if name == "Bern":
            name = ""
        if name != "":
            name = "({})".format(name)
        return r"SW-klUCB{}+($\tau={}$)".format(name, self.tau)
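
# --- Small usage sketch (not part of the original module).  It assumes the
# usual SMPyBandits interface inherited from IndexPolicy (``startGame``,
# ``choice``, ``getReward``); the toy reward stream with one abrupt change is
# made up for illustration only.

if __name__ == "__main__":
    import random
    random.seed(0)
    horizon = 5000
    means_before, means_after = [0.9, 0.1], [0.1, 0.9]  # abrupt change at mid-horizon
    for policy in (SWUCB(2, tau=200), SWUCBPlus(2, horizon=horizon), SWklUCB(2, tau=200)):
        policy.startGame()
        total_reward = 0.0
        for t in range(horizon):
            means = means_before if t < horizon // 2 else means_after
            arm = policy.choice()
            reward = 1.0 if random.random() < means[arm] else 0.0
            policy.getReward(arm, reward)
            total_reward += reward
        print("{}: total reward = {:.0f} over {} steps".format(policy, total_reward, horizon))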