Source code for Policies.SWHash_UCB

# -*- coding: utf-8 -*-
r""" The SW-UCB# policy for non-stationary bandits, from [["On Abruptly-Changing and Slowly-Varying Multiarmed Bandit Problems", by Lai Wei, Vaibhav Srivastava, 2018, arXiv:1802.08380]](https://arxiv.org/pdf/1802.08380)

- Instead of being restricted to UCB, it runs on top of a simple policy, e.g., :class:`UCB`, and :class:`SWHash_IndexPolicy` is a generic wrapper using any simple policy with this "sliding window" trick:

    >>> policy = SWHash_IndexPolicy(nbArms, UCB, alpha=0.5, lmbda=1)
    >>> # use policy as usual, with policy.startGame(), arm = policy.choice(), policy.getReward(arm, reward)

- It uses an additional, non-fixed :math:`\mathcal{O}(\tau(t,\alpha))` memory, and an extra time cost at every step, since the window length :math:`\tau(t,\alpha)` grows with :math:`t`.

.. warning:: This implementation is still experimental!
.. warning:: It can only work on top of a basic index policy based on empirical averages (and an exploration bias), like :class:`UCB`; it cannot work with a Bayesian policy (for which one would have to store all previous observations in order to rebuild the posterior from a smaller history)!
"""
from __future__ import division, print_function  # Python 2 compatibility

__author__ = "Lilian Besson"
__version__ = "0.9"


import numpy as np

try:
    from .BaseWrapperPolicy import BaseWrapperPolicy
    from .UCBalpha import UCBalpha as DefaultPolicy
except ImportError:
    from BaseWrapperPolicy import BaseWrapperPolicy
    from UCBalpha import UCBalpha as DefaultPolicy


# --- Parameter alpha

def alpha_for_abruptly_changing_env(nu=0.5):
    r"""For an abruptly-changing environment, if the number of break-points is :math:`\Upsilon_T = \mathcal{O}(T^{\nu})`, then the SW-UCB# algorithm chooses :math:`\alpha = \frac{1-\nu}{2}`."""
    return (1 - nu) / 2


def alpha_for_slowly_varying_env(kappa=1):
    r"""For a slowly-varying environment, if the change in mean reward between two time steps is bounded by :math:`\varepsilon_T = \mathcal{O}(T^{-\kappa})`, then the SW-UCB# algorithm chooses :math:`\alpha = \min\left(1, \frac{3\kappa}{4}\right)`."""
    return min(1, 3 * kappa / 4)
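
# Illustration (not part of the original module): for typical values of the
# non-stationarity parameters, the two helpers give
#   >>> alpha_for_abruptly_changing_env(nu=0.5)   # Upsilon_T = O(sqrt(T)) break-points
#   0.25
#   >>> alpha_for_slowly_varying_env(kappa=1)     # epsilon_T = O(1/T) drift
#   0.75
# and the resulting value can be passed as ``alpha=...`` to SWHash_IndexPolicy below.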

# --- Default parameters alpha and lambda

#: Default parameter for :math:`\alpha`.
ALPHA = 0.5

#: Default parameter for :math:`\lambda`.
LAMBDA = 1


def tau_t_alpha(t, alpha=ALPHA, lmbda=LAMBDA):
    r""" Compute the current window length, :math:`\tau(t,\alpha) = \min(\lceil \lambda t^{\alpha} \rceil, t)`."""
    return int(min(np.ceil(lmbda * (t ** alpha)), t))
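
# Quick sanity check (illustration only, not from the original module): with the
# default lambda = 1 and alpha = 0.5, the window length grows like ceil(sqrt(t)),
# capped by t itself for the very first steps:
#   >>> [tau_t_alpha(t) for t in (1, 2, 10, 100, 10000)]
#   [1, 2, 4, 10, 100]
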
# --- Class SWHash_IndexPolicy
class SWHash_IndexPolicy(BaseWrapperPolicy):
    r""" The SW-UCB# policy for non-stationary bandits, from [["On Abruptly-Changing and Slowly-Varying Multiarmed Bandit Problems", by Lai Wei, Vaibhav Srivastava, 2018, arXiv:1802.08380]](https://arxiv.org/pdf/1802.08380)."""

    def __init__(self, nbArms, policy=DefaultPolicy,
                 alpha=ALPHA, lmbda=LAMBDA,
                 lower=0., amplitude=1., *args, **kwargs):
        alpha = 1 + alpha  # The UCB indexes of SW-UCB# use 1 + alpha (see article for reference).
        super(SWHash_IndexPolicy, self).__init__(nbArms, policy=policy, lower=lower, amplitude=amplitude, *args, **kwargs)
        self.alpha = alpha  #: The parameter :math:`1 + \alpha` used by the SW-UCB# algorithm (see article for reference).
        self.lmbda = lmbda  #: The parameter :math:`\lambda` for the SW-UCB# algorithm (see article for reference).
        # Internal memory
        self.all_rewards = []  #: Keep in memory all the rewards obtained in all the past steps (the size of the window is evolving!).
        self.all_pulls = []  #: Keep in memory all the arms pulled in all the past steps (the size of the window is evolving!).

    def __str__(self):
        return r"SW-UCB#($\lambda={:.3g}$, $\alpha={:.3g}$)".format(self.lmbda, self.alpha - 1)

    @property
    def tau(self):
        r""" The current window length :math:`\tau(t,\alpha)`, computed with the article's :math:`\alpha`, i.e., ``self.alpha - 1``."""
        return tau_t_alpha(self.t, alpha=self.alpha - 1, lmbda=self.lmbda)

    def startGame(self, createNewPolicy=True):
        """ Initialize the policy for a new game."""
        super(SWHash_IndexPolicy, self).startGame(createNewPolicy=createNewPolicy)
        self.all_rewards = []
        self.all_pulls = []

    def getReward(self, arm, reward):
        """ Give a reward: increase t and the number of pulls, update the cumulated sums of rewards, and update the total and windowed history of all arms (rewards are normalized to [0, 1]).

        .. warning:: So far this is badly implemented and the algorithm is VERY slow: it has to store all the past, as the window length increases when t increases.
        """
        self.t += 1
        self.policy.t += 1
        # Get the reward and normalize it
        reward = (reward - self.lower) / self.amplitude
        self.pulls[arm] += 1
        self.rewards[arm] += reward
        # This arm was seen one more time at this time step
        self.all_pulls.append(arm)
        self.all_rewards.append(reward)
        # Compute the size of the current window
        current_tau = self.tau
        # print("For {} at time t = {}, current tau = {}, a reward = {} was seen from arm {}...".format(self, self.t, current_tau, reward, arm))  # DEBUG
        # It's highly inefficient, but it works
        partial_all_pulls = self.all_pulls[-current_tau:]
        partial_all_rewards = self.all_rewards[-current_tau:]
        # Recompute, from the window only, the number of pulls and the cumulated rewards of each arm,
        # and store them in place in the underlying policy (used for its empirical averages)
        for otherArm in range(self.nbArms):
            these_rewards = [partial_all_rewards[i] for i, p in enumerate(partial_all_pulls) if p == otherArm]
            self.policy.rewards[otherArm] = np.sum(these_rewards)
            self.policy.pulls[otherArm] = len(these_rewards)
        # print("  and self.pulls = {} and self.rewards = {}".format(self.pulls, self.rewards))  # DEBUG
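

# --- Minimal demo

if __name__ == '__main__':
    # A small smoke-test sketch, *not* part of the original module: it simply
    # follows the usage documented in the module docstring (startGame, choice,
    # getReward). The Bernoulli means and the horizon below are arbitrary
    # illustration values.
    import numpy.random as rn
    nbArms = 2
    means = [0.2, 0.8]
    horizon = 1000
    policy = SWHash_IndexPolicy(nbArms, policy=DefaultPolicy, alpha=ALPHA, lmbda=LAMBDA)
    policy.startGame()
    for _ in range(horizon):
        arm = policy.choice()
        reward = float(rn.random() < means[arm])  # Bernoulli(means[arm]) reward
        policy.getReward(arm, reward)
    print("After {} steps, the window length is tau = {}.".format(horizon, policy.tau))
    print("Number of pulls of each arm:", policy.pulls)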