Source code for Policies.RAWUCB

# -*- coding: utf-8 -*-
"""
author: Julien Seznec

Rotting Adaptive Window Upper Confidence Bounds for rotting bandits.

Reference: [Seznec et al., 2019b]
A single algorithm for both rested and restless rotting bandits (WIP)
Julien Seznec, Pierre Ménard, Alessandro Lazaric, Michal Valko
"""


from __future__ import division, print_function  # Python 2 compatibility

__author__ = "Julien Seznec"
__version__ = "0.1"

import numpy as np
np.seterr(divide='ignore')  # XXX dangerous in general, controlled here!

try:
    from .BasePolicy import BasePolicy
    from .IndexPolicy import IndexPolicy
    from .FEWA import FEWA, EFF_FEWA
    from .kullback import klucbBern
except ImportError:
    from BasePolicy import BasePolicy
    from IndexPolicy import IndexPolicy
    from FEWA import FEWA, EFF_FEWA
    from kullback import klucbBern



class EFF_RAWUCB(EFF_FEWA):
    r""" Efficient Rotting Adaptive Window Upper Confidence Bound (RAW-UCB) [Seznec et al., 2019b, WIP].

    Efficient trick described in [Seznec et al., 2019a, https://arxiv.org/abs/1811.11043] (m=2)
    and [Seznec et al., 2019b, WIP] (m<=2).
    We use the confidence level :math:`\delta_t = \frac{1}{t^\alpha}`.
    """

    def choice(self):
        # Play each arm once, then pull the arm with the largest minimum UCB over its windows.
        not_selected = np.where(self.pulls == 0)[0]
        if len(not_selected):
            return not_selected[0]
        self.ucb = self._compute_ucb()
        return np.nanmin(self.ucb, axis=1).argmax()

    def _compute_ucb(self):
        return (self.statistics[0, :, :] / self.windows
                + self.outlogconst * np.sqrt(np.log(self._inlog())))

    def _append_thresholds(self, w):
        # FEWA uses two confidence bounds; hence its outlogconst is twice as large as this one for RAW-UCB.
        return np.sqrt(2 * self.subgaussian ** 2 / w)

    def __str__(self):
        return r"EFF_RAW-UCB($\alpha={:.3g}, \, m={:.3g}$)".format(self.alpha, self.grid)
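

# Illustrative sketch (not part of the original module): assuming the default EFF_FEWA ``_inlog()``
# returns t**alpha, the index computed by ``_compute_ucb`` for the window of the last h pulls of an
# arm reduces to the windowed mean plus sqrt(2 * alpha * sigma**2 * log(t) / h); ``choice()`` then
# plays the arm maximizing the minimum of this index over the geometric grid of windows.
def _rawucb_index_sketch(window_sum, h, t, subgaussian=1.0, alpha=1.0):
    """Hypothetical helper: RAW-UCB index of one arm, for one window of size h, at round t."""
    windowed_mean = window_sum / h
    confidence_radius = np.sqrt(2 * alpha * subgaussian ** 2 * np.log(t) / h)
    return windowed_mean + confidence_radius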


class EFF_RAWklUCB(EFF_RAWUCB):
    """ Use a KL confidence bound instead of the closed-form (sub-Gaussian) approximation.

    Experimental work: much slower, because we compute many KL-UCB indices per arm at each round.
    """

    def __init__(self, nbArms, subgaussian=1, alpha=1, klucb=klucbBern, tol=1e-4, m=2):
        super(EFF_RAWklUCB, self).__init__(nbArms=nbArms, subgaussian=subgaussian, alpha=alpha, m=m)
        self.c = alpha
        self.klucb_vec = np.vectorize(klucb, excluded=['precision'])
        self.tolerance = tol

    def choice(self):
        not_selected = np.where(self.pulls == 0)[0]
        if len(not_selected):
            return not_selected[0]
        self.ucb = self.klucb_vec(self.statistics[0, :, :] / self.windows,
                                  self.c * np.log(self.t + 1) / self.windows,
                                  precision=self.tolerance)
        return np.argmax(np.nanmin(self.ucb, axis=1))

    def __str__(self):
        return r"EFF_RAW-klUCB($c={:.3g}, \, m={:.3g}$)".format(self.c, self.grid)
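

# Illustrative sketch (not part of the original module): for one arm and one window of size h,
# EFF_RAWklUCB replaces the closed-form bound by a KL-UCB index, the largest q such that
# h * kl(windowed_mean, q) <= c * log(t + 1); ``klucbBern`` solves this equation for Bernoulli rewards.
def _rawklucb_index_sketch(window_sum, h, t, c=1.0, tol=1e-4):
    """Hypothetical helper: KL-UCB index of one arm, for one window of size h, at round t."""
    windowed_mean = window_sum / h
    return klucbBern(windowed_mean, c * np.log(t + 1) / h, tol)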


class RAWUCB(EFF_RAWUCB):
    r""" Rotting Adaptive Window Upper Confidence Bound (RAW-UCB) [Seznec et al., 2019b, WIP].

    We use the confidence level :math:`\delta_t = \frac{1}{t^\alpha}`.
    """

    def __init__(self, nbArms, subgaussian=1, alpha=1):
        # m = 1 + 1e-15: the geometric grid of windows degenerates to (almost) every window size,
        # i.e. the non-efficient version of the algorithm.
        super(RAWUCB, self).__init__(nbArms=nbArms, subgaussian=subgaussian, alpha=alpha, m=1 + 1e-15)

    def __str__(self):
        return r"RAW-UCB($\alpha={:.3g}$)".format(self.alpha)


class EFF_RAWUCB_asymptotic(EFF_RAWUCB):
    r""" Efficient Rotting Adaptive Window Upper Confidence Bound (RAW-UCB) [Seznec et al., 2019b, WIP].

    We use the confidence level :math:`\delta_t = \frac{1}{t(1+\log(t)^\beta)}`.
    :math:`\beta=2` corresponds to the asymptotically optimal tuning of UCB for stationary bandits
    (Bandit Algorithms, Lattimore and Szepesvari, Chapter 7, https://tor-lattimore.com/downloads/book/book.pdf).
    """

    def __init__(self, nbArms, subgaussian=1, beta=2, m=2):
        super(EFF_RAWUCB_asymptotic, self).__init__(nbArms=nbArms, subgaussian=subgaussian, alpha=1, m=m)
        self.beta = beta

    def __str__(self):
        return r"EFF-RAW-UCB($\delta_t=\frac{{1}}{{t(1+\log(t)^{{{:.3g}}})}}$)".format(self.beta)

    def _inlog(self):
        return self.t * (1 + np.log(self.t) ** self.beta)
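

# Illustrative sketch (not part of the original module): with this tuning the exploration term used
# by ``_compute_ucb`` for a window of size h becomes sqrt(2 * sigma**2 * log(t * (1 + log(t)**beta)) / h),
# i.e. log(1 / delta_t) replaces alpha * log(t) inside the square root.
def _asymptotic_radius_sketch(h, t, subgaussian=1.0, beta=2.0):
    """Hypothetical helper: confidence radius under delta_t = 1 / (t * (1 + log(t)**beta))."""
    return np.sqrt(2 * subgaussian ** 2 * np.log(t * (1 + np.log(t) ** beta)) / h)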


if __name__ == "__main__":
    # Code for debugging purposes.
    HORIZON = 50000
    sigma = 1
    policy = EFF_RAWUCB(5, subgaussian=sigma, alpha=1., m=1.1)
    reward = {0: 0., 1: 0.2, 2: 0.4, 3: 0.6, 4: 0.8}
    for t in range(HORIZON):
        choice = policy.choice()
        policy.getReward(choice, reward[choice])
    print(policy.statistics[0, :, :])
    print(policy.statistics.shape)
    print(policy.windows)
    print(policy.pulls)
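
    # Illustrative extension (not part of the original module): the same debugging loop on a rested
    # rotting instance, where arm 0 decays with its own pull count while arm 1 stays constant; the
    # means and the decay rate 1e-4 are arbitrary choices for this sketch.
    rotting_policy = EFF_RAWUCB(2, subgaussian=sigma, alpha=1., m=1.1)
    for t in range(HORIZON):
        choice = rotting_policy.choice()
        if choice == 0:
            reward_value = max(0.0, 0.9 - 1e-4 * rotting_policy.pulls[0])
        else:
            reward_value = 0.5
        rotting_policy.getReward(choice, reward_value)
    print(rotting_policy.pulls)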