Source code for Policies.TsallisInf

# -*- coding: utf-8 -*-
""" The 1/2-Tsallis-Inf policy for bounded bandit, (order) optimal for stochastic and adversarial bandits.

- Reference: [["An Optimal Algorithm for Stochastic and Adversarial Bandits", Julian Zimmert, Yevgeny Seldin, 2018, arXiv:1807.07623]](https://arxiv.org/abs/1807.07623)
"""
from __future__ import division, print_function  # Python 2 compatibility

__author__ = "Lilian Besson"
__version__ = "0.9"

from math import sqrt
import numpy as np
import numpy.random as rn
import scipy.optimize as opt
try:
    from .Exp3 import Exp3
except ImportError:
    from Exp3 import Exp3

#: Default value for :math:`\alpha` the parameter of the Tsallis entropy.
#: We focus on the 1/2-Tsallis algorithm, ie, with :math:`\alpha=\frac{1}{2}`.
ALPHA = 0.5
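
# For intuition only (not part of the original module): a tiny sketch of the
# alpha-Tsallis regularizer \Psi_{t,\alpha}(w) = - \sum_k w_k^\alpha / (\alpha \eta_t)
# that drives the Online-Mirror-Descent step in TsallisInf.getReward below.
# The helper name is purely illustrative. Note that for alpha = 1/2 the exponent
# 1/(alpha - 1) equals -2, so the OMD weights are proportional to
# 1 / (eta_t * (L_hat_k - x))^2, where x is the Lagrange multiplier found numerically.
def _tsallis_regularizer_sketch(weights, eta_t, alpha=ALPHA):
    r"""Value of :math:`\Psi_{t,\alpha}(w) = - \sum_k w_k^{\alpha} / (\alpha \eta_t)` (illustration only)."""
    weights = np.asarray(weights, dtype=float)
    return - np.sum(weights ** alpha) / (alpha * eta_t)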


class TsallisInf(Exp3):
    """ The 1/2-Tsallis-Inf policy for bounded bandits, (order) optimal for stochastic and adversarial bandits.

    - Reference: [["An Optimal Algorithm for Stochastic and Adversarial Bandits", Julian Zimmert, Yevgeny Seldin, 2018, arXiv:1807.07623]](https://arxiv.org/abs/1807.07623)
    """

    def __init__(self, nbArms, alpha=ALPHA, lower=0., amplitude=1.):
        super(TsallisInf, self).__init__(nbArms, unbiased=True, lower=lower, amplitude=amplitude)
        self.alpha = alpha  #: Store the constant :math:`\alpha` used by the Online-Mirror-Descent step using the :math:`\alpha`-Tsallis entropy.
        self.inverse_exponent = 1.0 / (self.alpha - 1.0)  #: Store :math:`\frac{1}{\alpha-1}` to only compute it once.
        self.cumulative_losses = np.zeros(nbArms)  #: Keep in memory the vector :math:`\hat{L}_t` of cumulative (unbiased estimates of) losses.

    def __str__(self):
        return r"Tsallis-Inf($\alpha={:.3g}$)".format(self.alpha)

    @property
    def eta(self):
        r""" Decreasing learning rate, :math:`\eta_t = \frac{1}{\sqrt{t}}`."""
        return 1.0 / sqrt(max(1, self.t))

    @property
    def trusts(self):
        r""" Trusts probabilities :math:`\mathrm{trusts}(t+1)` are just the normalized weights :math:`w_k(t)`. """
        return self.weights

    def getReward(self, arm, reward):
        r""" Give a reward: accumulate the (unbiased estimate of the) loss on that arm k, then recompute the trusts.

        Compute the trusts probabilities :math:`w_k(t)` with one step of Online-Mirror-Descent for bandits, using the :math:`\alpha`-Tsallis entropy for the :math:`\Psi_t` functions:

        .. math::

           \mathrm{trusts}'_k(t+1) &= \nabla (\Psi_t + \mathcal{I}_{\Delta^K})^* (- \hat{L}_{t-1}), \\
           \mathrm{trusts}(t+1) &= \mathrm{trusts}'(t+1) / \sum_{k=1}^{K} \mathrm{trusts}'_k(t+1).

        - :math:`\Delta^K` is the probability simplex of dimension :math:`K`,
        - :math:`\hat{L}_{t-1}` is the cumulative loss vector, ie, the sum of the unbiased loss estimates :math:`\hat{\ell}_s` over the previous time steps :math:`s < t`,
        - :math:`\hat{\ell}_{t,i} = \mathbb{1}(I_t = i) \frac{\ell_{t,i}}{\mathrm{trusts}_i(t)}` is the unbiased estimate of the loss of arm :math:`i` at time :math:`t`,
        - :math:`\Psi_t = \Psi_{t,\alpha}(w) := - \sum_{k=1}^{K} \frac{w_k^{\alpha}}{\alpha \eta_t}` is the :math:`\alpha`-Tsallis entropy regularizer,
        - :math:`\eta_t = \frac{1}{\sqrt{t}}` is the (decreasing) learning rate.
        """
        super(TsallisInf, self).getReward(arm, reward)  # XXX Call to Exp3

        # Normalize the reward to [0, 1]
        reward = (reward - self.lower) / self.amplitude
        # For one reward in [0, 1], the loss is 1 - reward
        biased_loss = 1.0 - reward
        # Unbiased estimate, using the trusts (weights) of the previous step
        unbiased_loss = biased_loss / self.weights[arm]
        self.cumulative_losses[arm] += unbiased_loss

        eta_t = self.eta

        # 1. Solve f(x) = 1 to get an approximation of the (unique) Lagrange multiplier x
        def objective_function(x):
            return (np.sum((eta_t * (self.cumulative_losses - x)) ** self.inverse_exponent) - 1) ** 2

        result_of_minimization = opt.minimize_scalar(objective_function)
        # result_of_minimization = opt.minimize(objective_function, 0.0)  # XXX is it not faster?
        x = result_of_minimization.x

        # 2. Use x to compute the new weights
        new_weights = (eta_t * (self.cumulative_losses - x)) ** self.inverse_exponent
        # print("DEBUG: {} at time {} (seeing reward {} on arm {}), compute slack variable x = {},\n and new_weights = {}...".format(self, self.t, reward, arm, x, new_weights))  # DEBUG

        # XXX Handle weird cases: this slows everything down a bit, but is safer!
        if not np.all(np.isfinite(new_weights)):
            new_weights[~np.isfinite(new_weights)] = 0  # set non-finite values to 0
        # Bad case, where the sum is so small that it is only rounding errors,
        # or where all values were bad and forced to 0: restart from uniform weights
        if np.isclose(np.sum(new_weights), 0):
            new_weights[:] = 1.0

        # 3. Renormalize the weights at each step
        new_weights /= np.sum(new_weights)

        # 4. Store the weights
        self.weights = new_weights
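
# A minimal usage sketch (illustrative only, not part of the original module),
# assuming the usual SMPyBandits policy interface inherited from Exp3:
# startGame() to reset, choice() to pick an arm, getReward(arm, reward) to update.
# The horizon and Bernoulli means below are arbitrary example values.
if __name__ == "__main__":
    horizon = 1000
    means = [0.1, 0.5, 0.9]
    policy = TsallisInf(nbArms=len(means))
    policy.startGame()
    for _ in range(horizon):
        arm = policy.choice()
        reward = float(rn.random() < means[arm])  # simulated Bernoulli reward
        policy.getReward(arm, reward)
    print("Final trusts of", policy, ":", policy.trusts)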