Source code for Policies.MOSSExperimental

# -*- coding: utf-8 -*-
""" The MOSS-Experimental policy for bounded bandits, without knowing the horizon (and no doubling trick).
Reference: [Degenne & Perchet, 2016](http://proceedings.mlr.press/v48/degenne16.pdf).

.. warning:: Nothing was proved for this heuristic!
"""
from __future__ import division, print_function  # Python 2 compatibility

__author__ = "Lilian Besson"
__version__ = "0.9"

from numpy import sqrt, log
import numpy as np
np.seterr(divide='ignore')  # XXX dangerous in general, controlled here!

try:
    from .MOSS import MOSS
except ImportError:
    from MOSS import MOSS


[docs]class MOSSExperimental(MOSS): """ The MOSS-Experimental policy for bounded bandits, without knowing the horizon (and no doubling trick). Reference: [Degenne & Perchet, 2016](http://proceedings.mlr.press/v48/degenne16.pdf). """
[docs] def __str__(self): return "MOSS-Experimental"
[docs] def computeIndex(self, arm): r""" Compute the current index, at time t and after :math:`N_k(t)` pulls of arm k, if there is K arms: .. math:: I_k(t) &= \frac{X_k(t)}{N_k(t)} + \sqrt{ \max\left(0, \frac{\log\left(\frac{t}{\hat{H}(t)}\right)}{N_k(t)}\right)},\\ \text{where}\;\; \hat{H}(t) &:= \begin{cases} \sum\limits_{j=1, N_j(t) < \sqrt{t}}^{K} N_j(t) & \;\text{if it is}\; > 0,\\ K N_k(t) & \;\text{otherwise}\; \end{cases} .. note:: In the article, the authors do not explain this subtlety, and I don't see an argument to justify that at anytime, :math:`\hat{H}(t) > 0` ie to justify that there is always some arms :math:`j` such that :math:`0 < N_j(t) < \sqrt{t}`. """ if self.pulls[arm] < 1: return float('+inf') else: pulls_of_suboptimal_arms = np.sum(self.pulls[self.pulls < np.sqrt(self.t)]) if pulls_of_suboptimal_arms > 0: return (self.rewards[arm] / self.pulls[arm]) + np.sqrt(0.5 * max(0, np.log(self.t / pulls_of_suboptimal_arms)) / self.pulls[arm]) else: return (self.rewards[arm] / self.pulls[arm]) + np.sqrt(0.5 * max(0, np.log(self.t / (self.nbArms * self.pulls[arm]))) / self.pulls[arm])
[docs] def computeAllIndex(self): """ Compute the current indexes for all arms, in a vectorized manner.""" pulls_of_suboptimal_arms = np.sum(self.pulls[self.pulls < np.sqrt(self.t)]) if pulls_of_suboptimal_arms > 0: indexes = (self.rewards / self.pulls) + np.sqrt(0.5 * np.maximum(0, np.log(self.t / pulls_of_suboptimal_arms)) / self.pulls) else: indexes = (self.rewards / self.pulls) + np.sqrt(0.5 * np.maximum(0, np.log(self.t / (self.nbArms * self.pulls))) / self.pulls) indexes[self.pulls < 1] = float('+inf') self.index[:] = indexes