Source code for Policies.WrapRange

# -*- coding: utf-8 -*-
r""" A policy that acts as a wrapper on another policy `P`, which requires to know the range :math:`[a, b]` of the rewards, by implementing a "doubling trick" to adapt to an unknown range of rewards.

It's an interesting variant of the "doubling trick", used to tackle another unknown aspect of sequential experiments: some algorithms need to use rewards in :math:`[0,1]`, and are easy to use if the rewards are known to be in some interval :math:`[a, b]` (I did this from the very beginning here, with ``[lower, lower+amplitude]``).
But if the interval :math:`[a,b]` is unknown, what can we do?
The "Doubling Trick", in this setting, refers to this algorithm:

1. Start with :math:`[a_0, b_0] = [0, 1]`,
2. If a reward :math:`r_t` is seen below :math:`a_i`, use :math:`a_{i+1} = r_t`,
3. If a reward :math:`r_t` is seen above :math:`b_i`, use :math:`b_{i+1} = r_t`.

Instead of just doubling the length of the interval (the "doubling trick"), we use :math:`[r_t, b_i]` or :math:`[a_i, r_t]`, as it is the smallest interval compatible with the past observations and the new observation :math:`r_t`.
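
For instance, here is a small sketch of this adaptation (assuming the default ``UCB`` sub-policy; the informational messages printed by ``getReward`` are not shown):

.. code-block:: python

    policy = WrapRange(2, policy=UCB)  # starts with [lower, lower + amplitude] = [0, 1]
    policy.startGame()
    policy.getReward(0, 5.0)   # 5.0 > 1, so amplitude becomes 5.0: the range is now [0, 5]
    policy.getReward(0, -1.0)  # -1.0 < 0, so lower becomes -1.0: the range is now [-1, 4]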

- Reference.  I'm not sure which work is the first to have proposed this idea, but [[Normalized online learning, Stéphane Ross & Paul Mineiro & John Langford, 2013](https://arxiv.org/pdf/1305.6646.pdf)] proposes a similar idea.

.. seealso:: See for instance `Obandit.WrapRange <https://freuk.github.io/obandit/api.docdir/Obandit.WrapRange.html>`_ by `@freuk <https://github.com/freuk/>`_.
"""
from __future__ import division, print_function  # Python 2 compatibility

__author__ = "Lilian Besson"
__version__ = "0.9"


import numpy as np

try:
    from .BasePolicy import BasePolicy
    from .UCB import UCB
except ImportError:
    from BasePolicy import BasePolicy
    from UCB import UCB

default_rangeDependent_policy = UCB


# --- The interesting class

class WrapRange(BasePolicy):
    r""" A policy that acts as a wrapper on another policy `P` which needs to know the range :math:`[a, b]` of the rewards, by implementing a "doubling trick" to adapt to an unknown range of rewards. """

    def __init__(self, nbArms, policy=default_rangeDependent_policy, lower=0., amplitude=1., *args, **kwargs):
        super(WrapRange, self).__init__(nbArms, lower=lower, amplitude=amplitude)
        # --- Policy
        self._policy = policy  # Class to create the underlying policy
        self._args = args  # To keep them
        if 'params' in kwargs:
            kwargs.update(kwargs['params'])
            del kwargs['params']
        self._kwargs = kwargs  # To keep them
        self.policy = None  #: Underlying policy
        # --- Horizon
        self._i = 0
        # XXX Force it, just for pretty printing...
        self.startGame()

    # --- pretty printing

    def __str__(self):
        return r"WrapRange[{}]".format(self.policy)

    # --- Start game by creating new underlying policy

    def startGame(self):
        """ Initialize the policy for a new game."""
        super(WrapRange, self).startGame()
        self._i = 0  # reinitialize this
        self.policy = self._policy(self.nbArms, lower=self.lower, amplitude=self.amplitude, *self._args, **self._kwargs)
        # now also start game for the underlying policy
        self.policy.startGame()
        self.rewards = self.policy.rewards  # just pointers to the underlying arrays!
        self.pulls = self.policy.pulls  # just pointers to the underlying arrays!

    # --- Pass the call to the subpolicy
    def getReward(self, arm, reward):
        r""" Maybe change the current range and rescale all the past history, then pass the reward to the underlying policy, and update the time :math:`t`.

        Let us call :math:`r_s` the reward at time :math:`s`, :math:`l_{t-1}` and :math:`a_{t-1}` the lower-bound and amplitude of rewards at previous time :math:`t-1`, and :math:`l_t` and :math:`a_t` the new lower-bound and amplitude for current time :math:`t`. The previous history is :math:`R_t := \sum_{s=1}^{t-1} r_s`.

        The generic formula for rescaling the previous history is the following:

        .. math:: R_t := \frac{(a_{t-1} \times R_t + l_{t-1}) - l_t}{a_t}.

        So we have the following efficient algorithm:

        1. If :math:`r < l_{t-1}`, let :math:`l_t = r` and :math:`R_t := R_t + \frac{l_{t-1} - l_t}{a_t}`,
        2. Else if :math:`r > l_{t-1} + a_{t-1}`, let :math:`a_t = r - l_{t-1}` and :math:`R_t := R_t \times \frac{a_{t-1}}{a_t}`,
        3. Otherwise there is nothing to do: the current reward is still correctly in :math:`[l_{t-1}, l_{t-1} + a_{t-1}]`, so simply keep :math:`l_t = l_{t-1}` and :math:`a_t = a_{t-1}`.
        """
        # print("  - At time t = {}, got a reward = {:.3g} from arm {} ...".format(self.t, reward, arm))  # DEBUG
        self.t += 1
        # Check if the reward is still in [lower, lower + amplitude]
        if reward < self.lower or reward > self.lower + self.amplitude:
            self._i += 1  # just count how many changes of range we have seen
            old_l, old_a = self.lower, self.amplitude
            if reward < self.lower:
                print("Info: {} saw a reward {:.3g} below its lower value {:.3g}, now lower = {:.3g}... ({}th change)".format(self, reward, self.lower, reward, self._i))  # DEBUG
                self.lower = self.policy.lower = reward
                # Now we maybe have to rescale all the previous history!
                # 1st case: a_t = a_{t-1}, so a small additive shift is enough
                self.rewards += (old_l - self.lower) / self.amplitude
            else:
                print("Info: {} saw a reward {:.3g} above its higher value {:.3g}, now amplitude = {:.3g}... ({}th change)".format(self, reward, self.lower + self.amplitude, reward - self.lower, self._i))  # DEBUG
                self.amplitude = self.policy.amplitude = reward - self.lower
                # Now we maybe have to rescale all the previous history!
                # 2nd case: l_t = l_{t-1}, so a small multiplicative shift is enough
                # (the multiplicative factor is < 1, OK)
                self.rewards *= old_a / self.amplitude
        self.policy.getReward(arm, reward)
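
    # An illustrative trace of the rescaling above (the numbers are assumptions, for illustration only):
    # with l_{t-1} = 0, a_{t-1} = 1 and a stored normalized history R_t = 0.8, a new reward r = 5
    # triggers case 2: a_t = 5 - 0 = 5 and R_t is rescaled to 0.8 * (1 / 5) = 0.16,
    # before r itself is normalized and passed to the underlying policy.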

    # --- Sub methods

    # This decorator @property makes this method an attribute, cf. https://docs.python.org/3/library/functions.html#property
    @property
    def index(self):
        r""" Get attribute ``index`` from the underlying policy."""
        return self.policy.index

    def choice(self):
        r""" Pass the call to ``choice`` of the underlying policy."""
        return self.policy.choice()

    def choiceWithRank(self, rank=1):
        r""" Pass the call to ``choiceWithRank`` of the underlying policy."""
        return self.policy.choiceWithRank(rank=rank)

    def choiceFromSubSet(self, availableArms='all'):
        r""" Pass the call to ``choiceFromSubSet`` of the underlying policy."""
        return self.policy.choiceFromSubSet(availableArms=availableArms)

    def choiceMultiple(self, nb=1):
        r""" Pass the call to ``choiceMultiple`` of the underlying policy."""
        return self.policy.choiceMultiple(nb=nb)

    def choiceIMP(self, nb=1, startWithChoiceMultiple=True):
        r""" Pass the call to ``choiceIMP`` of the underlying policy."""
        return self.policy.choiceIMP(nb=nb, startWithChoiceMultiple=startWithChoiceMultiple)

    def estimatedOrder(self):
        r""" Pass the call to ``estimatedOrder`` of the underlying policy."""
        return self.policy.estimatedOrder()

    def estimatedBestArms(self, M=1):
        r""" Pass the call to ``estimatedBestArms`` of the underlying policy."""
        return self.policy.estimatedBestArms(M=M)

    def computeIndex(self, arm):
        r""" Pass the call to ``computeIndex`` of the underlying policy."""
        return self.policy.computeIndex(arm)

    def computeAllIndex(self):
        r""" Pass the call to ``computeAllIndex`` of the underlying policy."""
        return self.policy.computeAllIndex()
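

# --- Illustrative sketch: a quick demo of the wrapper adapting its range
# (the arm means below are arbitrary assumptions, deliberately outside [0, 1]).
if __name__ == "__main__":
    np.random.seed(0)  # for reproducibility of this sketch
    means = [1.0, 5.0, 10.0]  # hypothetical Gaussian arm means
    policy = WrapRange(len(means), policy=UCB)
    policy.startGame()
    for _ in range(1000):
        arm = policy.choice()                      # delegated to the wrapped UCB
        reward = np.random.normal(means[arm], 1.)  # reward from an unknown range
        policy.getReward(arm, reward)              # maybe enlarges [lower, lower + amplitude]
    print("Final range: [{:.3g}, {:.3g}]".format(policy.lower, policy.lower + policy.amplitude))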