# Source code for Policies.WrapRange

```
# -*- coding: utf-8 -*-
r""" A policy that acts as a wrapper on another policy `P`, which requires to know the range :math:`[a, b]` of the rewards, by implementing a "doubling trick" to adapt to an unknown range of rewards.
It's an interesting variant of the "doubling trick", used to tackle another unknown aspect of sequential experiments: some algorithms need to use rewards in :math:`[0,1]`, and are easy to use if the rewards known to be in some interval :math:`[a, b]` (I did this from the very beginning here, with ``[lower, lower+amplitude]``).
But if the interval :math:`[a,b]` is unknown, what can we do?
The "Doubling Trick", in this setting, refers to this algorithm:
1. Start with :math:`[a_0, b_0] = [0, 1]`,
2. If a reward :math:`r_t` is seen below :math:`a_i`, use :math:`a_{i+1} = r_t`,
3. If a reward :math:`r_t` is seen above :math:`b_i`, use :math:`b_{i+1} = r_t - a_i`.
Instead of just doubling the length of the interval ("doubling trick"), we use :math:`[r_t, b_i]` or :math:`[a_i, r_t]` as it is the smallest interval compatible with the past and the new observation :math:`r_t`
- Reference. I'm not sure which work is the first to have proposed this idea, but [[Normalized online learning, Stéphane Ross & Paul Mineiro & John Langford, 2013](https://arxiv.org/pdf/1305.6646.pdf)] proposes a similar idea.
.. seealso:: See for instance `Obandit.WrapRange <https://freuk.github.io/obandit/api.docdir/Obandit.WrapRange.html>`_ by `@freuk <https://github.com/freuk/>`_.
"""
from __future__ import division, print_function # Python 2 compatibility
__author__ = "Lilian Besson"
__version__ = "0.9"
import numpy as np
try:
from .BasePolicy import BasePolicy
from .UCB import UCB
except ImportError:
from BasePolicy import BasePolicy
from UCB import UCB
default_rangeDependent_policy = UCB
# --- The interesting class
[docs]class WrapRange(BasePolicy):
r""" A policy that acts as a wrapper on another policy `P`, which requires to know the range :math:`[a, b]` of the rewards, by implementing a "doubling trick" to adapt to an unknown range of rewards.
"""
[docs] def __init__(self, nbArms,
policy=default_rangeDependent_policy,
lower=0., amplitude=1.,
*args, **kwargs):
super(WrapRange, self).__init__(nbArms, lower=lower, amplitude=amplitude)
# --- Policy
self._policy = policy # Class to create the underlying policy
self._args = args # To keep them
if 'params' in kwargs:
kwargs.update(kwargs['params'])
del kwargs['params']
self._kwargs = kwargs # To keep them
self.policy = None #: Underlying policy
# --- Horizon
self._i = 0
# XXX Force it, just for pretty printing...
self.startGame()
# --- pretty printing
# --- Start game by creating new underlying policy
[docs] def startGame(self):
""" Initialize the policy for a new game."""
super(WrapRange, self).startGame()
self._i = 0 # reinitialize this
self.policy = self._policy(self.nbArms, lower=self.lower, amplitude=self.amplitude, *self._args, **self._kwargs)
# now also start game for the underlying policy
self.policy.startGame()
self.rewards = self.policy.rewards # just pointers to the underlying arrays!
self.pulls = self.policy.pulls # just pointers to the underlying arrays!
# --- Pass the call to the subpolicy
[docs] def getReward(self, arm, reward):
r""" Maybe change the current range and rescale all the past history, and then pass the reward, and update t.
Let call :math:`r_s` the reward at time :math:`s`, :math:`l_{t-1}` and :math:`a_{t-1}` the lower-bound and amplitude of rewards at previous time :math:`t-1`, and :math:`l_t` and :math:`a_t` the new lower-bound and amplitude for current time :math:`t`.
The previous history is :math:`R_t := \sum_{s=1}^{t-1} r_s`.
The generic formula for rescaling the previous history is the following:
.. math:: R_t := \frac{(a_{t-1} \times R_t + l_{t-1}) - l_t}{a_t}.
So we have the following efficient algorithm:
1. If :math:`r < l_{t-1}`, let :math:`l_t = r` and :math:`R_t := R_t + \frac{l_{t-1} - l_t}{a_t}`,
2. Else if :math:`r > l_{t-1} + a_{t-1}`, let :math:`a_t = r - l_{t-1}` and :math:`R_t := R_t \times \frac{a_{t-1}}{a-t}`,
3. Otherwise, nothing to do, the current reward is still correctly in :math:`[l_{t-1}, l_{t-1} + a_{t-1}]`, so simply keep :math:`l_t = l_{t-1}` and :math:`a_t = a_{t-1}`.
"""
# print(" - At time t = {}, got a reward = {:.3g} from arm {} ...".format(self.t, reward, arm)) # DEBUG
self.t += 1
# check if the reward is still in [lower, lower + amplitude]
if reward < self.lower or reward > self.lower + self.amplitude:
self._i += 1 # just count how many change in range we seen
old_l, old_a = self.lower, self.amplitude
if reward < self.lower:
print("Info: {} saw a reward {:.3g} below its lower value {:.3g}, now lower = {:.3g}... ({}th change)".format(self, reward, self.lower, reward, self._i)) # DEBUG
self.lower = self.policy.lower = reward
# Now we maybe have to rescale all the previous history!
# 1st case: At+1=At, so just a small linear shift is enough
self.rewards += (old_l - self.lower) / self.amplitude
else:
print("Info: {} saw a reward {:.3g} above its higher value {:.3g}, now amplitude = {:.3g}... ({}th change)".format(self, reward, self.lower + self.amplitude, reward - self.lower, self._i)) # DEBUG
self.amplitude = self.policy.amplitude = reward - self.lower
# Now we maybe have to rescale all the previous history!
# 2nd case: Lt+1=Lt, so just a small multiplicative shift is enough
# the shift is < 1, OK
self.rewards *= old_a / self.amplitude
self.policy.getReward(arm, reward)
# --- Sub methods
# This decorator @property makes this method an attribute, cf. https://docs.python.org/3/library/functions.html#property
@property
def index(self):
r""" Get attribute ``index`` from the underlying policy."""
return self.policy.index
[docs] def choice(self):
r""" Pass the call to ``choice`` of the underlying policy."""
return self.policy.choice()
[docs] def choiceWithRank(self, rank=1):
r""" Pass the call to ``choiceWithRank`` of the underlying policy."""
return self.policy.choiceWithRank(rank=rank)
[docs] def choiceFromSubSet(self, availableArms='all'):
r""" Pass the call to ``choiceFromSubSet`` of the underlying policy."""
return self.policy.choiceFromSubSet(availableArms=availableArms)
[docs] def choiceMultiple(self, nb=1):
r""" Pass the call to ``choiceMultiple`` of the underlying policy."""
return self.policy.choiceMultiple(nb=nb)
[docs] def choiceIMP(self, nb=1, startWithChoiceMultiple=True):
r""" Pass the call to ``choiceIMP`` of the underlying policy."""
return self.policy.choiceIMP(nb=nb, startWithChoiceMultiple=startWithChoiceMultiple)
[docs] def estimatedOrder(self):
r""" Pass the call to ``estimatedOrder`` of the underlying policy."""
return self.policy.estimatedOrder()
[docs] def estimatedBestArms(self, M=1):
r""" Pass the call to ``estimatedBestArms`` of the underlying policy."""
return self.policy.estimatedBestArms(M=M)
[docs] def computeIndex(self, arm):
r""" Pass the call to ``computeIndex`` of the underlying policy."""
return self.policy.computeIndex(arm)
[docs] def computeAllIndex(self):
r""" Pass the call to ``computeAllIndex`` of the underlying policy."""
return self.policy.computeAllIndex()
```