Source code for Policies.DiscountedBayesianIndexPolicy

# -*- coding: utf-8 -*-
""" Discounted Bayesian index policy.

- By default, it uses a DiscountedBeta posterior (:class:`Policies.Posterior.DiscountedBeta`), one per arm.
- Use discount factor :math:`\gamma\in(0,1)`.

.. warning:: This is still highly experimental!
"""
from __future__ import division, print_function  # Python 2 compatibility

__author__ = "Lilian Besson"
__version__ = "0.9"

try:
    from .BayesianIndexPolicy import BayesianIndexPolicy
    from .Posterior import DiscountedBeta
except ImportError:
    from BayesianIndexPolicy import BayesianIndexPolicy
    from Posterior import DiscountedBeta


# --- Constants

#: Default value for the discount factor :math:`\gamma\in(0,1)`.
#: ``0.95`` is empirically a reasonable value for short-term non-stationary experiments.
GAMMA = 0.95


# --- Class

class DiscountedBayesianIndexPolicy(BayesianIndexPolicy):
    r""" Discounted Bayesian index policy.

    - By default, it uses a DiscountedBeta posterior (:class:`Policies.Posterior.DiscountedBeta`), one per arm.
    - Use discount factor :math:`\gamma\in(0,1)`.

    - It keeps :math:`\widetilde{S_k}(t)` and :math:`\widetilde{F_k}(t)`, the discounted counts of successes and failures (S and F), for each arm k.

    - Instead of keeping the plain counts :math:`\widetilde{S_k}(t) = S_k(t)` and :math:`\widetilde{F_k}(t) = F_k(t)`, they are updated at each time step using the discount factor :math:`\gamma`:

    .. math::

        \widetilde{S_{A(t)}}(t+1) &= \gamma \widetilde{S_{A(t)}}(t) + r(t),\\
        \widetilde{S_{k'}}(t+1) &= \gamma \widetilde{S_{k'}}(t), \forall k' \neq A(t).

    .. math::

        \widetilde{F_{A(t)}}(t+1) &= \gamma \widetilde{F_{A(t)}}(t) + (1 - r(t)),\\
        \widetilde{F_{k'}}(t+1) &= \gamma \widetilde{F_{k'}}(t), \forall k' \neq A(t).
    """
    def __init__(self, nbArms,
            gamma=GAMMA, posterior=DiscountedBeta,
            lower=0., amplitude=1.,
            *args, **kwargs
        ):
        """ Create a new Bayesian policy, by creating a default posterior on each arm."""
        super(DiscountedBayesianIndexPolicy, self).__init__(nbArms, posterior=posterior, lower=lower, amplitude=amplitude, gamma=gamma)
        assert 0 < gamma <= 1, "Error: for a DiscountedBayesianIndexPolicy policy, the discount factor has to be in (0, 1], but it was {}.".format(gamma)  # DEBUG
        if gamma == 1:
            print("Warning: gamma = 1 means no discounting, just use a regular Beta posterior!")  # DEBUG
        self.gamma = gamma  #: Discount factor :math:`\gamma\in(0,1)`.
    def __str__(self):
        """ -> str"""
        return r"{}($\gamma={:.5g}${})".format(self.__class__.__name__, self.gamma, self._posterior_name if self._posterior_name != "DiscountedBeta" else "")
    def getReward(self, arm, reward):
        """ Update the posterior of the pulled arm with the normalized reward, and discount the posteriors of all the other arms."""
        self.posterior[arm].update((reward - self.lower) / self.amplitude)
        # The other arms received no observation at this step: only discount their posteriors.
        for otherArm in range(self.nbArms):
            if otherArm != arm:
                self.posterior[otherArm].discount()
        self.t += 1
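

# --- Example

# A minimal sketch (not part of the module API) illustrating the discounted
# updates from the class docstring, using plain floats instead of the
# DiscountedBeta posterior. The arm means below are hypothetical values,
# chosen only for this illustration.
if __name__ == "__main__":
    import random
    nbArms, gamma, horizon = 3, GAMMA, 10
    S = [0.0] * nbArms  # discounted successes, \widetilde{S_k}(t)
    F = [0.0] * nbArms  # discounted failures,  \widetilde{F_k}(t)
    means = [0.2, 0.5, 0.8]  # hypothetical Bernoulli arms
    for t in range(horizon):
        arm = random.randrange(nbArms)  # any choice rule would do for this sketch
        r = 1.0 if random.random() < means[arm] else 0.0
        for k in range(nbArms):
            S[k] *= gamma  # every arm is discounted...
            F[k] *= gamma
        S[arm] += r        # ...and only the pulled arm adds its reward
        F[arm] += 1.0 - r  # (and its failure, 1 - r)
    print("Discounted successes:", S)
    print("Discounted failures: ", F)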