Source code for Policies.Experimentals.KLempUCB

# -*- coding: utf-8 -*-
""" The Empirical KL-UCB algorithm non-parametric policy.
References: [Maillard, Munos & Stoltz - COLT, 2011], [Cappé, Garivier, Maillard, Munos & Stoltz, 2012].
"""
from __future__ import division, print_function  # Python 2 compatibility

__author__ = "Olivier Cappé, Aurélien Garivier, Lilian Besson"
__version__ = "0.1"

# WARNING: this is a HUGE hack to fix a mystery bug on importing this policy
from sys import path
from os.path import dirname
path.insert(0, '/'.join(dirname(__file__).split('/')[:-1]))
import numpy as np

try:
    from .kullback import maxEV   # XXX Not detected as in the kullback.py file?
    from .IndexPolicy import IndexPolicy
except ImportError:
    from kullback import maxEV   # XXX Not detected as in the kullback.py file?
    from IndexPolicy import IndexPolicy


class KLempUCB(IndexPolicy):
    """ The Empirical KL-UCB algorithm non-parametric policy.

    References: [Maillard, Munos & Stoltz - COLT, 2011], [Cappé, Garivier, Maillard, Munos & Stoltz, 2012].
    """

    def __init__(self, nbArms, maxReward=1., lower=0., amplitude=1.):
        super(KLempUCB, self).__init__(nbArms, lower=lower, amplitude=amplitude)
        self.c = 1  #: Parameter c, multiplying the log(t) / N_k(t) exploration budget
        self.maxReward = maxReward  #: Known upper bound on the rewards
        self.pulls = np.zeros(self.nbArms, dtype=int)  #: Keep track of the number of pulls of each arm
        #: UNBOUNDED dictionary for each arm: keep track of how many observations of each reward value were seen.
        #: Warning: KLempUCB works better for *discrete* reward distributions!
        self.obs = [dict() for _ in range(self.nbArms)]  # one independent dict per arm (not [dict()] * nbArms, which would alias a single dict)

    def startGame(self):
        """ Initialize the policy for a new game."""
        self.t = 0
        self.pulls.fill(0)
        for arm in range(self.nbArms):
            self.obs[arm] = {self.maxReward: 0}

    def computeIndex(self, arm):
        r""" Compute the current index, at time t and after :math:`N_k(t)` pulls of arm k:

        .. math:: I_k(t) = \max\left\{ \mathbb{E}_q[V] : \mathrm{KL}\big(\hat{p}_k(t), q\big) \leq \frac{c \log(t)}{N_k(t)} \right\},

        where :math:`\hat{p}_k(t)` is the empirical distribution of the rewards observed from arm k.
        """
        if self.pulls[arm] < 1:
            return float('+infinity')
        else:
            return self._KLucb(self.obs[arm], self.c * np.log(self.t) / self.pulls[arm])

    def getReward(self, arm, reward):
        """ Give a reward: increase t, pulls, and update count of observations for that arm."""
        self.t += 1
        self.pulls[arm] += 1
        self.obs[arm][reward] = 1 + self.obs[arm].get(reward, 0)

    # FIXME this does not work apparently...
    @staticmethod
    def _KLucb(obs, klMax, debug=False):
        """ Optimization method: maximize the expected reward over distributions at KL-divergence at most ``klMax`` from the empirical distribution of ``obs`` (delegated to :func:`maxEV`)."""
        p = np.array(list(obs.values()), dtype=float)
        p /= np.sum(p)
        v = np.array(list(obs.keys()), dtype=float)
        if debug:
            print("Calling maxEV(", p, ", ", v, ", ", klMax, ") ...")
        q = maxEV(p, v, klMax)
        # if debug:
        #     q2 = kbp.maxEV(p, v, klMax)
        #     if max(abs(q - q2)) > 1e-8:
        #         print("ERROR: for p =", p, ", v =", v, "and klMax =", klMax, ":")
        #         print("q  =", q)
        #         print("q2 =", q2)
        #         print("_____________________________")
        # print("q =", q)
        return np.dot(q, v)
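

# --- Hedged usage sketch (illustration only, not part of the original module) ---
# A minimal example of how this policy could be driven by hand, assuming the
# imports above resolve and that IndexPolicy provides the usual choice() method
# (argmax of computeIndex with random tie-breaking). The observation counts,
# reward supports, probabilities and horizon below are purely illustrative
# assumptions; note also the FIXME above, warning that _KLucb may not behave
# reliably in every configuration.
if __name__ == '__main__':
    # Direct call to the static optimization method on hypothetical counts:
    # the optimistic mean should not fall below the plain empirical mean.
    obs = {1.0: 0, 0.0: 3, 0.5: 2}   # hypothetical observation counts for one arm
    empirical_mean = sum(r * n for r, n in obs.items()) / sum(obs.values())
    print("empirical mean =", empirical_mean, ", optimistic mean =", KLempUCB._KLucb(obs, 0.2))

    # Small hand-made simulation on 3 arms with *discrete* rewards in {0, 0.5, 1},
    # the setting where KLempUCB is expected to work best.
    nbArms = 3
    policy = KLempUCB(nbArms, maxReward=1.)
    policy.startGame()
    support = np.array([0., 0.5, 1.])
    probas = [np.array([0.6, 0.3, 0.1]),
              np.array([0.3, 0.4, 0.3]),
              np.array([0.1, 0.3, 0.6])]
    horizon = 200
    for _ in range(horizon):
        arm = policy.choice()  # inherited from IndexPolicy: pick an arm with maximal index
        reward = float(np.random.choice(support, p=probas[arm]))
        policy.getReward(arm, reward)
    print("Number of pulls per arm after", horizon, "steps:", policy.pulls)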