Source code for Policies.MEGA

# -*- coding: utf-8 -*-
""" MEGA: implementation of the single-player policy from [Concurrent bandits and cognitive radio network, O.Avner & S.Mannor, 2014](https://arxiv.org/abs/1404.5421).

The Multi-user epsilon-Greedy collision Avoiding (MEGA) algorithm is based on the epsilon-greedy algorithm introduced in [2], augmented by a collision avoidance mechanism that is inspired by the classical ALOHA protocol.

- [2]: Finite-time analysis of the multi-armed bandit problem, P.Auer & N.Cesa-Bianchi & P.Fischer, 2002
"""
from __future__ import division, print_function  # Python 2 compatibility

__author__ = "Lilian Besson"
__version__ = "0.9"

from random import random
import numpy as np
import numpy.random as rn

try:
    from .BasePolicy import BasePolicy
    from .with_proba import with_proba
except ImportError:
    from BasePolicy import BasePolicy
    from with_proba import with_proba


# --- Class MEGA

class MEGA(BasePolicy):
    """ MEGA: implementation of the single-player policy from [Concurrent bandits and cognitive radio networks, O.Avner & S.Mannor, 2014](https://arxiv.org/abs/1404.5421). """
    def __init__(self, nbArms, p0=0.5, alpha=0.5, beta=0.5, c=0.1, d=0.01,
                 lower=0., amplitude=1.):  # Named arguments, so they can be given in any order
        """
        - nbArms: number of arms,
        - p0: initial probability p(0); p(t) is the probability of persistence on the chosenArm at time t,
        - alpha: scaling in the update p(t+1) <- alpha * p(t) + (1 - alpha),
        - beta: exponent used to define the interval [t, t + t^beta], from which a random time t_next(k) is sampled, until when the chosenArm is unavailable,
        - c, d: used to compute the exploration probability epsilon_t, cf. the function :func:`_epsilon_t`.

        Example:

        >>> nbArms, p0, alpha, beta, c, d = 17, 0.5, 0.5, 0.5, 0.1, 0.01
        >>> player1 = MEGA(nbArms, p0, alpha, beta, c, d)

        For multi-players use:

        >>> configuration["players"] = Selfish(NB_PLAYERS, MEGA, nbArms, p0, alpha, beta, c, d).children
        """
        # Store parameters
        super(MEGA, self).__init__(nbArms, lower=lower, amplitude=amplitude)
        self.c = c  #: Parameter c  # FIXME we should not require two parameters, as they are just used in the formula for epsilon_t
        self.d = d  #: Parameter d
        assert 0 <= p0 <= 1, "Error: parameter 'p0' for a MEGA player should be in [0, 1]."  # DEBUG
        self.p0 = p0  #: Parameter p0, should not be modified
        self.p = p0  #: Parameter p, can be modified
        assert 0 < alpha <= 1, "Error: parameter 'alpha' for a MEGA player should be in (0, 1]."  # DEBUG
        self.alpha = alpha  #: Parameter alpha
        assert 0 < beta <= 1, "Error: parameter 'beta' for a MEGA player should be in (0, 1]."  # DEBUG
        self.beta = beta  #: Parameter beta
        # Internal memory
        self.chosenArm = None  #: Last chosen arm
        self.tnext = np.ones(nbArms, dtype=int)  #: Absolute time until when each arm is unavailable
        self.meanRewards = np.zeros(nbArms)  #: Mean rewards
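    # Note: the persistence update p <- alpha * p + (1 - alpha), done in getReward() below, is a
    # geometric interpolation toward 1. For instance with p0 = 0.5 and alpha = 0.5, successive
    # rewarded pulls give p = 0.5, 0.75, 0.875, 0.9375, ..., i.e. p = 1 - 0.5^(n+1) after n rewards,
    # so an arm that keeps paying off is kept upon collision with probability approaching 1.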
    def __str__(self):
        return r"MEGA($c={:.3g}$, $d={:.3g}$, $p_0={:.3g}$, $\alpha={:.3g}$, $\beta={:.3g}$)".format(self.c, self.d, self.p0, self.alpha, self.beta)
    def startGame(self):
        """ Just reinitialize all the internal memory."""
        super(MEGA, self).startGame()
        self.p = self.p0
        self.chosenArm = rn.randint(self.nbArms)  # Start on a random arm
        self.tnext.fill(1)
        self.meanRewards.fill(float('-inf'))  # -inf mean for arms not pulled yet, so they are never selected by the exploitation step
    def choice(self):
        """ Choose an arm, as described by the MEGA algorithm."""
        self.t += 1
        if self.chosenArm is not None:  # We can still exploit that arm
            return self.chosenArm
        else:  # We have to choose a new arm
            # Identify available arms
            availableArms = np.nonzero(self.tnext <= self.t)[0]
            if len(availableArms) == 0:
                print("Error: MEGA.choice() should 'Refrain from transmitting in this round' but my model does not allow this - YET ... Choosing a random arm.")  # DEBUG
                self.chosenArm = rn.randint(self.nbArms)  # XXX Choose a random arm
                # raise ValueError("FIXME MEGA.choice() should 'Refrain from transmitting in this round' but my model does not allow this - YET")
            else:  # There are some available arms
                epsilon = self._epsilon_t()
                if with_proba(epsilon):  # With proba epsilon_t
                    newArm = rn.choice(availableArms)  # Explore valid arms
                    if self.chosenArm != newArm:
                        self.p = self.p0  # Reinitialize proba p
                else:  # Exploit: select the arm with highest meanRewards
                    self.meanRewards[self.pulls != 0] = self.rewards[self.pulls != 0] / self.pulls[self.pulls != 0]
                    # newArm = np.argmax(self.meanRewards)  # Take the first arm if more than one has the highest mean, but that's unlikely
                    newArm = np.random.choice(np.nonzero(self.meanRewards == np.max(self.meanRewards))[0])
                self.chosenArm = newArm
            return self.chosenArm
    def getReward(self, arm, reward):
        """ Receive a reward on arm of index 'arm', as described by the MEGA algorithm.

        - If there is no collision, receive a reward after pulling the arm.
        """
        assert self.chosenArm == arm, "Error: a MEGA player can only get a reward on her chosenArm. Here, arm = {} != chosenArm = {} ...".format(arm, self.chosenArm)  # DEBUG
        # print("- A MEGA player receives reward = {} on arm {}, at time t = {} ...".format(reward, arm, self.t))  # DEBUG
        self.rewards[arm] += (reward - self.lower) / self.amplitude
        self.pulls[arm] += 1
        self.p = self.p * self.alpha + (1 - self.alpha)  # Update proba p
    def handleCollision(self, arm, reward=None):
        """ Handle a collision, on arm of index 'arm'.

        - Warning: this method has to be implemented in the collision model, it is NOT implemented in the EvaluatorMultiPlayers.

        .. note:: We do not care on which arm the collision occurred.
        """
        assert self.chosenArm == arm, "Error: a MEGA player can only see a collision on her chosenArm. Here, arm = {} != chosenArm = {} ...".format(arm, self.chosenArm)  # DEBUG
        # print("- A MEGA player saw a collision on arm {}, at time t = {} ...".format(arm, self.t))  # DEBUG
        # 1. With proba p, persist on the chosenArm (nothing to do in that case)
        # 2. With proba 1 - p, give up
        if with_proba(1 - self.p):
            # Random time offset until when this arm self.chosenArm is not sampled
            delta_tnext_k = rn.randint(low=0, high=1 + int(self.t ** self.beta))
            self.tnext[self.chosenArm] = self.t + delta_tnext_k
            # Reinitialize the proba p
            self.p = self.p0
            # We give up this arm
            self.chosenArm = None
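    # Note: the back-off window used in handleCollision() grows sublinearly with time, ALOHA-style:
    # the arm is frozen for a delay drawn uniformly from {0, ..., floor(t^beta)}. For instance at
    # t = 100 with beta = 0.5, delta_tnext_k is drawn from {0, ..., 10}, so the arm stays
    # unavailable for at most 10 more rounds.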
    # --- Internal methods
    def _epsilon_t(self):
        """ Compute the value of the decreasing exploration probability epsilon(t), cf. Algorithm 1 in [Avner & Mannor, 2014](https://arxiv.org/abs/1404.5421)."""
        epsilon = min(1, (self.c * self.nbArms ** 2) / (self.d ** 2 * (self.nbArms - 1) * self.t))
        # assert 0 <= epsilon <= 1, "Error, _epsilon_t() computed an epsilon = {} which is NOT in [0, 1] ...".format(epsilon)  # DEBUG
        return epsilon
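

# --- Minimal demo

if __name__ == "__main__":
    # A minimal, self-contained sketch of how two MEGA players can be simulated on a toy
    # Bernoulli problem, with the simplest collision model: if both players pick the same arm,
    # nobody gets a reward and both are told about the collision. The arm means and horizon
    # below are arbitrary illustrative choices, not values from the paper; the real
    # multi-player experiments use the EvaluatorMultiPlayers class instead.
    means = [0.1, 0.5, 0.9]  # Hypothetical Bernoulli arm means
    horizon = 1000
    players = [MEGA(nbArms=len(means)) for _ in range(2)]
    for player in players:
        player.startGame()
    for _ in range(horizon):
        choices = [player.choice() for player in players]
        if choices[0] == choices[1]:  # Collision: no reward for anyone
            for player, arm in zip(players, choices):
                player.handleCollision(arm)
        else:  # No collision: each player gets a Bernoulli reward on her arm
            for player, arm in zip(players, choices):
                player.getReward(arm, float(random() < means[arm]))
    print("Pulls of player 1: {}, of player 2: {}".format(players[0].pulls, players[1].pulls))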