# -*- coding: utf-8 -*-
r""" RandTopMEstEst: four proposals for an efficient multi-players learning policy. :class:`RandTopMEstEst` and :class:`MCTopMEstEst` are the two main algorithms, with variants (see below).
- Each child player is selfish, and plays according to an index policy (any index policy, e.g., UCB, Thompson, KL-UCB, BayesUCB etc),
- But instead of aiming at the best (the 1-st best) arm, player i constantly aims at *one* of the M best arms (denoted :math:`\hat{M}^j(t)`, according to its index policy of indexes :math:`g^j_k(t)` (where M is the number of players),
- When a collision occurs or when the currently chosen arm lies outside of the current estimate of the set M-best, a new current arm is chosen.
- The (fixed) number of players is learned on the run.
.. note:: This is **fully decentralized**: player do not need to know the (fixed) number of players!
- Reference: [[Multi-Player Bandits Revisited, Lilian Besson and Emilie Kaufmann, 2017]](https://hal.inria.fr/hal-01629733)
.. warning:: This is still very experimental!
.. note:: For a more generic approach, see the wrapper defined in :class:`EstimateM.EstimateM`.
"""
from __future__ import division, print_function # Python 2 compatibility
__author__ = "Lilian Besson"
__version__ = "0.8"
import numpy as np
try:
from .BaseMPPolicy import BaseMPPolicy
from .ChildPointer import ChildPointer
from .RandTopM import oneRandTopM
from .EstimateM import threshold_on_t_with_horizon, threshold_on_t_doubling_trick, threshold_on_t
except ImportError:
from BaseMPPolicy import BaseMPPolicy
from ChildPointer import ChildPointer
from RandTopM import oneRandTopM
from EstimateM import threshold_on_t_with_horizon, threshold_on_t_doubling_trick, threshold_on_t
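# --- A minimal standalone sketch (for illustration only, it is not used by the classes below)
# of the estimation rule implemented in oneRandTopMEst.handleCollision: collisions observed
# on the currently estimated M-best arms are accumulated, and the estimate of the number of
# players is increased by one as soon as this count exceeds the chosen threshold function.
# All the argument names below are illustrative only.
def _sketch_of_estimate_update(nbPlayersEstimate, nbArms, collisionCount, currentBest, t, threshold, horizon=None):
    """Return the (possibly increased) estimate of the number of players, resetting the collision counts if it increased."""
    if nbPlayersEstimate < nbArms:  # no need to estimate beyond K players
        if np.sum(collisionCount[currentBest]) > threshold(t, nbPlayersEstimate, horizon):
            nbPlayersEstimate = min(nbPlayersEstimate + 1, nbArms)
            collisionCount.fill(0)  # restart the collision counts after each increase
    return nbPlayersEstimate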
class oneRandTopMEst(oneRandTopM):
r""" Class that acts as a child policy, but in fact it passes all its method calls to the mother class, which forwards them to its i-th player.
- The procedure to estimate :math:`\hat{M}_i(t)` is not so simple: every player starts with :math:`\hat{M}_i(0) = 1`, and increases its estimate, :math:`\hat{M}_i(t+1) = \hat{M}_i(t) + 1`, as soon as the number of collisions observed on its currently estimated :math:`\hat{M}_i(t)` best arms exceeds a threshold function (a minimal sketch of this update rule is given just before this class).
"""
def __init__(self, threshold, *args, **kwargs):
if 'horizon' in kwargs:
self.horizon = kwargs['horizon']
del kwargs['horizon']
else:
self.horizon = None
super(oneRandTopMEst, self).__init__(*args, **kwargs)
# Parameters
self.maxRank = 1
self.threshold = threshold #: Threshold function
# Internal variables
self.nbPlayersEstimate = 1 #: Number of players. Optimistic: start by assuming it is alone!
self.collisionCount = np.zeros(self.nbArms, dtype=int) #: Count collisions on each arm, since last increase of nbPlayersEstimate
self.timeSinceLastCollision = 0 #: Time since the last collision (kept only for debugging, it is not used by the algorithm).
self.t = 0 #: Internal time
def __str__(self):  # Better to recompute it automatically
player = self.mother._players[self.playerId]
Mbest_is_incorrect = self.t < self.nbArms or np.any(np.isinf(player.index)) or np.any(np.isnan(player.index))
str_Mbest = "" if Mbest_is_incorrect else r", $M$-best: ${}$".format(list(self.Mbest()))
str_chosen_arm = r", arm: ${}$".format(self.chosen_arm) if self.chosen_arm is not None else ""
# WARNING remember to change here when defining a new variant of RandTopM
return r"#{}<{}TopM{}{}-Est{}-{}{}{}{}>".format(self.playerId + 1, "MC" if self._withChair else "Rand", ("ExtraCautious" if self._exitIfWorstWasPicked else "Cautious") if self._pickWorstFirst else "", "" if self._pickPrevWorstFirst else "Old", "Plus" if self.horizon else "", player, str_Mbest, str_chosen_arm, "($T={}$)".format(self.horizon) if self.horizon else "")
def startGame(self):
"""Start game."""
super(oneRandTopMEst, self).startGame()
# Est part
self.nbPlayersEstimate = self.maxRank = 1 # Optimistic: start by assuming it is alone!
self.collisionCount.fill(0)
self.timeSinceLastCollision = 0
self.t = 0
def handleCollision(self, arm, reward=None):
"""Select a new rank, and maybe update nbPlayersEstimate."""
super(oneRandTopMEst, self).handleCollision(arm, reward=reward)
# we can be smart, and stop all this as soon as M = K !
if self.nbPlayersEstimate < self.nbArms:
self.collisionCount[arm] += 1
# print("\n - A oneRandTopMEst player {} saw a collision on {}, since last update of nbPlayersEstimate = {} it is the {} th collision on that arm {}...".format(self, arm, self.nbPlayersEstimate, self.collisionCount[arm], arm)) # DEBUG
# Then, estimate the current ranking of the arms and the set of the M best arms
currentBest = self.estimatedBestArms(self.nbPlayersEstimate)
# print("Current estimation of the {} best arms is {} ...".format(self.nbPlayersEstimate, currentBest)) # DEBUG
collisionCount_on_currentBest = np.sum(self.collisionCount[currentBest])
# print("Current count of collision on the {} best arms is {} ...".format(self.nbPlayersEstimate, collisionCount_on_currentBest)) # DEBUG
# And finally, compare the collision count with the current threshold
threshold = self.threshold(self.t, self.nbPlayersEstimate, self.horizon)
# print("Using timeSinceLastCollision = {}, and t = {}, threshold = {:.3g} ...".format(self.timeSinceLastCollision, self.t, threshold))
if collisionCount_on_currentBest > threshold:
self.nbPlayersEstimate = self.maxRank = min(1 + self.nbPlayersEstimate, self.nbArms)
# print("The collision count {} was larger than the threshold {:.3g} se we restart the collision count, and increase the nbPlayersEstimate to {}.".format(collisionCount_on_currentBest, threshold, self.nbPlayersEstimate)) # DEBUG
self.collisionCount.fill(0)
# Finally, restart timeSinceLastCollision
self.timeSinceLastCollision = 0
def getReward(self, arm, reward):
"""One transmission without collision."""
self.t += 1
# Obtaining a reward, even 0, means no collision on that arm for this time
# So, first, we count one more step without collision
self.timeSinceLastCollision += 1
# print("Time since last collision = {} ...".format(self.timeSinceLastCollision)) # DEBUG
# Then use the reward for the arm learning algorithm
return super(oneRandTopMEst, self).getReward(arm, reward)
# --- Class RandTopMEst
#: Whether or not to use the variant with the "chair": after using an arm successfully (no collision), a player does not move on future collisions (she assumes the others will move). But she will still change her chosen arm if it lies outside of the estimated M-best set. :class:`RandTopMEst` (and variants) uses `False` and :class:`MCTopMEst` (and variants) uses `True`.
WITH_CHAIR = True
WITH_CHAIR = False
#: XXX First experimental idea: when the currently chosen arm lies outside of the estimated Mbest set, force to first try (at least once) the arm with lowest UCB indexes in this Mbest_j(t) set. Used by :class:`RandTopMEstCautious` and :class:`RandTopMEstExtraCautious`, and by :class:`MCTopMEstCautious` and :class:`MCTopMEstExtraCautious`.
OPTIM_PICK_WORST_FIRST = True
OPTIM_PICK_WORST_FIRST = False
#: XXX Second experimental idea: when the currently chosen arm becomes the worst of the estimated Mbest set, leave it (even before it lies outside of Mbest_j(t)). Used by :class:`RandTopMEstExtraCautious` and :class:`MCTopMEstExtraCautious`.
OPTIM_EXIT_IF_WORST_WAS_PICKED = True
OPTIM_EXIT_IF_WORST_WAS_PICKED = False
#: XXX Third experimental idea, controlling how the new arm is chosen when the player has to leave its current arm (the exact rule is implemented in :class:`RandTopM.oneRandTopM`). **Default now!** `False` only for :class:`RandTopMEstOld` and :class:`MCTopMEstOld`.
OPTIM_PICK_PREV_WORST_FIRST = False
OPTIM_PICK_PREV_WORST_FIRST = True
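# --- How these flags map to the variants defined below:
# - :class:`RandTopMEst` and :class:`RandTopMEstPlus` use the module-level defaults above (no "chair"),
# - :class:`MCTopMEst` and :class:`MCTopMEstPlus` force ``withChair=True`` (and keep ``pickPrevWorstFirst=True``),
# - the "Plus" variants additionally require the horizon and use :func:`EstimateM.threshold_on_t_with_horizon`,
# - the "Old" variants (:class:`RandTopMEstOld`, :class:`MCTopMEstOld`) are the ones with ``pickPrevWorstFirst=False``.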
class RandTopMEst(BaseMPPolicy):
""" RandTopMEst: a proposal for an efficient multi-players learning policy, with no prior knowledge of the number of players.
"""
def __init__(self, nbPlayers, nbArms, playerAlgo,
withChair=WITH_CHAIR,
pickWorstFirst=OPTIM_PICK_WORST_FIRST,
exitIfWorstWasPicked=OPTIM_EXIT_IF_WORST_WAS_PICKED,
pickPrevWorstFirst=OPTIM_PICK_PREV_WORST_FIRST,
threshold=threshold_on_t_doubling_trick, lower=0., amplitude=1.,
*args, **kwargs):
"""
- nbPlayers: number of players to create (in self._players).
- playerAlgo: class to use for every player.
- nbArms: number of arms, given as first argument to playerAlgo.
- withChair: see ``WITH_CHAIR``,
- pickWorstFirst: see ``OPTIM_PICK_WORST_FIRST``,
- exitIfWorstWasPicked: see ``OPTIM_EXIT_IF_WORST_WAS_PICKED``,
- pickPrevWorstFirst: see ``OPTIM_PICK_PREV_WORST_FIRST``,
- threshold: the threshold function to use, see :func:`EstimateM.threshold_on_t_with_horizon`, :func:`EstimateM.threshold_on_t_doubling_trick` or :func:`EstimateM.threshold_on_t` above.
- `*args`, `**kwargs`: arguments, named arguments, given to playerAlgo.
Example:
>>> from Policies import *
>>> import random; random.seed(0); import numpy as np; np.random.seed(0)
>>> nbArms = 17
>>> nbPlayers = 6
>>> s = RandTopMEst(nbPlayers, nbArms, UCB)
>>> [ child.choice() for child in s.children ]
[12, 15, 0, 3, 3, 7]
- To get a list of usable players, use ``s.children``.
.. warning:: ``s._players`` is for internal use ONLY!
"""
assert nbPlayers > 0, "Error, the parameter 'nbPlayers' for RandTopMEst class has to be > 0." # DEBUG
self.nbPlayers = nbPlayers #: Number of players
self.withChair = withChair #: Using a chair ?
self.pickWorstFirst = pickWorstFirst #: Using first optimization ?
self.exitIfWorstWasPicked = exitIfWorstWasPicked #: Using second optimization ?
self.pickPrevWorstFirst = pickPrevWorstFirst #: Using third optimization ? Default to yes now.
self._players = [None] * nbPlayers
self.children = [None] * nbPlayers #: List of children, fake algorithms
self.nbArms = nbArms #: Number of arms
fake_maxRank = None
for playerId in range(nbPlayers):
self._players[playerId] = playerAlgo(nbArms, *args, **kwargs)
self.children[playerId] = oneRandTopMEst(threshold, fake_maxRank, withChair, pickWorstFirst, exitIfWorstWasPicked, pickPrevWorstFirst, self, playerId)
def __str__(self):
return "RandTopMEst({} x {})".format(self.nbPlayers, str(self._players[0]))
class RandTopMEstPlus(BaseMPPolicy):
""" RandTopMEstPlus: a proposal for an efficient multi-players learning policy, with no prior knowledge of the number of players, but using the knowledge of the horizon :math:`T`.
"""
def __init__(self, nbPlayers, nbArms, playerAlgo, horizon,
withChair=WITH_CHAIR,
pickWorstFirst=OPTIM_PICK_WORST_FIRST,
exitIfWorstWasPicked=OPTIM_EXIT_IF_WORST_WAS_PICKED,
pickPrevWorstFirst=OPTIM_PICK_PREV_WORST_FIRST,
lower=0., amplitude=1.,
*args, **kwargs):
"""
- nbPlayers: number of players to create (in self._players).
- playerAlgo: class to use for every player.
- nbArms: number of arms, given as first argument to playerAlgo.
- horizon: need to know the horizon :math:`T`.
- withChair: see ``WITH_CHAIR``,
- pickWorstFirst: see ``OPTIM_PICK_WORST_FIRST``,
- exitIfWorstWasPicked: see ``OPTIM_EXIT_IF_WORST_WAS_PICKED``,
- pickPrevWorstFirst: see ``OPTIM_PICK_PREV_WORST_FIRST``,
- the threshold function is fixed to :func:`EstimateM.threshold_on_t_with_horizon` (see above), applied with the given horizon.
- `*args`, `**kwargs`: arguments, named arguments, given to playerAlgo.
Example:
>>> from Policies import *
>>> import random; random.seed(0); import numpy as np; np.random.seed(0)
>>> nbArms = 17
>>> nbPlayers = 6
>>> horizon = 1000
>>> s = RandTopMEstPlus(nbPlayers, nbArms, UCB, horizon)
>>> [ child.choice() for child in s.children ]
[12, 15, 0, 3, 3, 7]
- To get a list of usable players, use ``s.children``.
.. warning:: ``s._players`` is for internal use ONLY!
"""
assert nbPlayers > 0, "Error, the parameter 'nbPlayers' for RandTopMEstPlus class has to be > 0." # DEBUG
self.nbPlayers = nbPlayers #: Number of players
self.withChair = withChair #: Using a chair ?
self.pickWorstFirst = pickWorstFirst #: Using first optimization ?
self.exitIfWorstWasPicked = exitIfWorstWasPicked #: Using second optimization ?
self.pickPrevWorstFirst = pickPrevWorstFirst #: Using third optimization ? Default to yes now.
self._players = [None] * nbPlayers
self.children = [None] * nbPlayers #: List of children, fake algorithms
self.nbArms = nbArms #: Number of arms
fake_maxRank = None
for playerId in range(nbPlayers):
self._players[playerId] = playerAlgo(nbArms, *args, **kwargs)
self.children[playerId] = oneRandTopMEst(threshold_on_t_with_horizon, fake_maxRank, withChair, pickWorstFirst, exitIfWorstWasPicked, pickPrevWorstFirst, self, playerId, horizon=horizon)
def __str__(self):
return "RandTopMEstPlus({} x {})".format(self.nbPlayers, str(self._players[0]))
class MCTopMEst(RandTopMEst):
""" MCTopMEst: another proposal for an efficient multi-players learning policy, more "stationary" than RandTopMEst.
.. warning:: Still very experimental! But it seems to be the most efficient decentralized MP algorithm we have so far...
"""
def __init__(self, nbPlayers, nbArms, playerAlgo,
lower=0., amplitude=1.,
*args, **kwargs):
"""
- nbPlayers: number of players to create (in self._players).
- playerAlgo: class to use for every player.
- nbArms: number of arms, given as first argument to playerAlgo.
- `*args`, `**kwargs`: arguments, named arguments, given to playerAlgo.
"""
super(MCTopMEst, self).__init__(nbPlayers, nbArms, playerAlgo, withChair=True, pickWorstFirst=False, exitIfWorstWasPicked=False, pickPrevWorstFirst=True, *args, **kwargs)
def __str__(self):
return "MCTopMEst({} x {})".format(self.nbPlayers, str(self._players[0]))
class MCTopMEstPlus(RandTopMEstPlus):
""" MCTopMEstPlus: another proposal for an efficient multi-players learning policy, more "stationary" than RandTopMEstPlus.
.. warning:: Still very experimental! But it seems to be the most efficient decentralized MP algorithm we have so far...
"""
def __init__(self, nbPlayers, nbArms, playerAlgo, horizon,
lower=0., amplitude=1.,
*args, **kwargs):
"""
- nbPlayers: number of players to create (in self._players).
- playerAlgo: class to use for every player.
- nbArms: number of arms, given as first argument to playerAlgo.
- `*args`, `**kwargs`: arguments, named arguments, given to playerAlgo.
"""
super(MCTopMEstPlus, self).__init__(nbPlayers, nbArms, playerAlgo, horizon, withChair=True, pickWorstFirst=False, exitIfWorstWasPicked=False, pickPrevWorstFirst=True, *args, **kwargs)
def __str__(self):
return "MCTopMEstPlus({} x {})".format(self.nbPlayers, str(self._players[0]))