Source code for PoliciesMultiPlayers.RandTopM

# -*- coding: utf-8 -*-
r""" RandTopM: four proposals for an efficient multi-players learning policy. :class:`RandTopM` and :class:`MCTopM` are the two main algorithms, with variants (see below).

- Each child player is selfish, and plays according to an index policy (any index policy, e.g., UCB, Thompson, KL-UCB, BayesUCB, etc.),
- But instead of aiming at the best (the 1-st best) arm, player j constantly aims at *one* of the M best arms (denoted :math:`\hat{M}^j(t)`), according to its index policy with indexes :math:`g^j_k(t)` (where M is the number of players),
- When a collision occurs, or when the currently chosen arm lies outside of the current estimate of the M-best set, a new arm is chosen.

.. note:: This is not fully decentralized: each child player needs to know the (fixed) number of players.

- Reference: [[Multi-Player Bandits Revisited, Lilian Besson and Emilie Kaufmann, 2017]](https://hal.inria.fr/hal-01629733)
"""
from __future__ import division, print_function  # Python 2 compatibility

__author__ = "Lilian Besson"
__version__ = "0.8"

import numpy as np
import numpy.random as rn


try:
    from .BaseMPPolicy import BaseMPPolicy
    from .ChildPointer import ChildPointer
except ImportError:
    from BaseMPPolicy import BaseMPPolicy
    from ChildPointer import ChildPointer


#: Whether or not to use the variant with the "chair": after using an arm successfully (no collision), a player won't move after future collisions (she assumes the others will move). But she will still change her chosen arm if it lies outside of the estimated M-best. :class:`RandTopM` (and variants) uses `False` and :class:`MCTopM` (and variants) uses `True`.
WITH_CHAIR = True
WITH_CHAIR = False


#: XXX First experimental idea: when the currently chosen arm lies outside of the estimated Mbest set, force the player to first try (at least once) the arm with the lowest UCB index in this Mbest_j(t) set. Used by :class:`RandTopMCautious` and :class:`RandTopMExtraCautious`, and by :class:`MCTopMCautious` and :class:`MCTopMExtraCautious`.
OPTIM_PICK_WORST_FIRST = True
OPTIM_PICK_WORST_FIRST = False


#: XXX Second experimental idea: when the currently chosen arm becomes the worst of the estimated Mbest set, leave it (even before it lies outside of Mbest_j(t)). Used by :class:`RandTopMExtraCautious` and :class:`MCTopMExtraCautious`.
OPTIM_EXIT_IF_WORST_WAS_PICKED = True
OPTIM_EXIT_IF_WORST_WAS_PICKED = False


#: XXX Third experimental idea: when a new arm has to be chosen, pick it uniformly from the intersection of the current estimated Mbest set and the arms whose index was not larger than the previous arm's index (see :meth:`oneRandTopM.worst_previous__and__current_Mbest`). **Default now!** `False` only for :class:`RandTopMOld` and :class:`MCTopMOld`.
OPTIM_PICK_PREV_WORST_FIRST = False
OPTIM_PICK_PREV_WORST_FIRST = True
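
# Summary of how the flags above map to the variants defined below (descriptive comment
# only, added for readability; it simply restates the calls to RandTopM.__init__ made by
# each subclass):
#
#   Variant                withChair  pickWorstFirst  exitIfWorstWasPicked  pickPrevWorstFirst
#   RandTopMOld            False      False           False                 False
#   RandTopM (defaults)    False      False           False                 True
#   RandTopMCautious       False      True            False                 False
#   RandTopMExtraCautious  False      True            True                  False
#   MCTopMOld              True       False           False                 False
#   MCTopM                 True       False           False                 True
#   MCTopMCautious         True       True            False                 False
#   MCTopMExtraCautious    True       True            True                  False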


# --- Class oneRandTopM, for children

class oneRandTopM(ChildPointer):
    """ Class that acts as a child policy, but in fact it passes all its method calls to the mother class, which passes them to its i-th player.

    - Except for the handleCollision method: a new random arm is sampled after observing a collision,
    - And the player does not aim at the best arm, but at one of the M best arms, based on her index policy.
    - (See variants for more details.)
    """

    def __init__(self, maxRank, withChair, pickWorstFirst, exitIfWorstWasPicked, pickPrevWorstFirst, *args, **kwargs):
        super(oneRandTopM, self).__init__(*args, **kwargs)
        # self.nbArms = self.mother._players[self.playerId].nbArms  #: Number of arms
        self.maxRank = maxRank  #: Max rank, usually nbPlayers but can be different.
        self.chosen_arm = None  #: Current chosen arm.
        self._withChair = withChair  # Whether or not to use the variant with the "chair".
        self._pickWorstFirst = pickWorstFirst  # XXX Whether or not to use the variant with the "cautious choice".
        self._exitIfWorstWasPicked = exitIfWorstWasPicked  # XXX Whether or not to use the second variant with the "even more cautious choice"...
        self._pickPrevWorstFirst = pickPrevWorstFirst  # XXX Whether or not to use the third variant with the "smart and cautious choice"...
        self.sitted = False  #: Not yet sitted. After 1 step without collision, don't react to collisions (but still react when the chosen arm lies outside M-best).
        self.prevWorst = []  #: Keep track of the arms that were worse than the chosen one (at the previous time step).
        self.t = -1  #: Internal time

    def __str__(self):   # Better to recompute it automatically
        player = self.mother._players[self.playerId]
        Mbest_is_incorrect = self.t < self.nbArms or np.any(np.isinf(player.index)) or np.any(np.isnan(player.index))
        str_Mbest = "" if Mbest_is_incorrect else r", $M$-best: ${}$".format(list(self.Mbest()))
        str_chosen_arm = r", arm: ${}$".format(self.chosen_arm) if self.chosen_arm is not None else ""
        # WARNING remember to change here when defining a new variant of RandTopM
        return r"#{}<{}TopM{}{}-{}{}{}>".format(self.playerId + 1, "MC" if self._withChair else "Rand", ("ExtraCautious" if self._exitIfWorstWasPicked else "Cautious") if self._pickWorstFirst else "", "" if self._pickPrevWorstFirst else "Old", player, str_Mbest, str_chosen_arm)

    def startGame(self):
        """Start game."""
        super(oneRandTopM, self).startGame()
        self.t = 0
        self.sitted = False  # Start not sitted, of course!
        self.chosen_arm = None

    def Mbest(self):
        """ Current estimate of the M-best arms. M is the maxRank given to the algorithm."""
        return self.estimatedBestArms(self.maxRank)

    def worst_Mbest(self):
        """ Index of the worst arm in the current estimate of the M-best arms. M is the maxRank given to the algorithm."""
        order = self.estimatedOrder()
        return order[-self.maxRank]

    def worst_previous__and__current_Mbest(self, current_arm):
        r""" Return the set from which to sample a new random arm for :class:`MCTopM` (this optimization is now the default):

        .. math:: \hat{M}^j(t) \cap \{ m : g^j_m(t-1) \leq g^j_k(t-1) \}.
        """
        current_Mbest = self.Mbest()
        prev_WorstThenChair = self.prevWorst
        if prev_WorstThenChair is None or len(prev_WorstThenChair) == 0:
            if self.t > 1:
                print("WARNING for the MCTopM variant using the 'pickPrevWorstFirst' optimization, it should never find an empty set 'prev_WorstThenChair' ...")  # DEBUG
            return current_Mbest
        else:
            intersection = np.intersect1d(current_Mbest, prev_WorstThenChair)
            if len(intersection) == 0:
                return current_Mbest
            return intersection

    def handleCollision(self, arm, reward=None):
        """ Get a new random arm from the current estimate of Mbest, and give the reward to the algorithm if it is not None."""
        # RandTopM UCB indexes learn on the SENSING, not on the successful transmissions!
        if reward is not None:
            # print("Info: RandTopM UCB internal indexes DO get updated by the reward in case of collision: learning is done on SENSING, not on successful transmissions!")  # DEBUG
            super(oneRandTopM, self).getReward(arm, reward)
        if not (self._withChair and self.sitted):
            self.chosen_arm = rn.choice(self.Mbest())  # New random arm
            # print(" - A oneRandTopM player {} saw a collision on arm {}, so she had to select a new random arm {} from her estimate of M-best = {} ...".format(self, arm, self.chosen_arm, self.Mbest()))  # DEBUG
        # else:
        #     print(" - A oneRandTopM player {} saw a collision on arm {}, but she ignores it as she plays with a chair and is now sitted ...".format(self, arm))  # DEBUG

    def getReward(self, arm, reward):
        """ Pass the call to self.mother._getReward_one(playerId, arm, reward) with the player's ID number. """
        super(oneRandTopM, self).getReward(arm, reward)
        if self.t >= self.nbArms:
            if self._withChair:
                if not self.sitted:
                    # print(" - A oneRandTopM player {} used this arm {} without any collision, so she is now sitted on this rank, and will not change in case of collision (but will change if the arm lies outside her estimate of M-best) ...".format(self, self.chosen_arm))  # DEBUG
                    self.sitted = True
                # else:
                #     print(" - A oneRandTopM player {} is already sitted on this arm {} ...".format(self, self.chosen_arm))  # DEBUG
            # else:
            #     print(" - A oneRandTopM player {} is not playing with a chair, nothing to do ...".format(self))  # DEBUG

    def choice(self):
        """Reconsider the choice of arm, and then use the chosen arm.

        - For all variants, if the chosen arm is no longer in the current estimate of the Mbest set, a new one is selected,
        - The basic RandTopM selects an arm uniformly at random in the estimated Mbest set,
        - MCTopM starts by being *"non sitted"* on its new chosen arm,
        - MCTopMCautious is forced to first try the arm with the *lowest* UCB index (or whatever index policy is used).
        """
        if self.t < self.nbArms:  # Force to sample each arm at least once
            self.chosen_arm = super(oneRandTopM, self).choice()
        else:  # But now, trust the estimated set Mbest
            current_Mbest = self.Mbest()
            worst_current = self.worst_Mbest()
            # XXX optimization for MCTopMExtraCautious
            if (self.chosen_arm not in current_Mbest) or (self._exitIfWorstWasPicked and self.chosen_arm == worst_current):
                if self._withChair:
                    self.sitted = False
                # old_arm = self.chosen_arm
                # XXX optimization for MCTopMCautious and MCTopMExtraCautious
                if self._pickWorstFirst:  # aim at the worst of the current estimated arms
                    self.chosen_arm = worst_current
                elif self._pickPrevWorstFirst:
                    self.chosen_arm = rn.choice(self.worst_previous__and__current_Mbest(self.chosen_arm))
                else:
                    self.chosen_arm = rn.choice(current_Mbest)  # New random arm
                # print("\n - A oneRandTopM player {} had chosen arm = {}, but it lied outside of M-best = {}, so she selected a new one = {} {}...".format(self, old_arm, current_Mbest, self.chosen_arm, "and is no longer sitted" if self._withChair else "but is not playing with a chair"))  # DEBUG
        # Done
        self.t += 1
        # XXX optimization for MCTopM
        if self._pickPrevWorstFirst:
            index = self._index()
            self.prevWorst = np.where(index <= index[self.chosen_arm])[0]
        return self.chosen_arm

    def _index(self):
        """ Update and return the indexes of the underlying index policy."""
        self.mother._players[self.playerId].computeAllIndex()
        return self.mother._players[self.playerId].index
# --- Class RandTopM

class RandTopM(BaseMPPolicy):
    """ RandTopM: a proposal for an efficient multi-players learning policy. """

    def __init__(self, nbPlayers, nbArms, playerAlgo, withChair=WITH_CHAIR, pickWorstFirst=OPTIM_PICK_WORST_FIRST, exitIfWorstWasPicked=OPTIM_EXIT_IF_WORST_WAS_PICKED, pickPrevWorstFirst=OPTIM_PICK_PREV_WORST_FIRST, maxRank=None, lower=0., amplitude=1., *args, **kwargs):
        """
        - nbPlayers: number of players to create (in self._players).
        - playerAlgo: class to use for every player.
        - nbArms: number of arms, given as first argument to playerAlgo.
        - withChair: see ``WITH_CHAIR``,
        - pickWorstFirst: see ``OPTIM_PICK_WORST_FIRST``,
        - exitIfWorstWasPicked: see ``OPTIM_EXIT_IF_WORST_WAS_PICKED``,
        - pickPrevWorstFirst: see ``OPTIM_PICK_PREV_WORST_FIRST``,
        - maxRank: maximum rank allowed by the RandTopM child (default to nbPlayers, but for instance if there is 2 × RandTopM[UCB] + 2 × RandTopM[klUCB], maxRank should be 4 not 2).
        - `*args`, `**kwargs`: arguments, named arguments, given to playerAlgo.

        Example:

        >>> from Policies import *
        >>> import random; random.seed(0); import numpy as np; np.random.seed(0)
        >>> nbArms = 17
        >>> nbPlayers = 6
        >>> s = RandTopM(nbPlayers, nbArms, UCB)
        >>> [ child.choice() for child in s.children ]
        [12, 15, 0, 3, 3, 7]

        - To get a list of usable players, use ``s.children``.

        .. warning:: ``s._players`` is for internal use ONLY!
        """
        assert nbPlayers > 0, "Error, the parameter 'nbPlayers' for RandTopM class has to be > 0."  # DEBUG
        if maxRank is None:
            maxRank = nbPlayers
        self.maxRank = maxRank  #: Max rank, usually nbPlayers but can be different
        self.nbPlayers = nbPlayers  #: Number of players
        self.withChair = withChair  #: Using a chair?
        self.pickWorstFirst = pickWorstFirst  #: Using the first optimization?
        self.exitIfWorstWasPicked = exitIfWorstWasPicked  #: Using the second optimization?
        self.pickPrevWorstFirst = pickPrevWorstFirst  #: Using the third optimization? Default to yes now.
        self._players = [None] * nbPlayers
        self.children = [None] * nbPlayers  #: List of children, fake algorithms
        self.nbArms = nbArms  #: Number of arms
        for playerId in range(nbPlayers):
            self._players[playerId] = playerAlgo(nbArms, *args, **kwargs)
            self.children[playerId] = oneRandTopM(maxRank, withChair, pickWorstFirst, exitIfWorstWasPicked, pickPrevWorstFirst, self, playerId)

    def __str__(self):
        return "RandTopM({} x {})".format(self.nbPlayers, str(self._players[0]))

class RandTopMCautious(RandTopM):
    """ RandTopMCautious: another proposal for an efficient multi-players learning policy, more "stationary" than RandTopM.

    .. warning:: Still very experimental! But it seems to be the most efficient decentralized MP algorithm we have so far...
    """

    def __init__(self, nbPlayers, nbArms, playerAlgo, maxRank=None, lower=0., amplitude=1., *args, **kwargs):
        """
        - nbPlayers: number of players to create (in self._players).
        - playerAlgo: class to use for every player.
        - nbArms: number of arms, given as first argument to playerAlgo.
        - maxRank: maximum rank allowed by the RandTopMCautious child (default to nbPlayers, but for instance if there is 2 × RandTopMCautious[UCB] + 2 × RandTopMCautious[klUCB], maxRank should be 4 not 2).
        - `*args`, `**kwargs`: arguments, named arguments, given to playerAlgo.

        Example:

        >>> from Policies import *
        >>> import random; random.seed(0); import numpy as np; np.random.seed(0)
        >>> nbArms = 17
        >>> nbPlayers = 6
        >>> s = RandTopMCautious(nbPlayers, nbArms, UCB)
        >>> [ child.choice() for child in s.children ]
        [12, 15, 0, 3, 3, 7]
        """
        super(RandTopMCautious, self).__init__(nbPlayers, nbArms, playerAlgo, withChair=False, pickWorstFirst=True, exitIfWorstWasPicked=False, pickPrevWorstFirst=False, maxRank=maxRank, *args, **kwargs)

    def __str__(self):
        return "RandTopMCautious({} x {})".format(self.nbPlayers, str(self._players[0]))

class RandTopMExtraCautious(RandTopM):
    """ RandTopMExtraCautious: another proposal for an efficient multi-players learning policy, more "stationary" than RandTopM.

    .. warning:: Still very experimental! But it seems to be the most efficient decentralized MP algorithm we have so far...
    """

    def __init__(self, nbPlayers, nbArms, playerAlgo, maxRank=None, lower=0., amplitude=1., *args, **kwargs):
        """
        - nbPlayers: number of players to create (in self._players).
        - playerAlgo: class to use for every player.
        - nbArms: number of arms, given as first argument to playerAlgo.
        - maxRank: maximum rank allowed by the RandTopMExtraCautious child (default to nbPlayers, but for instance if there is 2 × RandTopMExtraCautious[UCB] + 2 × RandTopMExtraCautious[klUCB], maxRank should be 4 not 2).
        - `*args`, `**kwargs`: arguments, named arguments, given to playerAlgo.

        Example:

        >>> from Policies import *
        >>> import random; random.seed(0); import numpy as np; np.random.seed(0)
        >>> nbArms = 17
        >>> nbPlayers = 6
        >>> s = RandTopMExtraCautious(nbPlayers, nbArms, UCB)
        >>> [ child.choice() for child in s.children ]
        [12, 15, 0, 3, 3, 7]
        """
        super(RandTopMExtraCautious, self).__init__(nbPlayers, nbArms, playerAlgo, withChair=False, pickWorstFirst=True, exitIfWorstWasPicked=True, pickPrevWorstFirst=False, maxRank=maxRank, *args, **kwargs)

    def __str__(self):
        return "RandTopMExtraCautious({} x {})".format(self.nbPlayers, str(self._players[0]))

class RandTopMOld(RandTopM):
    """ RandTopMOld: another proposal for an efficient multi-players learning policy, more "stationary" than RandTopM. """

    def __init__(self, nbPlayers, nbArms, playerAlgo, maxRank=None, lower=0., amplitude=1., *args, **kwargs):
        """
        - nbPlayers: number of players to create (in self._players).
        - playerAlgo: class to use for every player.
        - nbArms: number of arms, given as first argument to playerAlgo.
        - maxRank: maximum rank allowed by the RandTopMOld child (default to nbPlayers, but for instance if there is 2 × RandTopMOld[UCB] + 2 × RandTopMOld[klUCB], maxRank should be 4 not 2).
        - `*args`, `**kwargs`: arguments, named arguments, given to playerAlgo.

        Example:

        >>> from Policies import *
        >>> import random; random.seed(0); import numpy as np; np.random.seed(0)
        >>> nbArms = 17
        >>> nbPlayers = 6
        >>> s = RandTopMOld(nbPlayers, nbArms, UCB)
        >>> [ child.choice() for child in s.children ]
        [12, 15, 0, 3, 3, 7]
        """
        super(RandTopMOld, self).__init__(nbPlayers, nbArms, playerAlgo, withChair=False, pickWorstFirst=False, exitIfWorstWasPicked=False, pickPrevWorstFirst=False, maxRank=maxRank, *args, **kwargs)

    def __str__(self):
        return "RandTopMOld({} x {})".format(self.nbPlayers, str(self._players[0]))

class MCTopM(RandTopM):
    """ MCTopM: another proposal for an efficient multi-players learning policy, more "stationary" than RandTopM.

    .. warning:: Still very experimental! But it seems to be the most efficient decentralized MP algorithm we have so far...
    """

    def __init__(self, nbPlayers, nbArms, playerAlgo, maxRank=None, lower=0., amplitude=1., *args, **kwargs):
        """
        - nbPlayers: number of players to create (in self._players).
        - playerAlgo: class to use for every player.
        - nbArms: number of arms, given as first argument to playerAlgo.
        - maxRank: maximum rank allowed by the MCTopM child (default to nbPlayers, but for instance if there is 2 × MCTopM[UCB] + 2 × MCTopM[klUCB], maxRank should be 4 not 2).
        - `*args`, `**kwargs`: arguments, named arguments, given to playerAlgo.

        Example:

        >>> from Policies import *
        >>> import random; random.seed(0); import numpy as np; np.random.seed(0)
        >>> nbArms = 17
        >>> nbPlayers = 6
        >>> s = MCTopM(nbPlayers, nbArms, UCB)
        >>> [ child.choice() for child in s.children ]
        [12, 15, 0, 3, 3, 7]
        """
        super(MCTopM, self).__init__(nbPlayers, nbArms, playerAlgo, withChair=True, pickWorstFirst=False, exitIfWorstWasPicked=False, pickPrevWorstFirst=True, maxRank=maxRank, *args, **kwargs)

    def __str__(self):
        return "MCTopM({} x {})".format(self.nbPlayers, str(self._players[0]))

class MCTopMCautious(RandTopM):
    """ MCTopMCautious: another proposal for an efficient multi-players learning policy, more "stationary" than RandTopM.

    .. warning:: Still very experimental! But it seems to be the most efficient decentralized MP algorithm we have so far...
    """

    def __init__(self, nbPlayers, nbArms, playerAlgo, maxRank=None, lower=0., amplitude=1., *args, **kwargs):
        """
        - nbPlayers: number of players to create (in self._players).
        - playerAlgo: class to use for every player.
        - nbArms: number of arms, given as first argument to playerAlgo.
        - maxRank: maximum rank allowed by the MCTopMCautious child (default to nbPlayers, but for instance if there is 2 × MCTopMCautious[UCB] + 2 × MCTopMCautious[klUCB], maxRank should be 4 not 2).
        - `*args`, `**kwargs`: arguments, named arguments, given to playerAlgo.

        Example:

        >>> from Policies import *
        >>> import random; random.seed(0); import numpy as np; np.random.seed(0)
        >>> nbArms = 17
        >>> nbPlayers = 6
        >>> s = MCTopMCautious(nbPlayers, nbArms, UCB)
        >>> [ child.choice() for child in s.children ]
        [12, 15, 0, 3, 3, 7]
        """
        super(MCTopMCautious, self).__init__(nbPlayers, nbArms, playerAlgo, withChair=True, pickWorstFirst=True, exitIfWorstWasPicked=False, pickPrevWorstFirst=False, maxRank=maxRank, *args, **kwargs)

    def __str__(self):
        return "MCTopMCautious({} x {})".format(self.nbPlayers, str(self._players[0]))

class MCTopMExtraCautious(RandTopM):
    """ MCTopMExtraCautious: another proposal for an efficient multi-players learning policy, more "stationary" than RandTopM.

    .. warning:: Still very experimental! But it seems to be the most efficient decentralized MP algorithm we have so far...
    """

    def __init__(self, nbPlayers, nbArms, playerAlgo, maxRank=None, lower=0., amplitude=1., *args, **kwargs):
        """
        - nbPlayers: number of players to create (in self._players).
        - playerAlgo: class to use for every player.
        - nbArms: number of arms, given as first argument to playerAlgo.
        - maxRank: maximum rank allowed by the MCTopMExtraCautious child (default to nbPlayers, but for instance if there is 2 × MCTopMExtraCautious[UCB] + 2 × MCTopMExtraCautious[klUCB], maxRank should be 4 not 2).
        - `*args`, `**kwargs`: arguments, named arguments, given to playerAlgo.

        Example:

        >>> from Policies import *
        >>> import random; random.seed(0); import numpy as np; np.random.seed(0)
        >>> nbArms = 17
        >>> nbPlayers = 6
        >>> s = MCTopMExtraCautious(nbPlayers, nbArms, UCB)
        >>> [ child.choice() for child in s.children ]
        [12, 15, 0, 3, 3, 7]
        """
        super(MCTopMExtraCautious, self).__init__(nbPlayers, nbArms, playerAlgo, withChair=True, pickWorstFirst=True, exitIfWorstWasPicked=True, pickPrevWorstFirst=False, maxRank=maxRank, *args, **kwargs)

    def __str__(self):
        return "MCTopMExtraCautious({} x {})".format(self.nbPlayers, str(self._players[0]))

class MCTopMOld(RandTopM):
    """ MCTopMOld: another proposal for an efficient multi-players learning policy, more "stationary" than RandTopM.

    .. warning:: Still very experimental! But it seems to be one of the most efficient decentralized MP algorithms we have so far... The two other variants of MCTopM seem even better!
    """

    def __init__(self, nbPlayers, nbArms, playerAlgo, maxRank=None, lower=0., amplitude=1., *args, **kwargs):
        """
        - nbPlayers: number of players to create (in self._players).
        - playerAlgo: class to use for every player.
        - nbArms: number of arms, given as first argument to playerAlgo.
        - maxRank: maximum rank allowed by the MCTopMOld child (default to nbPlayers, but for instance if there is 2 × MCTopMOld[UCB] + 2 × MCTopMOld[klUCB], maxRank should be 4 not 2).
        - `*args`, `**kwargs`: arguments, named arguments, given to playerAlgo.

        Example:

        >>> from Policies import *
        >>> import random; random.seed(0); import numpy as np; np.random.seed(0)
        >>> nbArms = 17
        >>> nbPlayers = 6
        >>> s = MCTopMOld(nbPlayers, nbArms, UCB)
        >>> [ child.choice() for child in s.children ]
        [12, 15, 0, 3, 3, 7]
        """
        super(MCTopMOld, self).__init__(nbPlayers, nbArms, playerAlgo, withChair=True, pickWorstFirst=False, exitIfWorstWasPicked=False, pickPrevWorstFirst=False, maxRank=maxRank, *args, **kwargs)

    def __str__(self):
        return "MCTopMOld({} x {})".format(self.nbPlayers, str(self._players[0]))
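
# --- Minimal usage sketch (illustration only, not part of the original module) ---
# The block below is a hedged example of how the children of a MCTopM instance can be
# driven by hand on a toy Bernoulli problem: the arm means, the horizon and the manual
# collision detection are hypothetical choices made for this sketch, and it assumes the
# `Policies` package (providing the `UCB` index policy used in the doctests above) is
# importable.

if __name__ == "__main__":
    from Policies import UCB

    np.random.seed(0)
    means = [0.1, 0.3, 0.5, 0.7, 0.9]  # hypothetical Bernoulli arm means
    nbArms, nbPlayers, horizon = len(means), 2, 1000

    s = MCTopM(nbPlayers, nbArms, UCB)
    for child in s.children:
        child.startGame()

    nb_collisions = 0
    for t in range(horizon):
        # Each child reconsiders its arm, then all transmissions happen simultaneously
        choices = [child.choice() for child in s.children]
        sensing = [float(np.random.random() < means[arm]) for arm in choices]
        for child, arm, reward in zip(s.children, choices, sensing):
            if choices.count(arm) > 1:  # more than one player on this arm: collision
                nb_collisions += 1
                child.handleCollision(arm, reward=reward)  # learn on the sensing, then re-sample
            else:
                child.getReward(arm, reward)
    print("MCTopM({} x UCB) on {} arms: {} collisions in {} steps.".format(nbPlayers, nbArms, nb_collisions, horizon))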