# -*- coding: utf-8 -*-
r""" RandTopMEstEst: four proposals for an efficient multi-players learning policy. :class:`RandTopMEstEst` and :class:`MCTopMEstEst` are the two main algorithms, with variants (see below).
- Each child player is selfish, and plays according to an index policy (any index policy, e.g., UCB, Thompson, KL-UCB, BayesUCB etc),
- But instead of aiming at the best (the 1-st best) arm, player i constantly aims at *one* of the M best arms (denoted :math:`\hat{M}^j(t)`, according to its index policy of indexes :math:`g^j_k(t)` (where M is the number of players),
- When a collision occurs or when the currently chosen arm lies outside of the current estimate of the set M-best, a new current arm is chosen.
- The (fixed) number of players is learned on the run.
.. note:: This is **fully decentralized**: player do not need to know the (fixed) number of players!
- Reference: [[Multi-Player Bandits Revisited, Lilian Besson and Emilie Kaufmann, 2017]](https://hal.inria.fr/hal-01629733)
.. warning:: This is still very experimental!
.. note:: For a more generic approach, see the wrapper defined in :class:`EstimateM.EstimateM`.
"""
from __future__ import division, print_function # Python 2 compatibility
__author__ = "Lilian Besson"
__version__ = "0.8"
import numpy as np
try:
from .BaseMPPolicy import BaseMPPolicy
from .ChildPointer import ChildPointer
from .RandTopM import oneRandTopM
from .EstimateM import threshold_on_t_with_horizon, threshold_on_t_doubling_trick, threshold_on_t
except ImportError:
from BaseMPPolicy import BaseMPPolicy
from ChildPointer import ChildPointer
from RandTopM import oneRandTopM
from EstimateM import threshold_on_t_with_horizon, threshold_on_t_doubling_trick, threshold_on_t
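# --- A minimal standalone sketch (for illustration only, it is not used by the classes below)
# of the estimation rule implemented in oneRandTopMEst.handleCollision: collisions observed
# on the currently estimated M-best arms are accumulated, and the estimate of the number of
# players is increased by one as soon as this count exceeds the chosen threshold function.
# All the argument names below are illustrative only.
def _sketch_of_estimate_update(nbPlayersEstimate, nbArms, collisionCount, currentBest, t, threshold, horizon=None):
    """Return the (possibly increased) estimate of the number of players, resetting the collision counts if it increased."""
    if nbPlayersEstimate < nbArms:  # no need to estimate beyond K players
        if np.sum(collisionCount[currentBest]) > threshold(t, nbPlayersEstimate, horizon):
            nbPlayersEstimate = min(nbPlayersEstimate + 1, nbArms)
            collisionCount.fill(0)  # restart the collision counts after each increase
    return nbPlayersEstimate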
class oneRandTopMEst(oneRandTopM):
r""" Class that acts as a child policy, but in fact it passes all its method calls to the mother class, which forwards them to its i-th player.
- The procedure to estimate :math:`\hat{M}_i(t)` is not so simple: every player starts with :math:`\hat{M}_i(0) = 1`, and increases its estimate, :math:`\hat{M}_i(t+1) = \hat{M}_i(t) + 1`, as soon as the number of collisions observed on its currently estimated :math:`\hat{M}_i(t)` best arms exceeds a threshold function (a minimal sketch of this update rule is given just before this class).
"""
def __init__(self, threshold, *args, **kwargs):
if 'horizon' in kwargs:
self.horizon = kwargs['horizon']
del kwargs['horizon']
else:
self.horizon = None
super(oneRandTopMEst, self).__init__(*args, **kwargs)
# Parameters
self.maxRank = 1
self.threshold = threshold #: Threshold function
# Internal variables
self.nbPlayersEstimate = 1 #: Number of players. Optimistic: start by assuming it is alone!
self.collisionCount = np.zeros(self.nbArms, dtype=int) #: Count collisions on each arm, since last increase of nbPlayersEstimate
self.timeSinceLastCollision = 0 #: Time since the last collision (kept only for debugging, it is not used by the algorithm).
self.t = 0 #: Internal time
def __str__(self):  # Better to recompute it automatically
player = self.mother._players[self.playerId]
Mbest_is_incorrect = self.t < self.nbArms or np.any(np.isinf(player.index)) or np.any(np.isnan(player.index))
str_Mbest = "" if Mbest_is_incorrect else r", $M$-best: ${}$".format(list(self.Mbest()))
str_chosen_arm = r", arm: ${}$".format(self.chosen_arm) if self.chosen_arm is not None else ""
# WARNING remember to change here when defining a new variant of RandTopM
return r"#{}<{}TopM{}{}-Est{}-{}{}{}{}>".format(self.playerId + 1, "MC" if self._withChair else "Rand", ("ExtraCautious" if self._exitIfWorstWasPicked else "Cautious") if self._pickWorstFirst else "", "" if self._pickPrevWorstFirst else "Old", "Plus" if self.horizon else "", player, str_Mbest, str_chosen_arm, "($T={}$)".format(self.horizon) if self.horizon else "")
def startGame(self):
"""Start game."""
super(oneRandTopMEst, self).startGame()
# Est part
self.nbPlayersEstimate = self.maxRank = 1 # Optimistic: start by assuming it is alone!
self.collisionCount.fill(0)
self.timeSinceLastCollision = 0
self.t = 0
def handleCollision(self, arm, reward=None):
"""Select a new rank, and maybe update nbPlayersEstimate."""
super(oneRandTopMEst, self).handleCollision(arm, reward=reward)
# we can be smart, and stop all this as soon as M = K !
if self.nbPlayersEstimate < self.nbArms:
self.collisionCount[arm] += 1
# print("\n - A oneRandTopMEst player {} saw a collision on {}, since last update of nbPlayersEstimate = {} it is the {} th collision on that arm {}...".format(self, arm, self.nbPlayersEstimate, self.collisionCount[arm], arm)) # DEBUG
# Then, estimate the current ranking of the arms and the set of the M best arms
currentBest = self.estimatedBestArms(self.nbPlayersEstimate)
# print("Current estimation of the {} best arms is {} ...".format(self.nbPlayersEstimate, currentBest)) # DEBUG
collisionCount_on_currentBest = np.sum(self.collisionCount[currentBest])
# print("Current count of collision on the {} best arms is {} ...".format(self.nbPlayersEstimate, collisionCount_on_currentBest)) # DEBUG
# And finally, compare the collision count with the current threshold
threshold = self.threshold(self.t, self.nbPlayersEstimate, self.horizon)
# print("Using timeSinceLastCollision = {}, and t = {}, threshold = {:.3g} ...".format(self.timeSinceLastCollision, self.t, threshold))
if collisionCount_on_currentBest > threshold:
self.nbPlayersEstimate = self.maxRank = min(1 + self.nbPlayersEstimate, self.nbArms)
# print("The collision count {} was larger than the threshold {:.3g} se we restart the collision count, and increase the nbPlayersEstimate to {}.".format(collisionCount_on_currentBest, threshold, self.nbPlayersEstimate)) # DEBUG
self.collisionCount.fill(0)
# Finally, restart timeSinceLastCollision
self.timeSinceLastCollision = 0
def getReward(self, arm, reward):
"""One transmission without collision."""
self.t += 1
# Obtaining a reward, even 0, means no collision on that arm for this time
# So, first, we count one more step without collision
self.timeSinceLastCollision += 1
# print("Time since last collision = {} ...".format(self.timeSinceLastCollision)) # DEBUG
# Then use the reward for the arm learning algorithm
return super(oneRandTopMEst, self).getReward(arm, reward)
# --- Class RandTopMEst
#: Whether or not to use the variant with the "chair": after using an arm successfully (no collision), a player does not move on future collisions (she assumes the others will move). But she will still change her chosen arm if it lies outside of the estimated M-best set. :class:`RandTopMEst` (and variants) uses `False` and :class:`MCTopMEst` (and variants) uses `True`.
WITH_CHAIR = True
WITH_CHAIR = False
#: XXX First experimental idea: when the currently chosen arm lies outside of the estimated Mbest set, force to first try (at least once) the arm with lowest UCB indexes in this Mbest_j(t) set. Used by :class:`RandTopMEstCautious` and :class:`RandTopMEstExtraCautious`, and by :class:`MCTopMEstCautious` and :class:`MCTopMEstExtraCautious`.
OPTIM_PICK_WORST_FIRST = True
OPTIM_PICK_WORST_FIRST = False
#: XXX Second experimental idea: when the currently chosen arm becomes the worst of the estimated Mbest set, leave it (even before it lies outside of Mbest_j(t)). Used by :class:`RandTopMEstExtraCautious` and :class:`MCTopMEstExtraCautious`.
OPTIM_EXIT_IF_WORST_WAS_PICKED = True
OPTIM_EXIT_IF_WORST_WAS_PICKED = False
#: XXX Third experimental idea, controlling how the new arm is chosen when the player has to leave its current arm (the exact rule is implemented in :class:`RandTopM.oneRandTopM`). **Default now!** `False` only for :class:`RandTopMEstOld` and :class:`MCTopMEstOld`.
OPTIM_PICK_PREV_WORST_FIRST = False
OPTIM_PICK_PREV_WORST_FIRST = True
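# --- How these flags map to the variants defined below:
# - :class:`RandTopMEst` and :class:`RandTopMEstPlus` use the module-level defaults above (no "chair"),
# - :class:`MCTopMEst` and :class:`MCTopMEstPlus` force ``withChair=True`` (and keep ``pickPrevWorstFirst=True``),
# - the "Plus" variants additionally require the horizon and use :func:`EstimateM.threshold_on_t_with_horizon`,
# - the "Old" variants (:class:`RandTopMEstOld`, :class:`MCTopMEstOld`) are the ones with ``pickPrevWorstFirst=False``.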
class RandTopMEst(BaseMPPolicy):
""" RandTopMEst: a proposal for an efficient multi-players learning policy, with no prior knowledge of the number of players.
"""
def __init__(self, nbPlayers, nbArms, playerAlgo,
withChair=WITH_CHAIR,
pickWorstFirst=OPTIM_PICK_WORST_FIRST,
exitIfWorstWasPicked=OPTIM_EXIT_IF_WORST_WAS_PICKED,
pickPrevWorstFirst=OPTIM_PICK_PREV_WORST_FIRST,
threshold=threshold_on_t_doubling_trick, lower=0., amplitude=1.,
*args, **kwargs):
"""
- nbPlayers: number of players to create (in self._players).
- playerAlgo: class to use for every player.
- nbArms: number of arms, given as first argument to playerAlgo.
- withChair: see ``WITH_CHAIR``,
- pickWorstFirst: see ``OPTIM_PICK_WORST_FIRST``,
- exitIfWorstWasPicked: see ``OPTIM_EXIT_IF_WORST_WAS_PICKED``,
- pickPrevWorstFirst: see ``OPTIM_PICK_PREV_WORST_FIRST``,
- threshold: the threshold function to use, see :func:`EstimateM.threshold_on_t_with_horizon`, :func:`EstimateM.threshold_on_t_doubling_trick` or :func:`EstimateM.threshold_on_t` above.
- `*args`, `**kwargs`: arguments, named arguments, given to playerAlgo.
Example:
>>> from Policies import *
>>> import random; random.seed(0); import numpy as np; np.random.seed(0)
>>> nbArms = 17
>>> nbPlayers = 6
>>> s = RandTopMEst(nbPlayers, nbArms, UCB)
>>> [ child.choice() for child in s.children ]
[12, 15, 0, 3, 3, 7]
- To get a list of usable players, use ``s.children``.
.. warning:: ``s._players`` is for internal use ONLY!
"""
assert nbPlayers > 0, "Error, the parameter 'nbPlayers' for RandTopMEst class has to be > 0." # DEBUG
self.nbPlayers = nbPlayers #: Number of players
self.withChair = withChair #: Using a chair ?
self.pickWorstFirst = pickWorstFirst #: Using first optimization ?
self.exitIfWorstWasPicked = exitIfWorstWasPicked #: Using second optimization ?
self.pickPrevWorstFirst = pickPrevWorstFirst #: Using third optimization ? Default to yes now.
self._players = [None] * nbPlayers
self.children = [None] * nbPlayers #: List of children, fake algorithms
self.nbArms = nbArms #: Number of arms
fake_maxRank = None
for playerId in range(nbPlayers):
self._players[playerId] = playerAlgo(nbArms, *args, **kwargs)
self.children[playerId] = oneRandTopMEst(threshold, fake_maxRank, withChair, pickWorstFirst, exitIfWorstWasPicked, pickPrevWorstFirst, self, playerId)
def __str__(self):
return "RandTopMEst({} x {})".format(self.nbPlayers, str(self._players[0]))
class RandTopMEstPlus(BaseMPPolicy):
""" RandTopMEstPlus: a proposal for an efficient multi-players learning policy, with no prior knowledge of the number of players, but using the knowledge of the horizon :math:`T`.
"""
def __init__(self, nbPlayers, nbArms, playerAlgo, horizon,
withChair=WITH_CHAIR,
pickWorstFirst=OPTIM_PICK_WORST_FIRST,
exitIfWorstWasPicked=OPTIM_EXIT_IF_WORST_WAS_PICKED,
pickPrevWorstFirst=OPTIM_PICK_PREV_WORST_FIRST,
lower=0., amplitude=1.,
*args, **kwargs):
"""
- nbPlayers: number of players to create (in self._players).
- playerAlgo: class to use for every player.
- nbArms: number of arms, given as first argument to playerAlgo.
- horizon: need to know the horizon :math:`T`.
- withChair: see ``WITH_CHAIR``,
- pickWorstFirst: see ``OPTIM_PICK_WORST_FIRST``,
- exitIfWorstWasPicked: see ``OPTIM_EXIT_IF_WORST_WAS_PICKED``,
- pickPrevWorstFirst: see ``OPTIM_PICK_PREV_WORST_FIRST``,
- the threshold function is fixed to :func:`EstimateM.threshold_on_t_with_horizon` (see above), applied with the given horizon.
- `*args`, `**kwargs`: arguments, named arguments, given to playerAlgo.
Example:
>>> from Policies import *
>>> import random; random.seed(0); import numpy as np; np.random.seed(0)
>>> nbArms = 17
>>> nbPlayers = 6
>>> horizon = 1000
>>> s = RandTopMEstPlus(nbPlayers, nbArms, UCB, horizon)
>>> [ child.choice() for child in s.children ]
[12, 15, 0, 3, 3, 7]
- To get a list of usable players, use ``s.children``.
.. warning:: ``s._players`` is for internal use ONLY!
"""
assert nbPlayers > 0, "Error, the parameter 'nbPlayers' for RandTopMEstPlus class has to be > 0." # DEBUG
self.nbPlayers = nbPlayers #: Number of players
self.withChair = withChair #: Using a chair ?
self.pickWorstFirst = pickWorstFirst #: Using first optimization ?
self.exitIfWorstWasPicked = exitIfWorstWasPicked #: Using second optimization ?
self.pickPrevWorstFirst = pickPrevWorstFirst #: Using third optimization ? Default to yes now.
self._players = [None] * nbPlayers
self.children = [None] * nbPlayers #: List of children, fake algorithms
self.nbArms = nbArms #: Number of arms
fake_maxRank = None
for playerId in range(nbPlayers):
self._players[playerId] = playerAlgo(nbArms, *args, **kwargs)
self.children[playerId] = oneRandTopMEst(threshold_on_t_with_horizon, fake_maxRank, withChair, pickWorstFirst, exitIfWorstWasPicked, pickPrevWorstFirst, self, playerId, horizon=horizon)
def __str__(self):
return "RandTopMEstPlus({} x {})".format(self.nbPlayers, str(self._players[0]))
class MCTopMEst(RandTopMEst):
""" MCTopMEst: another proposal for an efficient multi-players learning policy, more "stationary" than RandTopMEst.
.. warning:: Still very experimental! But it seems to be the most efficient decentralized MP algorithm we have so far...
"""
def __init__(self, nbPlayers, nbArms, playerAlgo,
lower=0., amplitude=1.,
*args, **kwargs):
"""
- nbPlayers: number of players to create (in self._players).
- playerAlgo: class to use for every player.
- nbArms: number of arms, given as first argument to playerAlgo.
- `*args`, `**kwargs`: arguments, named arguments, given to playerAlgo.
"""
super(MCTopMEst, self).__init__(nbPlayers, nbArms, playerAlgo, withChair=True, pickWorstFirst=False, exitIfWorstWasPicked=False, pickPrevWorstFirst=True, *args, **kwargs)
def __str__(self):
return "MCTopMEst({} x {})".format(self.nbPlayers, str(self._players[0]))
class MCTopMEstPlus(RandTopMEstPlus):
""" MCTopMEstPlus: another proposal for an efficient multi-players learning policy, more "stationary" than RandTopMEstPlus.
.. warning:: Still very experimental! But it seems to be the most efficient decentralized MP algorithm we have so far...
"""
def __init__(self, nbPlayers, nbArms, playerAlgo, horizon,
lower=0., amplitude=1.,
*args, **kwargs):
"""
- nbPlayers: number of players to create (in self._players).
- playerAlgo: class to use for every player.
- nbArms: number of arms, given as first argument to playerAlgo.
- `*args`, `**kwargs`: arguments, named arguments, given to playerAlgo.
"""
super(MCTopMEstPlus, self).__init__(nbPlayers, nbArms, playerAlgo, horizon, withChair=True, pickWorstFirst=False, exitIfWorstWasPicked=False, pickPrevWorstFirst=True, *args, **kwargs)
def __str__(self):
return "MCTopMEstPlus({} x {})".format(self.nbPlayers, str(self._players[0]))