Source code for Policies.Experimentals.BlackBoxOpt

# -*- coding: utf-8 -*-
r""" An experimental "on-line" policy, using algorithms from black-box Bayesian optimization, using [scikit-optimize](https://scikit-optimize.github.io/).

- It uses an iterative black-box Bayesian optimizer, with two methods, :meth:`ask` and :meth:`tell`, which are used to implement :meth:`choice` and :meth:`getReward` for our Multi-Armed Bandit optimization environment.
- See https://scikit-optimize.github.io/notebooks/ask-and-tell.html for more details, and the sketch right below.
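
A minimal sketch of this ask-and-tell loop, outside of any bandit framework (here ``my_noisy_loss`` is a hypothetical objective function):

.. code-block:: python

    from skopt import Optimizer
    opt = Optimizer([list(range(3))])  # one Categorical dimension: 3 arm indexes
    for _ in range(10):
        x = opt.ask()                  # ask which point (arm index) to evaluate next
        loss = my_noisy_loss(x[0])     # evaluate the (noisy) objective at that point
        opt.tell(x, loss)              # report the observed loss to the optimizer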

.. warning:: This is still **experimental**! It is NOT efficient in terms of storage, and **highly** inefficient against a Bandit problem (i.e., in terms of regret, best arm identification, etc.).
"""
from __future__ import division, print_function  # Python 2 compatibility

__author__ = "Lilian Besson"
__version__ = "0.9"

# WARNING: this is a HUGE hack to fix a mystery bug on importing this policy
from sys import path
from os.path import dirname
path.insert(0, '/'.join(dirname(__file__).split('/')[:-1]))
import numpy as np

# Ignore the UserWarning skopt/optimizer/optimizer.py:208:
# UserWarning: The objective has been evaluated at this point before.
from warnings import filterwarnings
# simplefilter("ignore", UserWarning)
filterwarnings("ignore", message="The objective has been evaluated at this point before", category=UserWarning)

# Cf. https://scikit-optimize.github.io/
try:
    import skopt.learning
    from skopt import Optimizer
except ImportError as e:
    print("ERROR: the 'scikit-optimize' package is mandatory for BlackBoxOpt policy.\nInstall it with 'pip install skopt' or 'sudo pip install skopt' (or maybe 'pip3').")  # WARNING
    raise e

try:
    from .BasePolicy import BasePolicy
except ImportError:
    from BasePolicy import BasePolicy


# --- Default estimator and optimizer


def default_estimator(*args, **kwargs):
    """Default estimator object.

    - Default is :class:`RandomForestRegressor` (https://scikit-optimize.github.io/learning/index.html#skopt.learning.RandomForestRegressor).
    - Another possibility is to use :class:`ExtraTreesRegressor` (https://scikit-optimize.github.io/learning/index.html#skopt.learning.ExtraTreesRegressor), but it is slower!
    - :class:`GaussianProcessRegressor` (https://scikit-optimize.github.io/learning/index.html#skopt.learning.GaussianProcessRegressor) was failing, I don't really know why: I think it is not designed to work with Categorical inputs.
    - Any of the models listed on https://scikit-optimize.github.io/learning/index.html can be used.
    """
    etr = skopt.learning.RandomForestRegressor(*args, **kwargs)
    # etr = skopt.learning.ExtraTreesRegressor(*args, **kwargs)
    # etr = skopt.learning.GaussianProcessRegressor(*args, **kwargs)
    return etr


def default_optimizer(nbArms, est, *args, **kwargs):
    """Default optimizer object.

    - Default is :class:`Optimizer` (https://scikit-optimize.github.io/#skopt.Optimizer).
    """
    opt = Optimizer([
                        list(range(nbArms))  # Categorical dimensions: arm index!
                    ],
                    est(*args, **kwargs),
                    acq_optimizer="sampling",
                    n_random_starts=3 * nbArms  # Sure?
                    )
    return opt
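
# For instance, a minimal sketch of how these two defaults plug together
# (nbArms=3 is an arbitrary choice for illustration):
#
#     opt = default_optimizer(3, default_estimator)
#     arm = int(opt.ask()[0])   # ask for the next arm index to try
#     opt.tell([arm], 0.5)      # feed back a loss for that arm
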
# --- Decision Making Policy

class BlackBoxOpt(BasePolicy):
    r"""Black-box Bayesian optimizer for Multi-Armed Bandit, using a regression model from :mod:`skopt.learning` (by default a random forest, see :func:`default_estimator`).

    - By default, it uses :func:`default_optimizer`.

    .. warning:: This is still **experimental**! It works fine, but it is EXTREMELY SLOW!
    """

    def __init__(self, nbArms,
                 opt=default_optimizer, est=default_estimator,
                 lower=0., amplitude=1.,  # not used, but needed for my framework
                 *args, **kwargs):
        self.nbArms = nbArms  #: Number of arms of the MAB problem.
        self.t = 0  #: Current time.
        # Black-box optimizer
        self._opt = opt  # Store it
        self._est = est  # Store it
        self._args = args  # Other non-kwargs args given to the estimator.
        self._kwargs = kwargs  # Other kwargs given to the estimator.
        self.opt = opt(nbArms, est, *args, **kwargs)  #: The black-box optimizer to use, initialized from the other arguments
        # Other attributes
        self.lower = lower  #: Known lower bound on the rewards.
        self.amplitude = amplitude  #: Known amplitude of the rewards.

    # --- Easy methods

    def __str__(self):
        return "BlackBoxOpt({}, {})".format(self._opt.__name__, self._est.__name__)

    def startGame(self):
        """ Reinitialize the black-box optimizer."""
        self.t = 0
        self.opt = self._opt(self.nbArms, self._est, *self._args, **self._kwargs)  # The black-box optimizer to use, re-initialized from the stored arguments

    def getReward(self, armId, reward):
        """ Store this observation `reward` for that arm `armId`.

        - In fact, :class:`skopt.Optimizer` is a *minimizer*, so `loss = 1 - reward` is stored, to maximize the rewards by minimizing the losses.
        """
        reward = (reward - self.lower) / self.amplitude  # project the reward to [0, 1]
        loss = 1. - reward  # flip it, as the optimizer minimizes
        # print("- A {} policy saw a reward = {} (= loss = {}) from arm = {}...".format(self, reward, loss, armId))  # DEBUG
        return self.opt.tell([armId], loss)
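
    # For instance (a hypothetical numeric example): with lower=-1 and amplitude=2,
    # a raw reward of 0.5 is projected to (0.5 - (-1)) / 2 = 0.75, so the stored
    # loss is 1 - 0.75 = 0.25.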

    def choice(self):
        r""" Choose an arm, according to the black-box optimizer."""
        self.t += 1
        asked = self.opt.ask()  # That's a np.array of int, as we use Categorical input dimension!
        arm = int(np.round(asked[0]))
        # print("- At time t = {}, a {} policy chose to play arm = {}...".format(self.t, self, arm))  # DEBUG
        return arm
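

# --- Example usage

# A minimal usage sketch (a hypothetical 3-armed Bernoulli bandit; the means
# below are made up for illustration):
if __name__ == '__main__':
    means = [0.1, 0.5, 0.9]
    policy = BlackBoxOpt(len(means))
    policy.startGame()
    for t in range(50):
        arm = policy.choice()                            # ask the optimizer for an arm
        reward = float(np.random.random() < means[arm])  # Bernoulli draw of the reward
        policy.getReward(arm, reward)                    # tell the optimizer the observed reward
    print("After 50 steps, the policy {} last chose arm {}.".format(policy, arm))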