# Source code for Environment.MAB_rotting

"""
author : Julien SEZNEC
Code to launch (rotting) bandit games.
It is written in a functional programming style: each execution returns arrays related to each run.
"""

import time
import numpy as np
import logging
from joblib import Parallel, delayed

# Default number of runs; used as the default for `rep` in repetedRuns.
REPETITIONS = 1000
# Default number of rounds per run; used as the default for `T` in
# repetedRuns and singleRun.
HORIZON = 10000

def repetedRuns(policy, arms, rep = REPETITIONS, T = HORIZON, parallel = True, oracle = False):
    """Play `rep` games of `T` rounds each and stack the per-run results.

    Every repetition is delegated to ``singleRun``; the dictionaries it
    returns are stacked row-wise, one row per repetition.

    Parameters
    ----------
    policy : sequence
        ``(policy_class, policy_kwargs)`` pair, forwarded to ``singleRun``.
    arms : sequence
        Sequence of ``(arm_class, arm_kwargs)`` pairs, forwarded to
        ``singleRun``.
    rep : int
        Number of repetitions.
    T : int
        Number of rounds per repetition.
    parallel : bool or int
        If truthy, runs are dispatched through ``joblib.Parallel`` and the
        value is passed unchanged as ``n_jobs``; if falsy, runs are executed
        in a plain loop.
        NOTE(review): ``True`` is numerically 1, so the default may well use
        a single worker -- confirm whether ``-1`` (all cores) was intended.
    oracle : bool
        Forwarded to ``singleRun``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(rew, noisy_rew, time, pulls, cumul_pulls)``, all of dtype float.
        The first four have shape ``(rep, T)`` and are built from the
        ``'cumul'``, ``'noisy_cumul'``, ``'time'`` and ``'pulls'`` entries of
        each run; ``cumul_pulls`` has shape ``(rep, len(arms))``.
    """
    if parallel:
        res = Parallel(n_jobs=parallel)(
            delayed(singleRun)(policy, arms, T, r, oracle) for r in range(rep)
        )
    else:
        # Forward the repetition index and `oracle` exactly like the parallel
        # branch, so both execution modes run the same experiment.
        res = [singleRun(policy, arms, T, r, oracle) for r in range(rep)]

    def _stack(key, width):
        # dtype=float keeps the float64 result arrays; the explicit reshape
        # keeps the (rep, width) shape even when there are no runs at all.
        return np.array([run[key] for run in res], dtype=float).reshape(rep, width)

    rew = _stack('cumul', T)
    noisy_rew = _stack('noisy_cumul', T)
    times = _stack('time', T)
    pulls = _stack('pulls', T)
    cumul_pulls = _stack('cumul_pulls', len(arms))
    return rew, noisy_rew, times, pulls, cumul_pulls

def singleRun(policy, arms, T = HORIZON,rep_index = 0, oracle=False):
    """Instantiate the arms and the policy, play one game, and return its results.

    Parameters
    ----------
    policy : sequence
        ``(policy_class, policy_kwargs)``; the policy is built as
        ``policy_class(number_of_arms, **policy_kwargs)``.
    arms : sequence
        Sequence of ``(arm_class, arm_kwargs)``; each arm is built as
        ``arm_class(**arm_kwargs)``.
    T : int
        Number of rounds to play.
    rep_index : int
        Index of this repetition; only used in the debug log message.
    oracle : bool
        If true, the freshly built arms are handed to the policy through an
        ``arms`` keyword argument, and ``play`` is run with ``Oracle=True``.

    Returns
    -------
    dict
        ``'cumul'``: cumulative sum of the per-round ``rewards``;
        ``'noisy_cumul'``: the per-round noisy rewards (NOTE: despite the
        key name, no cumulative sum is applied to them);
        ``'time'``, ``'pulls'``, ``'cumul_pulls'``: the corresponding
        entries returned by ``play``, converted to arrays.
    """
    myArms = [arm[0](**arm[1]) for arm in arms]
    # Work on a copy of the keyword arguments: writing 'arms' into the
    # caller's dict would leak this run's arm instances into every later
    # use of the same policy specification.
    policy_kwargs = dict(policy[1])
    if oracle:
        policy_kwargs['arms'] = myArms
    myPolicy = policy[0](len(myArms), **policy_kwargs)
    myPolicy.startGame()
    # Lazy %-formatting: the message is only built if DEBUG is enabled.
    logging.debug('%s %s', rep_index, myPolicy)
    res = play(myArms, myPolicy, T, Oracle=oracle)
    return {
      'cumul': np.array(res['rewards']).cumsum(),
      'noisy_cumul': np.array(res['noisy_rewards']),
      'time' : np.array(res['time']),
      'pulls' : np.array(res['pulls']),
      'cumul_pulls' : np.array(res['cumul_pulls'])
    }


def play(arms, policy, T, Oracle= False):
    """Run one game of `T` rounds between `policy` and `arms`.

    In each round the policy picks an arm via ``policy.choice()``, the arm's
    ``mean`` attribute is read, a noisy reward is drawn from the arm
    (``oracle_draw(t)`` when `Oracle` is true, ``draw(t)`` otherwise), and the
    policy is informed through ``policy.getReward(choice, noisy_reward)``.

    Returns a dict of per-round lists ``'rewards'`` (the ``mean`` values),
    ``'noisy_rewards'``, ``'time'`` (wall-clock seconds spent on the round,
    from before the choice until after ``getReward``) and ``'pulls'`` (the
    chosen indices), plus ``'cumul_pulls'``, the pull count of each arm.
    """
    history = {
        'rewards': [],
        'noisy_rewards': [],
        'time': [],
        'pulls': [],
        'cumul_pulls': [0] * len(arms),
    }
    for step in range(T):
        tic = time.time()
        picked = policy.choice()
        arm = arms[picked]
        # Read the mean before drawing, keeping the value the arm exposes
        # at the moment it is chosen.
        expected = arm.mean
        if Oracle:
            observed = arm.oracle_draw(step)
        else:
            observed = arm.draw(step)
        policy.getReward(picked, observed)
        elapsed = time.time() - tic
        history['time'].append(elapsed)
        history['noisy_rewards'].append(observed)
        history['rewards'].append(expected)
        history['pulls'].append(picked)
        history['cumul_pulls'][picked] += 1
    return history