import numpy as np
import random
import math
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import matplotlib.image as mpimg
from IPython import display
from IPython.display import HTML
random.seed(0)
# Choix du nombre machine à sous
K = 3

# Génération des paramètres des lois de Bernoulli
# p = np.random.uniform(0,1,K)

# Ou définition du problème de bandit à K bras que l'on souhaire résoudre
p = np.array([0.5,0.2,0.6])
print(p)

# Nombre de coups à jouer 
T = 100

# Matrice qui contiendra les gains de chaque machine au cours des coups joués
G = np.zeros([K, T])

# Vecteur qui contiendra les numéros des machines jouées 
# (par convention python la première est machine est numérotéé 0)
It = -1*np.ones(T)

# pour chaque coup
for t in range(T):
    # la statisticienne choisit la machine à sous it de manière aléatoire 
    it = math.floor(random.uniform(0,K))
    # it est stocké dans le vecteur It
    It[t] = it 
    # elle reçoit le gain gt : il est tiré selon la loi de Bernoulli de paramètre p[it]
    gt = random.binomialvariate(1,p[it])
    # pour chaque machine à sous k
    for k in range(K):
        # si k est la machiné jouée, alors on met à jour le gain associé à la machine k
        if k == it:
            G[k,t] = G[k,t-1] + gt
        # si k n'est pas la machiné jouée, le gain associé à la machine k reste le même
        else:
            G[k,t] = G[k,t-1]

# Output of print(p): [0.5 0.2 0.6]

# Images used by the animations below:
#   let   - drawn on every machine that is not played in the frame
#   pick  - drawn on the machine that is played in the frame
#   white - drawn on every subplot of the closing frame
#   end   - drawn on the middle subplot of the closing frame
let, pick, white, end = (
    mpimg.imread(sprite_file)
    for sprite_file in ("let.png", "pick.png", "white.png", "end.png")
)

def JouerBandit(frame):
    """Draw one frame of the random-strategy animation.

    For ``frame`` in ``0..T-1`` each subplot shows one machine: the ``pick``
    image on the machine played at that round (``It[frame]``), the ``let``
    image on the others, and the machine's cumulative gain ``G[k, frame]``
    as the title. ``frame == T`` is the closing screen: blank subplots with
    the ``end`` image and the total gain on the middle one.

    Reads the module-level ``axs``, ``K``, ``T``, ``It``, ``G`` and the
    images ``let``, ``pick``, ``white``, ``end``.
    """
    if frame == T:
        for k in range(K):
            # Remove what earlier frames drew; imshow() only adds artists,
            # so without this every frame would stack on the previous ones.
            axs[k].clear()
            axs[k].imshow(white)
            axs[k].axis("off")
            axs[k].set_title("", y=-K/10)
        middle = axs[math.floor(K/2)]
        middle.imshow(end)
        middle.axis("off")
        # Total gain = sum over machines of the last cumulative gain.
        middle.set_title("Gain = %i" % np.sum(G[:, T-1]), y=-K/10,
                         fontsize=20, color="#01A9B1")
    else:
        it = It[frame]
        for k in range(K):
            axs[k].clear()
            axs[k].imshow(pick if k == it else let)
            axs[k].axis("off")
            # Negative y puts the title underneath the image.
            axs[k].set_title("%i" % G[k, frame], y=-K/10)
   
# One subplot per machine; JouerBandit draws into `axs`.
f, axs = plt.subplots(nrows=1, ncols=K)
# T game frames plus one closing frame, 300 ms apart.
anim_created = animation.FuncAnimation(
    fig=f,
    func=JouerBandit,
    frames=T + 1,
    interval=300,
)
# Render the animation as an interactive HTML/JavaScript player and show it.
video = anim_created.to_jshtml()
html = HTML(video)
display.display(html)
# Close the figure so that only the player is displayed, not a static plot.
plt.close()

# Explore then commit

# G_etc[k, t]: cumulative gain collected from machine k once round t has been played
G_etc = np.zeros([K, T])

# It_etc[t]: index of the machine played at round t; -1 until the round is played
It_etc = -1*np.ones(T)

# N[k]: number of times machine k has been played
N = np.zeros([K])

# Fraction of the T rounds devoted to exploration
prop_explore = 0.2

# Number of exploration rounds given to each machine
T_explore = math.floor(prop_explore*T/(K))

for t in range(T):
    if t < K*T_explore:
        # Explore: machines are played in turn, T_explore consecutive rounds each.
        it = t // T_explore
    else:
        # Commit: keep playing the machine with the largest cumulative gain at
        # the end of the exploration phase. If T_explore is 0 the column read
        # here is still all zeros, so machine 0 is chosen.
        it = int(np.argmax(G_etc[:, K*T_explore - 1]))

    # Bernoulli(p[it]) reward. float(): hand the stdlib sampler a builtin
    # float rather than a NumPy scalar (NumPy deprecates the boolean-to-index
    # conversion the sampler would otherwise perform).
    gt = random.binomialvariate(1, float(p[it]))
    It_etc[t] = it

    # Carry the running totals forward; round 0 starts from zeros.
    if t > 0:
        G_etc[:, t] = G_etc[:, t - 1]
    G_etc[it, t] += gt
    N[it] += 1
            
def JouerBanditETC(frame):
    """Draw one frame of the explore-then-commit animation.

    For ``frame`` in ``0..T-1`` each subplot shows one machine: the ``pick``
    image on the machine played at that round (``It_etc[frame]``), the
    ``let`` image on the others, and the machine's cumulative gain
    ``G_etc[k, frame]`` as the title. ``frame == T`` is the closing screen:
    blank subplots with the ``end`` image and the total gain on the middle
    one.

    Reads the module-level ``axs``, ``K``, ``T``, ``It_etc``, ``G_etc`` and
    the images ``let``, ``pick``, ``white``, ``end``.
    """
    if frame == T:
        for k in range(K):
            # Remove what earlier frames drew; imshow() only adds artists,
            # so without this every frame would stack on the previous ones.
            axs[k].clear()
            axs[k].imshow(white)
            axs[k].axis("off")
            axs[k].set_title("", y=-K/10)
        middle = axs[math.floor(K/2)]
        middle.imshow(end)
        middle.axis("off")
        # Total gain = sum over machines of the last cumulative gain.
        middle.set_title("Gain = %i" % np.sum(G_etc[:, T-1]), y=-K/10,
                         fontsize=20, color="#01A9B1")
    else:
        it = It_etc[frame]
        for k in range(K):
            axs[k].clear()
            axs[k].imshow(pick if k == it else let)
            axs[k].axis("off")
            # Negative y puts the title underneath the image.
            axs[k].set_title("%i" % G_etc[k, frame], y=-K/10)

# One subplot per machine; JouerBanditETC draws into `axs`.
f, axs = plt.subplots(nrows=1, ncols=K)
# T game frames plus one closing frame, 300 ms apart.
anim_created = animation.FuncAnimation(
    fig=f,
    func=JouerBanditETC,
    frames=T + 1,
    interval=300,
)
# Render the animation as an interactive HTML/JavaScript player and show it.
video = anim_created.to_jshtml()
html = HTML(video)
display.display(html)
# Close the figure so that only the player is displayed, not a static plot.
plt.close()

# Upper confidence bound (UCB)

# G_ucb[k, t]: cumulative gain collected from machine k once round t has been played
G_ucb = np.zeros([K, T])
# It_ucb[t]: index of the machine played at round t; -1 until the round is played
It_ucb = -1*np.ones(T)
# ucb[k, t]: upper confidence bound of machine k once round t has been played
ucb = np.zeros([K, T])
# N[k, t]: number of times machine k has been played up to and including round t
N = np.zeros([K,T])

for t in range(T):
    if t < K:
        # Initialisation: play every machine once, in order.
        it = t
    else:
        # Then play the machine whose bound was the largest after the previous round.
        it = int(np.argmax(ucb[:, t - 1]))

    # Bernoulli(p[it]) reward. float(): hand the stdlib sampler a builtin
    # float rather than a NumPy scalar (NumPy deprecates the boolean-to-index
    # conversion the sampler would otherwise perform).
    gt = random.binomialvariate(1, float(p[it]))
    It_ucb[t] = it

    # Carry gains and play counts forward; round 0 starts from zeros.
    if t > 0:
        G_ucb[:, t] = G_ucb[:, t - 1]
        N[:, t] = N[:, t - 1]
    G_ucb[it, t] += gt
    N[it, t] += 1

    # Bound = empirical mean + sqrt(2 log(t + 1) / number of plays).
    # The play count is floored at 1 so that machines not played yet do not
    # cause a division by zero.
    pulls = np.maximum(N[:, t], 1)
    ucb[:, t] = G_ucb[:, t]/pulls + np.sqrt(2*math.log(t + 1)/pulls)

def PlotUCB(t):
    """Draw the confidence interval of every machine after round ``t``.

    For each machine k a vertical segment at ``x = k`` spans
    ``mean - radius`` to ``mean + radius``, where ``mean`` is the empirical
    mean gain ``G_ucb[k, t] / N[k, t]`` and ``radius`` is
    ``sqrt(2 log(max(t + 1, 2)) / N[k, t])`` (play counts floored at 1).
    A circle marks the empirical mean and a black star the true parameter
    ``p[k]``. The machine played at round ``t`` is drawn in a different
    colour.

    Reads the module-level ``K``, ``It_ucb``, ``G_ucb``, ``N`` and ``p`` and
    draws on the current pyplot figure.
    """
    ylm = -0.3
    ylM = 3
    # Start from an empty figure for every frame.
    plt.clf()
    plt.ylim((ylm, ylM))
    plt.xlim((-0.4, K - 0.6))
    it = It_ucb[t]
    for k in range(K):
        # Highlight the machine played at this round.
        c = "#01A9B1" if k == it else '#22509D'

        pulls = max(N[k, t], 1)
        mean = G_ucb[k, t]/pulls
        radius = math.sqrt(2*math.log(max(t + 1, 2))/pulls)

        # axvline expects ymin/ymax as fractions of the axes height (0 = bottom,
        # 1 = top), so data values are mapped through (y - ylm) / (ylM - ylm).
        plt.axvline(x=k,
                    ymin=(mean - radius - ylm)/(ylM - ylm),
                    ymax=(mean + radius - ylm)/(ylM - ylm),
                    color=c, label="Machine %i" % (k + 1),
                    marker='x', markersize=7)
        plt.plot(k, mean, c, marker='o')
        plt.plot(k, p[k], 'k', marker='*', markersize=7)
    # Hide the x ticks and their labels: positions are just machine indices.
    plt.tick_params(
        axis='x',          # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        bottom=False,      # ticks along the bottom edge are off
        top=False,         # ticks along the top edge are off
        labelbottom=False)


# Single-axes figure; PlotUCB redraws it for every round.
f, ax1 = plt.subplots(nrows=1, ncols=1)
# One frame per round, 300 ms apart.
anim_created = animation.FuncAnimation(
    fig=f,
    func=PlotUCB,
    frames=T,
    interval=300,
)
# Render the animation as an interactive HTML/JavaScript player and show it.
video = anim_created.to_jshtml()
html = HTML(video)
display.display(html)
# Close the figure so that only the player is displayed, not a static plot.
plt.close()

def JouerBanditUCB(frame):
    """Draw one frame of the UCB-strategy animation.

    For ``frame`` in ``0..T-1`` each subplot shows one machine: the ``pick``
    image on the machine played at that round (``It_ucb[frame]``), the
    ``let`` image on the others, and the machine's cumulative gain
    ``G_ucb[k, frame]`` as the title. ``frame == T`` is the closing screen:
    blank subplots with the ``end`` image and the total gain on the middle
    one.

    Reads the module-level ``axs``, ``K``, ``T``, ``It_ucb``, ``G_ucb`` and
    the images ``let``, ``pick``, ``white``, ``end``.
    """
    if frame == T:
        for k in range(K):
            # Remove what earlier frames drew; imshow() only adds artists,
            # so without this every frame would stack on the previous ones.
            axs[k].clear()
            axs[k].imshow(white)
            axs[k].axis("off")
            axs[k].set_title("", y=-K/10)
        middle = axs[math.floor(K/2)]
        middle.imshow(end)
        middle.axis("off")
        # Total gain = sum over machines of the last cumulative gain.
        middle.set_title("Gain = %i" % np.sum(G_ucb[:, T-1]), y=-K/10,
                         fontsize=20, color="#01A9B1")
    else:
        it = It_ucb[frame]
        for k in range(K):
            axs[k].clear()
            axs[k].imshow(pick if k == it else let)
            axs[k].axis("off")
            # Negative y puts the title underneath the image.
            axs[k].set_title("%i" % G_ucb[k, frame], y=-K/10)

# One subplot per machine; JouerBanditUCB draws into `axs`.
f, axs = plt.subplots(nrows=1, ncols=K)
# T game frames plus one closing frame, 300 ms apart.
anim_created = animation.FuncAnimation(
    fig=f,
    func=JouerBanditUCB,
    frames=T + 1,
    interval=300,
)
# Render the animation as an interactive HTML/JavaScript player and show it.
video = anim_created.to_jshtml()
html = HTML(video)
display.display(html)
# Close the figure so that only the player is displayed, not a static plot.
plt.close()

# Number of independent simulations
n = 200
# Number of exploration rounds given to each machine by explore-then-commit
T_explore = math.floor(prop_explore*T/(K))

# res_*[i, t]: total cumulative gain of the strategy after round t of simulation i
res_rd = np.zeros([n, T])
res_etc = np.zeros([n, T])
res_ucb = np.zeros([n, T])

for i in range(n):

    # Reward table shared by the three strategies: G[k, t] is what machine k
    # pays at round t. (This rebinds the module-level name G.)
    G = np.zeros([K, T])
    for k in range(K):
        # Builtin float for the stdlib sampler (NumPy deprecates the
        # boolean-to-index conversion a NumPy scalar would trigger).
        pk = float(p[k])
        for t in range(T):
            G[k, t] = random.binomialvariate(1, pk)

    # Per-strategy cumulative gains: G_x[k, t] is the gain collected from
    # machine k once round t has been played.
    G_rd = np.zeros([K, T])
    G_etc = np.zeros([K, T])
    G_ucb = np.zeros([K, T])
    # ucb[k, t]: upper confidence bound of machine k after round t
    ucb = np.zeros([K, T])
    # N_ucb[k]: number of times UCB has played machine k so far
    N_ucb = np.zeros([K])

    for t in range(T):

        # --- Choice of the machine ---
        # Random: uniform over the K machines.
        it_rd = random.randrange(K)
        # Explore then commit: machines in turn during exploration, then the
        # one with the largest cumulative gain at the end of exploration.
        if t < K*T_explore:
            it_etc = t // T_explore
        else:
            it_etc = int(np.argmax(G_etc[:, K*T_explore - 1]))
        # UCB: every machine once, then the largest bound of the previous round.
        if t < K:
            it_ucb = t
        else:
            it_ucb = int(np.argmax(ucb[:, t - 1]))

        # --- Update of the gains ---
        # Carry the running totals forward; round 0 starts from zeros.
        if t > 0:
            G_rd[:, t] = G_rd[:, t - 1]
            G_etc[:, t] = G_etc[:, t - 1]
            G_ucb[:, t] = G_ucb[:, t - 1]
        # Each strategy collects what its chosen machine pays this round.
        G_rd[it_rd, t] += G[it_rd, t]
        G_etc[it_etc, t] += G[it_etc, t]
        G_ucb[it_ucb, t] += G[it_ucb, t]
        N_ucb[it_ucb] += 1

        # Bound = empirical mean + sqrt(2 log(t + 1) / number of plays), with
        # the play count floored at 1 to avoid dividing by zero.
        pulls = np.maximum(N_ucb, 1)
        ucb[:, t] = G_ucb[:, t]/pulls + np.sqrt(2*math.log(t + 1)/pulls)

    # Total gain over all machines, round by round.
    res_rd[i, :] = np.sum(G_rd, axis=0)
    res_etc[i, :] = np.sum(G_etc, axis=0)
    res_ucb[i, :] = np.sum(G_ucb, axis=0)

# Column t of the result matrices holds the gain after t + 1 rounds, so the
# abscissa is the number of rounds played: 1, 2, ..., T.
x = np.arange(1, T + 1)

# Mean cumulative gain over the simulations, round by round
y_rd = np.mean(res_rd, axis=0)
y_etc = np.mean(res_etc, axis=0)
y_ucb = np.mean(res_ucb, axis=0)

# Standard deviation over the simulations, round by round
sd_rd = np.std(res_rd, axis=0)
sd_etc = np.std(res_etc, axis=0)
sd_ucb = np.std(res_ucb, axis=0)

fig, ax = plt.subplots()
# One curve per strategy, with a band of one standard deviation on each side.
for mean_gain, sd_gain, label, colour in (
    (y_rd, sd_rd, "Aléatoire", "#223F6A"),
    (y_etc, sd_etc, "Explore then commit", "#5FCC9E"),
    (y_ucb, sd_ucb, "Upper Confidence Bound", "#326DC0"),
):
    ax.plot(x, mean_gain, label=label, color=colour)
    ax.fill_between(x, mean_gain + sd_gain, mean_gain - sd_gain,
                    color=colour, alpha=0.5)
ax.set_xlabel('Itérations')
ax.set_ylabel('Gain cumulé')
ax.legend()
plt.show()

Problème du bandit à $K$ bras

Simulation du problème du bandit à $K$ bras

Formalisation mathématique :

Simulation du problème :

Illustration du problème avec une animation :

Stratégie Explore then commit

Stratégie Upper confidence bound

Comparaison entre les trois stratégies (Aléatoire, Explore then commit, Upper confidence bound)