I am interested in reinforcement learning.

It is difficult for me. @_@

I tried to implement a very simple and famous problem called the ‘multi-armed bandit’.

Image from wikipedia..

The multi-armed bandit problem is a problem in which a fixed limited set of resources must be allocated between competing (alternative) choices in a way that maximizes their expected gain, when each choice’s properties are only partially known at the time of allocation, and may become better understood as time passes or by allocating resources to the choice.

The following code is sample code for the problem. I tried random, greedy, epsilon-greedy and UCB1.

The epsilon-greedy algorithm has a parameter called epsilon that controls whether the selection method is random or greedy.

The greedy algorithm is sometimes misled because it often falls into a local optimum.

My results indicated that epsilon-greedy was best. ??? Did I do something wrong?

I need to check my code…..

import numpy as np
import random
ARM_PROB = [0.2, 0.3, 0.4, 0.5]


class MultiArmedbandit(object):
    """Bookkeeping for one run of a k-armed Bernoulli bandit.

    Attributes:
        n_try: number of pulls the experiment will perform.
        k_arm: number of arms.
        counts: pulls made on each arm.
        values: cumulative reward collected on each arm.
        rewards: running mean reward per arm (values / counts).
    """

    def __init__(self, n_try, k_arm=len(ARM_PROB)):
        self.n_try = n_try
        self.k_arm = k_arm
        # Three parallel per-arm statistics, all float32 and zero-initialized.
        self.counts = np.zeros(k_arm, dtype=np.float32)
        self.values = np.zeros(k_arm, dtype=np.float32)
        self.rewards = np.zeros(k_arm, dtype=np.float32)

    def update(self, action):
        """Pull ``action`` once and fold the sampled reward into the stats."""
        # get_reward is a module-level helper defined alongside this class.
        payoff = get_reward(action)
        self.counts[action] += 1
        self.values[action] += payoff
        self.rewards[action] = self.values[action] / self.counts[action]
def get_reward(action, probs=None):
    """Draw a Bernoulli reward for pulling arm ``action``.

    Args:
        action: index of the pulled arm.
        probs: per-arm success probabilities; defaults to the module-level
            ARM_PROB table, so existing single-argument callers are unchanged.

    Returns:
        1.0 with probability ``probs[action]``, otherwise 0.0.
    """
    if probs is None:
        probs = ARM_PROB
    # random.random() is uniform on [0, 1), so the comparison succeeds with
    # probability exactly probs[action].
    return 1. if random.random() < probs[action] else 0.
def randomchoice(bandit):
    """Pick one of the bandit's arms uniformly at random, ignoring all stats."""
    return random.choice(range(bandit.k_arm))
def greedy(bandit):
    """Exploit: choose the arm with the highest empirical mean reward.

    Any arm that has never been pulled is returned first (lowest index wins),
    which also keeps the mean computation free of division by zero.
    """
    for arm in range(bandit.k_arm):
        if bandit.counts[arm] == 0:
            return arm
    empirical_means = bandit.values / bandit.counts
    return np.argmax(empirical_means)
def epsilon_greedy(bandit, epsilon):  # expects 0. <= epsilon <= 1.
    """With probability ``epsilon`` explore at random, otherwise act greedily."""
    should_explore = random.random() < epsilon
    return randomchoice(bandit) if should_explore else greedy(bandit)
def ucb1(bandit):
    """Select an arm by the UCB1 rule (Auer, Cesa-Bianchi & Fischer, 2002).

    Each arm is scored by its empirical mean plus an exploration bonus
    sqrt(2 * ln(n) / n_i); the highest score wins. Any never-pulled arm is
    returned immediately, since its bonus is undefined until it has one pull.

    Bug fixed: the bonus used np.log2, but UCB1 is defined with the natural
    logarithm — log2 inflates exploration by a factor of sqrt(1/ln 2) ~ 1.2
    and makes the policy over-explore.
    """
    # Total pulls so far is loop-invariant, so compute it once.
    total_pulls = np.sum(bandit.counts)
    scores = []
    for arm in range(bandit.k_arm):
        pulls = bandit.counts[arm]
        if pulls == 0:
            # Every arm must be tried once before UCB scores are defined.
            return arm
        mean = bandit.values[arm] / pulls
        scores.append(mean + np.sqrt(2 * np.log(total_pulls) / pulls))
    return np.argmax(scores)

import bandit_exp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

N_TRY = 10000
N_ARM = 4

# One independent bandit instance per strategy so their statistics never mix.
rand_bandit = bandit_exp.MultiArmedbandit(N_TRY, N_ARM)
greedy_bandit = bandit_exp.MultiArmedbandit(N_TRY, N_ARM)
eps_bandit = bandit_exp.MultiArmedbandit(N_TRY, N_ARM)
ucb_bandit = bandit_exp.MultiArmedbandit(N_TRY, N_ARM)

# Per-strategy history of the overall average reward after each pull.
rand_history = []
greedy_history = []
eps_history = []
ucb_history = []


def _overall_average(bandit):
    # Total reward collected so far divided by total pulls so far.
    return np.sum(bandit.values) / np.sum(bandit.counts)


for _ in range(N_TRY):
    rand_bandit.update(bandit_exp.randomchoice(rand_bandit))
    greedy_bandit.update(bandit_exp.greedy(greedy_bandit))
    eps_bandit.update(bandit_exp.epsilon_greedy(eps_bandit, 0.1))
    ucb_bandit.update(bandit_exp.ucb1(ucb_bandit))
    rand_history.append(_overall_average(rand_bandit))
    greedy_history.append(_overall_average(greedy_bandit))
    eps_history.append(_overall_average(eps_bandit))
    ucb_history.append(_overall_average(ucb_bandit))

df = pd.DataFrame()
df['time'] = list(range(N_TRY))
df['random'] = rand_history
df['greedy'] = greedy_history
df['epsilongreedy'] = eps_history
df['ucbdata'] = ucb_history

plt.plot(df.time, df.random, label='random')
plt.plot(df.time, df.greedy, label='greedy')
plt.plot(df.time, df.epsilongreedy, label='e-greedy0.1')
plt.plot(df.time, df.ucbdata, label='ucb1')
plt.legend(loc='upper right')
plt.savefig('plt.png')
plt.show()