-
Notifications
You must be signed in to change notification settings - Fork 8
/
bandits.py
66 lines (45 loc) · 2.02 KB
/
bandits.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import numpy as np
import pandas as pd
class BayesianBandits(object):
    """Thompson-sampling bandit over a fixed list of arm values.

    Every arm ``i`` carries a Beta(``a[i]``, ``b[i]``) posterior.  The
    posteriors are rebuilt from scratch on each :meth:`update_dists` call
    from the recorded history of choices and rewards.

    Call protocol: :meth:`update_dists` appends one reward and builds a
    frame from ``len(self.choices) + 1`` choices and ``len(self.data)``
    rewards, so it has to be called exactly once before each
    :meth:`sample` call (which appends one choice); otherwise the two
    columns have different lengths and pandas raises ``ValueError``.

    Attributes:
        arms: List of the values an arm index maps to.
        a, b: Dicts mapping arm index -> Beta distribution parameters.
        window: Number of most recent feedback rows used for the posteriors.
        rolling: Number of following rewards whose maximum is compared
            against the preceding rewards.
        prev: Pure random exploration is used until more than ``prev``
            choices have been recorded.
        arm: Index of the currently selected arm.
        value: ``arms[arm]``.
        choices: History of selected arm indices (starts with arm 0).
        data: History of rewards (starts with a placeholder value).
    """

    # Number of preceding rewards whose maximum is the comparison baseline.
    # NOTE(review): hard-coded in the original; it equals the default of
    # ``prev`` and may have been meant to follow it - confirm before changing.
    _LOOKBACK = 5
    # Placeholder reward stored alongside the initial arm chosen in __init__.
    _NO_REWARD = -99999
    # Placeholder choice for the newest reward, whose arm is not yet recorded.
    _PENDING_CHOICE = 9999

    def __init__(self, arms=(0, 0.5), window=50, rolling=1, prev=5):
        """Initialise uniform Beta(1, 1) posteriors and select arm 0.

        Args:
            arms: Sequence of arm values.  It is copied, so the caller's
                sequence (and the default) is never shared between instances.
            window: How many of the latest feedback rows to keep.
            rolling: Look-ahead length for the reward comparison.
            prev: Number of recorded choices that must be exceeded before
                posteriors are sampled or updated.
        """
        # Copy: the original stored a shared mutable default list by reference.
        self.arms = list(arms)
        arm_count = len(self.arms)
        self.a = {i: 1 for i in range(arm_count)}
        self.b = {i: 1 for i in range(arm_count)}
        self.window = window
        self.rolling = rolling
        self.prev = prev
        self.arm = 0
        self.value = self.arms[self.arm]
        self.choices = [self.arm]
        self.data = [self._NO_REWARD]

    def sample(self):
        """Select an arm, record it in ``choices`` and return its value.

        While at most ``prev`` choices are recorded the arm is drawn
        uniformly at random; afterwards one draw is taken from every arm's
        Beta posterior and the arm with the largest draw is selected.
        """
        arm_count = len(self.arms)
        if len(self.choices) > self.prev:
            draws = [np.random.beta(self.a[i], self.b[i])
                     for i in range(arm_count)]
            # int(): keep ``arm``/``choices`` plain ints in both branches.
            self.arm = int(np.argmax(draws))
        else:
            self.arm = int(np.random.uniform() * arm_count)
        self.value = self.arms[self.arm]
        self.choices.append(self.arm)
        return self.value

    def update_dists(self, prev):
        """Record reward ``prev`` and rebuild every arm's Beta parameters.

        For each history row the maximum of the next ``rolling`` rewards is
        compared with the maximum of the ``_LOOKBACK`` rewards before the
        row.  The outcome is 1 when the later maximum is larger, 0.5 when
        they are equal and 0 when it is smaller.  Rows where either maximum
        is undefined are discarded, then the first remaining row is dropped
        and only the last ``window`` rows are kept.  For every arm,
        ``a = 1 + sum(outcomes)`` and ``b = 1 + count - sum(outcomes)``.

        Nothing but the reward history changes until the number of recorded
        choices plus one exceeds ``self.prev``.

        Args:
            prev: The newest observed reward.
        """
        self.data.append(prev)
        # Pad with a placeholder so both columns have the same length.
        choices = self.choices + [self._PENDING_CHOICE]
        if len(choices) <= self.prev:
            return

        df = pd.DataFrame({'Choice': choices, 'Reward': self.data})
        reward = df['Reward']
        max_before = reward.rolling(self._LOOKBACK).max().shift(1)
        max_ahead = reward.rolling(self.rolling).max().shift(-self.rolling)
        df['Feedback'] = max_ahead - max_before

        df = df[df['Feedback'].notna()]
        df = df.iloc[1:]
        df = df.iloc[-self.window:]
        # Maps the sign of the feedback {-1, 0, 1} onto {0, 0.5, 1}.
        bern = (np.sign(df['Feedback']) + 1) / 2

        new_a = {}
        new_b = {}
        for i in range(len(self.arms)):
            arm_bern = bern[df['Choice'] == i]
            successes = arm_bern.sum()
            new_a[i] = 1 + successes
            new_b[i] = 1 + arm_bern.count() - successes
        self.a = new_a
        self.b = new_b