lab7_4.py
import numpy as np
import matplotlib.pyplot as plt
# Initialize variables
n_arms = 10
n_steps = 10000
epsilon = 0.1
alpha = 0.1  # Constant step size (acts as a forgetting factor for old rewards)
# Initialize rewards and estimates
true_rewards = np.zeros(n_arms)
estimated_rewards = np.zeros(n_arms)
action_counts = np.zeros(n_arms)  # Times each arm has been pulled
cumulative_rewards = []
# Simulate a 10-armed bandit with non-stationary rewards
for step in range(n_steps):
    # Non-stationary reward evolution: each arm's true value takes a small random walk
    true_rewards += np.random.normal(0, 0.01, n_arms)

    # Epsilon-greedy action selection: explore with probability epsilon, otherwise exploit
    if np.random.rand() < epsilon:
        action = np.random.randint(n_arms)
    else:
        action = np.argmax(estimated_rewards)

    # Sample a reward from the selected arm (unit variance around its true value)
    reward = np.random.normal(true_rewards[action], 1)
    action_counts[action] += 1

    # Update the estimate with the constant step size (exponential recency-weighted average)
    estimated_rewards[action] += alpha * (reward - estimated_rewards[action])

    # Track cumulative reward
    if step == 0:
        cumulative_rewards.append(reward)
    else:
        cumulative_rewards.append(cumulative_rewards[-1] + reward)
# Plot the cumulative reward graph
plt.plot(cumulative_rewards)
plt.xlabel('Iterations')
plt.ylabel('Cumulative Reward')
plt.title('Modified Epsilon-Greedy Algorithm Performance')
plt.show()
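
# A minimal sketch (added for illustration, not part of the original lab)
# contrasting the constant step-size update used above with the classic
# sample-average update. With sample averaging, the step size 1/N shrinks as
# an arm is pulled more often, so the estimate responds ever more slowly to
# the drifting true_rewards; a constant alpha keeps the effective learning
# rate fixed, which is why it suits non-stationary bandits.
def sample_average_update(estimate, reward, count):
    """Incremental sample average: step size 1/count decays over time."""
    return estimate + (reward - estimate) / count

def constant_alpha_update(estimate, reward, alpha=0.1):
    """Exponential recency-weighted average: fixed step size alpha."""
    return estimate + alpha * (reward - estimate)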