# main_minibatch.py
import pygame
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.preprocessing.text import one_hot
from keras_preprocessing.sequence import pad_sequences
import random
import time
import openai_vision as eye
import asyncio
import json
from collections import deque
# Initialize Pygame and create a screen
pygame.init()
width, height = 400, 400
screen = pygame.display.set_mode((width, height))
clock = pygame.time.Clock()
font = pygame.font.SysFont(None, 36)
# Neural Network and episode counter setup
model_path = 'model.h5'
success_path = 'successes.txt'
attempts_path = 'attempts.txt'
epsilon_path = 'epsilon.txt'
# shared resources (Public)
# Create an asyncio Queue for vision data
vision_data_queue = asyncio.Queue()
time_limit = 5
# Define a buffer to store experiences
experience_buffer = []
buffer_path = 'experience_buffer.json' # File to store the experience buffer
buffer_limit = 500 # Maximum size of the buffer
minibatch_size = 16
# For text embedding
max_length = 10
# Initialize ε (epsilon) parameters
epsilon_min = 0.01 # Minimum value of ε
epsilon_decay = 0.995 # Decay rate of ε per episode
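# Rough decay schedule implied by the values above (a back-of-the-envelope sketch,
# assuming one multiplicative decay per episode, as applied at the end of main()):
#   epsilon after n episodes ≈ max(epsilon_min, 1.0 * 0.995 ** n)
#   0.995 ** 460 ≈ 0.1   -> mostly greedy after roughly 460 episodes
#   0.995 ** 920 ≈ 0.01  -> pinned at epsilon_min after roughly 920 episodes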
def load_experience_buffer():
    try:
        with open(buffer_path, 'r') as file:
            contents = file.read()
            # Treat an empty file the same as a missing one
            if not contents.strip():
                return []
            return json.loads(contents)
    except FileNotFoundError:
        return []
    except json.JSONDecodeError:
        print("JSON file is empty or corrupted. Initializing a new buffer.")
        return []
def save_experience_buffer(buffer):
    with open(buffer_path, 'w') as file:
        # Convert buffer to a format that can be JSON serialized
        serializable_buffer = []
        for experience in buffer:
            state, action, reward, next_state, done = experience
            serializable_buffer.append({
                'state': np.array(state).tolist(),            # Convert NumPy array to list
                'action': int(action),                        # Convert NumPy int64 to Python int
                'reward': float(reward),                      # Convert NumPy float64 to Python float
                'next_state': np.array(next_state).tolist(),  # Convert NumPy array to list
                'done': bool(done)                            # Convert NumPy bool_ to Python bool
            })
        json.dump(serializable_buffer, file)
def load_file(counter_file, default_value=None):
    try:
        with open(counter_file, 'r') as file:
            return file.read()
    except FileNotFoundError:
        # If the file does not exist, fall back to the supplied default (or 0)
        return default_value if default_value is not None else 0
def save_file(counter_file, counter_value):
    with open(counter_file, 'w') as file:
        file.write(str(counter_value))
def initialize_model(model_path):
    """
    Initialize the neural network model.
    :param model_path: Path to the saved model.
    :return: The loaded or newly created model.
    """
    try:
        model = tf.keras.models.load_model(model_path)
        print("Model loaded successfully.")
        return model
    except IOError:
        input_shape = (2 + max_length,)  # Adjust based on state size and max_length
        model = Sequential([
            Dense(64, activation='relu', input_shape=input_shape),
            Dense(32, activation='relu'),
            Dense(5, activation='softmax')
        ])
        model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
        # Reset the success and attempts counters by writing zero to their files
        successes = 0
        attempts = 0
        save_file(success_path, successes)
        save_file(attempts_path, attempts)
        save_file(epsilon_path, 1.0)
        print("Counters reset.")
        print("New model created.")
        return model
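# For reference, the fallback network built above is a small MLP: input of length
# 12 (2 position offsets + 10 vision slots) -> Dense(64, relu) -> Dense(32, relu) ->
# Dense(5, softmax), one output per action, giving 12*64+64 + 64*32+32 + 32*5+5
# = 3,077 trainable parameters.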
def get_state(target, agent, default_vision_data_length=10):
    basic_state = np.array([target[0] - agent[0], target[1] - agent[1]])
    # Initialize vision data with zeros (placeholder until a description is processed)
    vision_data = np.zeros(default_vision_data_length)
    extended_state = np.concatenate((basic_state, vision_data))
    return extended_state
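# Layout of the state vector returned by get_state(), for reference: the first two
# entries are the target-relative offsets (target - agent) in x and y, and the
# remaining `default_vision_data_length` (10) entries are a zero-filled placeholder
# for the processed vision embedding, which is only filled in via
# process_description() during action selection. The total length, 2 + 10 = 12,
# matches the input_shape = (2 + max_length,) used in initialize_model().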
# Define a function to train the model on a single transition
def train_model(state, action, reward, next_state, done, model):
    target = reward
    if not done:
        target = reward + 0.99 * np.amax(model.predict(next_state.reshape(1, -1))[0])
    target_f = model.predict(state.reshape(1, -1))
    target_f[0][action] = target
    model.fit(state.reshape(1, -1), target_f, epochs=1, verbose=0)
def train_model_minibatch(states, actions, rewards, next_states, dones, model, minibatch_size):
    """
    Trains the neural network model on a minibatch of states, actions, rewards,
    next states, and done flags.
    """
    if len(states) < minibatch_size:
        raise ValueError("Not enough samples to create a minibatch.")
    # Sample a minibatch of experiences
    minibatch_indices = np.random.choice(range(len(states)), minibatch_size, replace=False)
    minibatch_states = np.array([states[i] for i in minibatch_indices])
    minibatch_actions = [actions[i] for i in minibatch_indices]
    minibatch_rewards = [rewards[i] for i in minibatch_indices]
    minibatch_next_states = np.array([next_states[i] for i in minibatch_indices])
    minibatch_dones = [dones[i] for i in minibatch_indices]
    # Predict Q-values for the next states for the entire minibatch
    q_values_next = model.predict(minibatch_next_states)
    # Compute the maximum Q-value for each next state
    max_q_values_next = np.max(q_values_next, axis=1)
    # Calculate the target Q-values for the current states
    target_q_values = minibatch_rewards + (1 - np.array(minibatch_dones)) * 0.99 * max_q_values_next
    # Prepare the target Q-values for training
    target_f = model.predict(minibatch_states)
    for i, action in enumerate(minibatch_actions):
        target_f[i][action] = target_q_values[i]
    # Fit the model
    model.fit(minibatch_states, target_f, epochs=1, verbose=0)
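# The minibatch update above is the standard one-step Q-learning target with a
# discount factor of 0.99:
#     target = reward                                     if done
#     target = reward + 0.99 * max_a' Q(next_state, a')   otherwise
# Only the entry for the action actually taken is overwritten in target_f, so the
# MSE loss configured in initialize_model() only pulls on that single output per
# sample. Note that the softmax output layer bounds these "Q-values" to [0, 1];
# a linear output layer is more typical for Q-learning, but the original
# architecture is kept as-is here.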
def process_description(description, vocab_size=10000, max_length=10):
    # Encode the description using one-hot (hashing) encoding
    encoded = one_hot(description, vocab_size)
    # Pad the encoded description to a fixed length
    padded = pad_sequences([encoded], maxlen=max_length, padding='post')
    # Flatten the padded description into a 1-D vector
    vision_data = padded.flatten()
    return vision_data
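# Example of what process_description() produces (index values are illustrative
# only, since keras' one_hot() hashes words to integer indices): a description
# such as "a red circle near the centre" becomes a list of word indices, padded
# with zeros (or truncated) to max_length, e.g. something like
#     [4821, 977, 6054, 3312, 101, 8890, 0, 0, 0, 0]
# i.e. a flat integer vector of length 10 that replaces the zero placeholder in
# the agent's extended state.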
def extract_description(response):
    """
    Extract the descriptive text about the image from the response.
    :param response: The response dictionary from see_computer_screen.
    :return: The extracted description as a string.
    """
    try:
        # Access the 'content' field of the 'message' in the first 'choices' element
        description = response['choices'][0]['message']['content']
        return description.strip()  # Remove any leading/trailing whitespace
    except (KeyError, IndexError, TypeError):
        # Return a default message or handle the error as appropriate
        return "Description not available."
async def vision_data_fetcher(vision_data_queue):
    while True:
        # Asynchronous screen reading logic
        success, description = await eye.see_computer_screen_async()
        if success:
            await vision_data_queue.put(description)
        await asyncio.sleep(0.1)  # Adjust frequency as needed
def update_agent_position(agent, action, step_size):
    if action == 0:  # Up
        return agent[0], agent[1] - step_size
    elif action == 1:  # Down
        return agent[0], agent[1] + step_size
    elif action == 2:  # Left
        return agent[0] - step_size, agent[1]
    elif action == 3:  # Right
        return agent[0] + step_size, agent[1]
    else:
        return agent  # No change in position if action is "seeing" or any other undefined action
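# Action space used throughout this script: 0 = up, 1 = down, 2 = left, 3 = right
# (each moving the agent by step_size pixels), and 4 = the "seeing" action, which
# does not map to a movement here; choose_action_and_update_position() handles it
# by consulting the vision queue and re-selecting an action.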
async def choose_action_and_update_position(state, agent, epsilon, model, shared_state):
    step_size = 5
    action_taken = True
    # The state from get_state() already stores the target-relative offset,
    # so use it directly instead of subtracting the agent position a second time.
    basic_state = np.array(state[:2])
    # Define extended_state with default (zero) vision data before the try-except block
    extended_state = np.concatenate((basic_state, np.zeros(10)))
    extended_state = extended_state.reshape(1, -1)
    if np.random.rand() <= epsilon:
        # Explore: pick a random action
        action = np.random.randint(0, 5)
    else:
        # Exploit: pick the action with the highest predicted Q-value
        action = np.argmax(model.predict(extended_state))
    if action == 4:  # "Seeing" action
        try:
            # Try to get a description from the queue without blocking
            description = vision_data_queue.get_nowait()
            vision_data = process_description(description)
            state_with_vision_info = np.concatenate((basic_state, vision_data))
            state_with_vision_info = state_with_vision_info.reshape(1, -1)
            action = np.argmax(model.predict(state_with_vision_info))
        except asyncio.QueueEmpty:
            action = np.argmax(model.predict(extended_state))  # Fallback: no vision data available yet
    new_agent_pos = update_agent_position(agent, action, step_size)
    return action, new_agent_pos, action_taken
def calculate_reward(target, agent, time_remaining, success_threshold, width, successes, action_taken):
    """
    Calculate the reward for the agent's current state, and update success and done status.
    :param target: Tuple of (x, y) coordinates for the target.
    :param agent: Tuple of (x, y) coordinates for the agent.
    :param time_remaining: Time remaining for the agent to reach the target.
    :param success_threshold: Distance threshold for considering the agent has reached the target.
    :param width: Width of the screen, used for scaling the distance-based reward.
    :param successes: The current count of successes.
    :param action_taken: Boolean indicating if an action was taken.
    :return: A tuple containing the numerical reward value, updated success counter, and done status.
    """
    distance = np.linalg.norm(np.array(target) - np.array(agent))
    done = False
    inaction_penalty = -0.1  # Penalty applied when no action was taken
    distance_penalty_scale = 1.0  # Increase this value to make distance more impactful
    if distance < success_threshold:
        reward = 1
        successes += 1
        done = True
    elif time_remaining <= 0:
        reward = -1
        done = True
    else:
        # Apply a penalty proportional to the remaining distance
        reward = -distance_penalty_scale * distance / width
    if not action_taken:
        reward += inaction_penalty  # Apply penalty for inaction
    return reward, successes, done
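# Worked example of the shaped reward above (illustrative numbers): with
# width = 400 and the agent 200 px from the target, a non-terminal step gives
# reward = -1.0 * 200 / 400 = -0.5; reaching within success_threshold px gives +1,
# and running out of time gives -1.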
async def main():
    # Start the background vision fetcher and load (or create) the model
    fetcher_task = asyncio.create_task(vision_data_fetcher(vision_data_queue))
    model = initialize_model(model_path)
    replay_buffer = deque(maxlen=10000)
    save_interval = 10  # Save a checkpoint every 10 episodes; adjust as needed
    # Main loop
    running = True
    successes = int(load_file(success_path, 0))  # Load the number of successes
    attempts = int(load_file(attempts_path, 0))  # Load the total number of attempts
    epsilon = float(load_file(epsilon_path, default_value=1.0))  # Load ε at the start
    # Keep a list of recent rewards to compute a running average
    reward_history = []
    running_average_reward = 0
    while running:
        attempts += 1  # Increment attempts counter
        # Initialize the start position and time
        agent = (random.randint(0, width), random.randint(0, height))
        start_time = time.time()
        # Initialize the state
        target = (width // 2, height // 2)
        state = get_state(target, agent)
        # Run episode
        done = False
        while not done:
            screen.fill((0, 0, 0))
            pygame.draw.circle(screen, (255, 255, 255), target, 10)
            pygame.draw.circle(screen, (255, 0, 0), agent, 10)
            # Display the life iteration counter
            life_text = font.render(f"Life: {attempts}", True, (255, 255, 255))
            screen.blit(life_text, (10, 10))
            # Update and render timer
            elapsed_time = time.time() - start_time
            remaining_time = max(time_limit - elapsed_time, 0)
            timer_text = font.render(f"{remaining_time:.2f}s", True, (255, 255, 255))
            screen.blit(timer_text, (width - 100, 10))
            # Update agent's position based on the chosen action
            # action, agent, action_taken = choose_action_and_update_position(state, agent, epsilon, model)
            action, agent, action_taken = await choose_action_and_update_position(state, agent, epsilon, model, vision_data_queue)
            next_state = get_state(target, agent)
            # Calculate reward and check if the episode is done
            reward, successes, done = calculate_reward(target, agent, remaining_time, 10, width, successes, action_taken)
            # Add the experience to the replay buffer
            replay_buffer.append((state, action, reward, next_state, done))
            state = next_state  # Carry the new observation into the next step
            # Sample a minibatch from the replay buffer to train the model
            if len(replay_buffer) >= minibatch_size:
                minibatch = random.sample(replay_buffer, minibatch_size)
                train_model_minibatch(*zip(*minibatch), model, minibatch_size)
            # Periodically save the replay buffer and model
            if attempts % save_interval == 0:
                save_experience_buffer(list(replay_buffer))  # Convert replay_buffer to a list before saving
                model.save(model_path)
                save_file(success_path, successes)
                save_file(attempts_path, attempts)
                save_file(epsilon_path, epsilon)
                print(f"Checkpoint saved at episode {attempts}")
            # Display the success ratio
            success_ratio = successes / attempts if attempts > 0 else 0
            success_ratio_text = font.render(f"Success ratio: {successes}/{attempts} = {success_ratio:.2f}", True, (255, 255, 255))
            screen.blit(success_ratio_text, (10, height - 30))
            pygame.display.flip()
            # clock.tick(60)
            await asyncio.sleep(1 / 60)
            # Process Pygame events
            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    running = False
                    done = True
        # After the episode, append the final reward to reward_history and update the running average
        reward_history.append(reward)
        if len(reward_history) > 100:
            reward_history = reward_history[1:]  # Keep only the last 100 rewards
        running_average_reward = np.mean(reward_history)
        print(f"Running Average Reward: {running_average_reward}")
        # # If the episode is done (either success or failure), train the model
        # if done:
        #     train_model(state, action, reward, next_state, done, model)
        # Update ε after each episode
        epsilon = max(epsilon_min, epsilon_decay * epsilon)
        # Save the model and counters at the end of each episode
        print('Model Saved')
        model.save(model_path)
        save_file(success_path, successes)
        save_file(attempts_path, attempts)
        save_file(epsilon_path, epsilon)
        save_experience_buffer(list(replay_buffer))  # Persist the replay buffer, not the unused global list
    # Quit Pygame and stop the vision fetcher
    pygame.quit()
    fetcher_task.cancel()
if __name__ == "__main__":
    asyncio.run(main())