diff --git a/data/policies/design_C.1/model_1.0/acrobot/evolsac/model.zip b/data/policies/design_C.1/model_1.0/acrobot/evolsac/model.zip
new file mode 100644
index 00000000..3773a162
Binary files /dev/null and b/data/policies/design_C.1/model_1.0/acrobot/evolsac/model.zip differ
diff --git a/data/policies/design_C.1/model_1.0/pendubot/evolsac/model.zip b/data/policies/design_C.1/model_1.0/pendubot/evolsac/model.zip
new file mode 100644
index 00000000..72526c82
Binary files /dev/null and b/data/policies/design_C.1/model_1.0/pendubot/evolsac/model.zip differ
diff --git a/docker/Dockerfile_evolsac b/docker/Dockerfile_evolsac
new file mode 100644
index 00000000..2db3aa05
--- /dev/null
+++ b/docker/Dockerfile_evolsac
@@ -0,0 +1,55 @@
+FROM ubuntu:22.04
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update && \
+    apt-get install wget -y && \
+    apt-get install unzip -y && \
+    apt-get install git -y && \
+    apt-get install vim -y && \
+    apt-get install python3-pip -y && \
+    apt-get install libyaml-cpp-dev -y && \
+    #apt install libeigen3-dev -y && \
+    apt-get install libpython3.10 -y && \
+    apt-get install libx11-6 -y && \
+    apt-get install libsm6 -y && \
+    apt-get install libxt6 -y && \
+    apt-get install libglib2.0-0 -y && \
+    apt-get install python3-sphinx -y && \
+    apt-get install python3-numpydoc -y && \
+    apt-get install python3-sphinx-rtd-theme -y && \
+    apt-get install python-is-python3
+
+# libeigen3-dev install does not work with apt
+RUN wget -O Eigen.zip https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.zip
+RUN unzip Eigen.zip
+RUN cp -r eigen-3.4.0/Eigen /usr/local/include
+
+#RUN python -m ensurepip --upgrade
+RUN pip install -U pip
+RUN pip3 install numpy dill
+RUN python3 -m pip install --upgrade pip
+RUN python3 -m pip install drake
+
+RUN python -m pip install torch
+RUN python -m pip install stable-baselines3==2.3.2
+RUN python -m pip install evotorch
+RUN python -m pip install gymnasium
+RUN python -m pip install ffmpeg-python
+
+# Copy everything
+COPY . ./double_pendulum/
+
+WORKDIR "/double_pendulum"
+
+# RUN git checkout v0.1.0
+
+RUN make install
+RUN make pythonfull
+
+RUN apt-get -y update
+RUN apt-get -y upgrade
+RUN apt-get install -y ffmpeg
+
+
+RUN python -m pip install stable-baselines3==2.3.2
\ No newline at end of file
diff --git a/examples/reinforcement_learning/evolsac/README.md b/examples/reinforcement_learning/evolsac/README.md
new file mode 100644
index 00000000..953a73cd
--- /dev/null
+++ b/examples/reinforcement_learning/evolsac/README.md
@@ -0,0 +1,7 @@
+# EvolSAC training
+To train the Evolutionary SAC agent for both the pendubot and the acrobot, first ensure that the variable `robot` is consistently set to either `acrobot` or `pendubot` in all three `main.py` files contained inside the folders `SAC_main_training`, `SAC_finetuning`, and `SNES_finetuning`.
+
+The scripts below must be run directly from the folders that contain them so that the relative paths resolve correctly.
+
+1. Run `python main.py 3.0 0 0 0` from `SAC_main_training`, which trains the agent according to the surrogate reward function defined in the same file
+2. Run `python main.py 3.0 0 0 0 [acrobot/pendubot]` from `SNES_finetuning`, which loads the agent produced in step 1 and further trains it based on the performance score defined by the competition's organizers
\ No newline at end of file
diff --git a/examples/reinforcement_learning/evolsac/SAC_finetuning/environment.py b/examples/reinforcement_learning/evolsac/SAC_finetuning/environment.py
new file mode 100644
index 00000000..fcf2bc5d
--- /dev/null
+++ b/examples/reinforcement_learning/evolsac/SAC_finetuning/environment.py
@@ -0,0 +1,109 @@
+import gymnasium as gym
+import numpy as np
+
+class CustomCustomEnv(gym.Env):
+    def __init__(
+        self,
+        dynamics_func,
+        reset_func,
+        obs_space=gym.spaces.Box(
+            np.array([-1.0, -1.0, -1.0, -1.0]), np.array([1.0, 1.0, 1.0, 1.0])
+        ),
+        act_space=gym.spaces.Box(np.array([-1.0, -1.0]), np.array([1.0, 1.0])),
+        max_episode_steps=1000,
+        scaling=True,
+        terminates=True,
+    ):
+        self.dynamics_func = dynamics_func
+        self.reset_func = reset_func
+        self.observation_space = obs_space
+        self.action_space = act_space
+        self.max_episode_steps = max_episode_steps
+
+        self.previous_action = 0
+        self.terminates = terminates
+
+        self.observation = self.reset_func()
+        self.step_counter = 0
+        self.stabilisation_mode = False
+        self.y = [0,0]
+        self.update_y()
+        self.scaling = scaling
+
+        l1 = self.dynamics_func.simulator.plant.l[0]
+        l2 = self.dynamics_func.simulator.plant.l[1]
+        self.max_height = l1 + l2
+
+        if self.dynamics_func.robot == "acrobot":
+            self.control_line = 0.75 * self.max_height
+        elif self.dynamics_func.robot == "pendubot":
+            self.control_line = 0.7 * self.max_height
+
+        self.old_obs = None
+
+    def step(self, action):
+        self.old_obs = np.copy(self.observation)
+        self.observation = self.dynamics_func(
+            self.observation, action, scaling=self.scaling
+        )
+
+        self.update_y()
+        self.stabilisation_mode = self.y[1] >= self.control_line
+        terminated = self.terminated_func()
+        reward = self.reward_func(terminated, action)
+        info = {}
+        truncated = False
+        self.step_counter += 1
+        if self.step_counter >= self.max_episode_steps:
+            truncated = True
+        self.previous_action = action[0]
+        return self.observation, reward, terminated, truncated, info
+
+    def reset(self, seed=None, options=None):
+        super().reset(seed=seed)
+        self.observation = self.reset_func()
+        self.step_counter = 0
+        info = {}
+        self.previous_action = 0
+        self.stabilisation_mode = False
+        self.old_obs = np.copy(self.observation)
+        return self.observation, info
+
+    def render(self, mode="human"):
+        pass
+
+    def reward_func(self, terminated, action):
+        raise NotImplementedError("You have to define the reward function")
+
+    def terminated_func(self):
+        if self.terminates:
+            # Checks if we're in stabilisation mode and the ee has fallen below the control line
+            if self.stabilisation_mode and self.y[1] < self.control_line:
+                return True
+        return False
+
+    # Update the y coordinate of the first joint and the end effector
+    def update_y(self):
+        theta1, theta2, _, _ = self.dynamics_func.unscale_state(self.observation)
+
+        link_end_points = self.dynamics_func.simulator.plant.forward_kinematics(
+            [theta1, theta2]
+        )
+        self.y[0] = link_end_points[0][1]
+        self.y[1] = link_end_points[1][1]
+
+    def gravitational_reward(self):
+        x = self.dynamics_func.unscale_state(self.observation)
+        V = self.dynamics_func.simulator.plant.potential_energy(x)
+        return V
+
+    def V(self):
+        return self.gravitational_reward()
+
+    def kinetic_reward(self):
+        x = self.dynamics_func.unscale_state(self.observation)
+        T =
self.dynamics_func.simulator.plant.kinetic_energy(x) + return T + + def T(self): + return self.kinetic_reward() diff --git a/examples/reinforcement_learning/evolsac/SAC_finetuning/magic.py b/examples/reinforcement_learning/evolsac/SAC_finetuning/magic.py new file mode 100644 index 00000000..5efe9be8 --- /dev/null +++ b/examples/reinforcement_learning/evolsac/SAC_finetuning/magic.py @@ -0,0 +1,255 @@ +import os +import shutil + +import numpy as np +import pandas +from double_pendulum.analysis.leaderboard import leaderboard_scores +from double_pendulum.controller.evolsac.evolsac_controller import EvolSACController +from double_pendulum.model.model_parameters import model_parameters +from sim_controller import simulate_controller +from stable_baselines3.common.callbacks import BaseCallback + + +def load_controller(dynamics_func, model, window_size, include_time): + name = "evolsac" + leaderboard_config = { + "csv_path": name + "/sim_swingup.csv", + "name": name, + "simple_name": name, + "short_description": "SAC finetuning for both swingup and stabilisation", + "readme_path": f"readmes/{name}.md", + "username": "MarcoCali0", + } + controller = EvolSACController( + model=model, + dynamics_func=dynamics_func, + window_size=window_size, + include_time=include_time, + ) + controller.init() + return controller, leaderboard_config + + +def magic_score( + dynamics_func, + model, + folder, + folder_id, + window_size, + max_torque, + include_time, + index=None, +): + design = "design_C.1" + integrator = "runge_kutta" + dt = 0.002 + t0 = 0.0 + t_final = 10.0 + x0 = [0.0, 0.0, 0.0, 0.0] + goal = [np.pi, 0.0, 0.0, 0.0] + + if folder == "acrobot": + model_par_path = ( + "../../../../data/system_identification/identified_parameters/" + + design + + "/" + + "model_1.1" + + "/model_parameters.yml" + ) + mpar = model_parameters(filepath=model_par_path) + mpar.set_torque_limit([0.0, max_torque]) + + else: # robot = "pendubot" + model_par_path = ( + "../../../../data/system_identification/identified_parameters/" + + design + + "/" + + "model_1.1" + + "/model_parameters.yml" + ) + mpar = model_parameters(filepath=model_par_path) + mpar.set_torque_limit([max_torque, 0.0]) + + controller, leaderboard_config = load_controller( + dynamics_func, model, window_size, include_time + ) + + data_dir = f"data_{folder}/{folder_id}" + + if not os.path.exists(data_dir): + os.makedirs(data_dir) + + controller_name = f"evolsac_{folder}" + save_dir = ( + f"{data_dir}/{controller_name}" + if index is None + else f"{data_dir}/{controller_name}/{index}" + ) + if index is not None: + leaderboard_config["csv_path"] = f"{controller_name}/{index}/sim_swingup.csv" + save_to = os.path.join(save_dir, "leaderboard_entry.csv") + + simulate_controller( + controller, save_dir, mpar, dt, t_final, t0, x0, goal, integrator + ) + + conf = leaderboard_config + + conf["csv_path"] = os.path.join(data_dir, leaderboard_config["csv_path"]) + data_paths = {} + data_paths[leaderboard_config["name"]] = conf + + leaderboard_scores( + data_paths=data_paths, + save_to=save_to, + mpar=mpar, + # weights={"swingup_time": 0.5, "max_tau": 0.1, "energy": 0.0, "integ_tau": 0.4, "tau_cost": 0.0, "tau_smoothness": 0.0}, + weights={ + "swingup_time": 1.0, + "max_tau": 0.0, + "energy": 1.0, + "integ_tau": 0.0, + "tau_cost": 1.0, + "tau_smoothness": 1.0, + "velocity_cost": 1.0, + }, + normalize={ + "swingup_time": 20.0, + "max_tau": 1.0, # not used + "energy": 60.0, + "integ_tau": 1.0, # not used + "tau_cost": 20.0, + "tau_smoothness": 0.1, + "velocity_cost": 400, + }, 
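+        # Note (assumption): the exact aggregation is defined by leaderboard_scores itself;
+        # presumably each raw criterion is divided by its entry in `normalize` before the
+        # weighted sum that yields the RealAI Score is computed.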
+ link_base="", + score_version="v2", + ) + df = pandas.read_csv(save_to) + + # print(df) + # print(np.array(df["RealAI Score"])[0]) + # print(df.sort_values(by=["RealAI Score"], ascending=False).to_markdown(index=False)) + + score = np.array(df["RealAI Score"])[0] + print("RealAI Score = ", score) + return score + + +def copy_files(src_folder, dest_folder): + if not os.path.exists(dest_folder): + os.makedirs(dest_folder) + + for filename in os.listdir(src_folder): + src_file = os.path.join(src_folder, filename) + dest_file = os.path.join(dest_folder, filename) + + if os.path.isfile(src_file): + shutil.copy2(src_file, dest_file) + + +class MagicCallback(BaseCallback): + def __init__( + self, + path, + folder_id, + dynamics_func, + robot, + window_size, + max_torque, + include_time, + ): + super().__init__(False) + self.path = f"{path}{folder_id}" + if not os.path.exists(path): + os.makedirs(path, exist_ok=True) + self.best = -np.inf + self.folder_id = folder_id + self.dynamics_func = dynamics_func + self.robot = robot + self.window_size = window_size + self.max_torque = max_torque + self.include_time = include_time + + def _on_step(self) -> bool: + score = magic_score( + self.dynamics_func, + self.model, + self.robot, + self.folder_id, + self.window_size, + self.max_torque, + self.include_time, + ) + if score >= self.best: + self.best = score + self.model.save(f"{self.path}/best_model") + copy_files(f"./data/{self.folder_id}/sac/", self.path) + return True + +import concurrent.futures + + +class BruteMagicCallback(BaseCallback): + def __init__( + self, + path, + folder_id, + dynamics_func, + robot, + window_size, + max_torque, + include_time, + ): + super().__init__(False) + self.path = f"{path}{folder_id}" + if not os.path.exists(path): + os.makedirs(path, exist_ok=True) + self.folder_id = folder_id + self.dynamics_func = dynamics_func + self.robot = robot + self.window_size = window_size + self.max_torque = max_torque + self.include_time = include_time + self.iteration = 0 + self.executor = concurrent.futures.ThreadPoolExecutor() + + def _on_step(self) -> bool: + self.iteration += 1 + self.executor.submit(lambda: async_store(self.iteration, self)) + return True + + +import tempfile + +from stable_baselines3 import SAC + + +def deepcopy_model(model): + with tempfile.TemporaryDirectory() as tmpdirname: + model_path = os.path.join(tmpdirname, "temp_model") + model.save(model_path) + copied_model = SAC.load(model_path) + return copied_model + + + +def async_store(iteration, callback: BruteMagicCallback): + model = deepcopy_model(callback.model) + _ = magic_score( + callback.dynamics_func, + model, + callback.robot, + callback.folder_id, + callback.window_size, + callback.max_torque, + callback.include_time, + index=iteration, + ) + if not os.path.exists(f"{callback.path}/{iteration}"): + os.makedirs(f"{callback.path}/{iteration}", exist_ok=True) + model.save(f"{callback.path}/{iteration}/best_model") + copy_files( + f"./data/{callback.folder_id}/evolsac/{iteration}", + f"{os.path.join(callback.path, str(iteration))}", + ) diff --git a/examples/reinforcement_learning/evolsac/SAC_finetuning/main.py b/examples/reinforcement_learning/evolsac/SAC_finetuning/main.py new file mode 100644 index 00000000..a640c52f --- /dev/null +++ b/examples/reinforcement_learning/evolsac/SAC_finetuning/main.py @@ -0,0 +1,215 @@ +import os +import sys + +import gymnasium as gym +import numpy as np +import stable_baselines3 +import torch +from double_pendulum.model.model_parameters import model_parameters +from 
double_pendulum.model.symbolic_plant import SymbolicDoublePendulum +from double_pendulum.simulation.gym_env import double_pendulum_dynamics_func +from double_pendulum.simulation.simulation import Simulator +from environment import CustomCustomEnv +from magic import BruteMagicCallback, MagicCallback +from simulator import CustomSimulator +from stable_baselines3 import SAC +from stable_baselines3.common.callbacks import EvalCallback +from stable_baselines3.common.env_util import make_vec_env +from stable_baselines3.sac.policies import MlpPolicy +from wrappers import * + +np.random.seed(0) +torch.manual_seed(0) +torch.random.manual_seed(0) +torch.backends.cudnn.deterministic = True +stable_baselines3.common.utils.set_random_seed(0) + +class MyEnv(CustomCustomEnv): + def reward_func(self, terminated, action): + _, theta2, omega1, omega2 = self.dynamics_func.unscale_state(self.observation) + costheta2 = np.cos(theta2) + + a = action[0] + delta_action = np.abs(a - self.previous_action) + lambda_delta = 0.05 + lambda_action = 0.02 + lambda_velocities = 0.01 + if not terminated: + if self.stabilisation_mode: + reward = ( + self.V() + + 2 * (1 + costheta2) ** 2 * 2 + - self.T() + - 5 * lambda_action * np.square(a) * 2 + - 3 * lambda_delta * delta_action * 2 + ) + else: + reward = ( + (1 - np.abs(a)) * self.V() + - lambda_action * np.square(a) * 2 + - 2 * lambda_velocities * (omega1**2 + omega2**2) * 2 + - 3 * lambda_delta * delta_action * 2 + ) + else: + reward = -1.0 + return reward + + +assert ( + len(sys.argv) >= 3 +), "Please provide: [max torque] [robustness] [window_size (0 = no window)] [include_time]" +max_torque = float(sys.argv[1]) +robustness = float(sys.argv[2]) +WINDOW_SIZE = int(sys.argv[3]) +INCLUDE_TIME = bool(int(sys.argv[4])) +robot = str(sys.argv[5]) +FOLDER_ID = f"{os.path.basename(__file__)}-{max_torque}-{robustness}-{WINDOW_SIZE}-{int(INCLUDE_TIME)}" +TERMINATION = False + +# define robot variation +# robot = "acrobot" + +# setting log path for the training +log_dir = f"./log_data_{robot}/SAC_training/{FOLDER_ID}" +if not os.path.exists(log_dir): + os.makedirs(log_dir) + +design = "design_C.1" +model = "model_1.1" +model_par_path = ( + "../../../../data/system_identification/identified_parameters/" + + design + + "/" + + model + + "/model_parameters.yml" +) + +# model and reward parameter +max_velocity = 50 +torque_limit = [max_torque, 0] if robot == "pendubot" else [0, max_torque] + +mpar = model_parameters(filepath=model_par_path) +mpar.set_torque_limit(torque_limit) +dt = 0.01 +integrator = "runge_kutta" + +plant = SymbolicDoublePendulum(model_pars=mpar) +simulator = CustomSimulator(plant=plant, robustness=robustness, max_torque=max_torque, robot=robot) +eval_simulator = Simulator(plant=plant) + +# learning environment parameters +state_representation = 2 + +obs_space = gym.spaces.Box(np.array([-1.0] * 4), np.array([1.0] * 4)) +act_space = gym.spaces.Box(np.array([-1]), np.array([1])) +max_steps = 10 / dt + +############################################################################### + +n_envs = 50 +training_steps = 10_000_000 +verbose = 1 +eval_freq = 5000 +n_eval_episodes = 1 +# a patto che i reward istantanei siano piccoli +# 0.01 -> 1500000 -> 7 +# 0.003 -> 1500000 -> 46 +# 0.001 -> 1500000 -> 38 +# 0.0003 -> 1500000 -> 19 +learning_rate = 0.00008 + +############################################################################### + +# initialize double pendulum dynamics +dynamics_func = double_pendulum_dynamics_func( + simulator=simulator, + dt=dt, + 
integrator=integrator, + robot=robot, + state_representation=state_representation, + max_velocity=max_velocity, + torque_limit=torque_limit, +) + +eval_dynamics_func = double_pendulum_dynamics_func( + simulator=eval_simulator, + dt=dt, + integrator=integrator, + robot=robot, + state_representation=state_representation, + max_velocity=max_velocity, + torque_limit=torque_limit, +) + + +def zero_reset_func(): + observation = [-1.0, -1.0, 0.0, 0.0] + return observation + + +############################################################################### + + +def wrap(env): + if INCLUDE_TIME: + env = TimeAwareWrapper(env, dt=dt) + if WINDOW_SIZE > 0: + env = StateActionHistoryWrapper(env, history_length=WINDOW_SIZE) + return env + + +envs = make_vec_env( + env_id=MyEnv, + wrapper_class=wrap, + n_envs=n_envs, + env_kwargs={ + "dynamics_func": dynamics_func, + "reset_func": zero_reset_func, + "terminates": TERMINATION, + "obs_space": obs_space, + "act_space": act_space, + "max_episode_steps": max_steps, + }, +) + +eval_env = wrap( + MyEnv( + dynamics_func=eval_dynamics_func, + reset_func=zero_reset_func, + obs_space=obs_space, + act_space=act_space, + max_episode_steps=max_steps, + terminates=TERMINATION, + ) +) + +eval_callback = EvalCallback( + eval_env, + callback_after_eval=BruteMagicCallback( + f"./models_{robot}/", + folder_id=FOLDER_ID, + dynamics_func=eval_dynamics_func, + robot=robot, + window_size=WINDOW_SIZE, + max_torque=max_torque, + include_time=INCLUDE_TIME, + ), + best_model_save_path=os.path.join(log_dir, "best_model"), + log_path=log_dir, + eval_freq=eval_freq, + verbose=verbose, + n_eval_episodes=n_eval_episodes, +) + +############################################################################### + +agent = SAC( + MlpPolicy, + envs, + verbose=verbose, + learning_rate=learning_rate, +) + +loaded_model = "/home/alberto_sinigaglia/test_dp_clone/leaderboard/pendubot/pendubot_best_model_0567_0800.zip" if robot == "pendubot" else "/home/alberto_sinigaglia/test_dp_clone/leaderboard/acrobot/acrobot_best_model_0504_0700.zip" +agent.set_parameters(loaded_model) +agent.learn(total_timesteps=training_steps, callback=eval_callback) diff --git a/examples/reinforcement_learning/evolsac/SAC_finetuning/score_check.py b/examples/reinforcement_learning/evolsac/SAC_finetuning/score_check.py new file mode 100644 index 00000000..d5e4632e --- /dev/null +++ b/examples/reinforcement_learning/evolsac/SAC_finetuning/score_check.py @@ -0,0 +1,50 @@ +import os +import csv + +def extract_score_from_file(file_path): + """Extract the last value (RealAI Score) from the scores.csv file.""" + with open(file_path, mode='r') as file: + csv_reader = csv.reader(file) + next(csv_reader) # Skip the header + row = next(csv_reader) # Read the actual data + return float(row[-1]) # Return the last value as the score + +def find_highest_score(root_directory): + """Find the highest score and the corresponding directory.""" + highest_score = float('-inf') + best_directory = None + + # Iterate through directories named '1' to some number + for dir_name in os.listdir(root_directory): + dir_path = os.path.join(root_directory, dir_name) + + # Check if it's a directory + if os.path.isdir(dir_path): + file_path = os.path.join(dir_path, 'scores.csv') + + # Check if the scores.csv file exists in the directory + if os.path.exists(file_path): + try: + score = extract_score_from_file(file_path) + + # Check if this score is the highest + if score > highest_score: + highest_score = score + best_directory = dir_name + except Exception as 
e: + print(f"Error reading {file_path}: {e}") + + return highest_score, best_directory + +# Specify the root directory where the numbered folders are located + +robot = "acrobot" +root_directory = f'data_{robot}/main.py-3.0-0.0-0-0/evolsac_{robot}' + +# Find the highest score and the directory containing it +highest_score, best_directory = find_highest_score(root_directory) + +if best_directory is not None: + print(f"The highest score for {robot} is {highest_score}, found in directory: {best_directory}") +else: + print("No valid scores found.") \ No newline at end of file diff --git a/examples/reinforcement_learning/evolsac/SAC_finetuning/sim_controller.py b/examples/reinforcement_learning/evolsac/SAC_finetuning/sim_controller.py new file mode 100644 index 00000000..3d40d567 --- /dev/null +++ b/examples/reinforcement_learning/evolsac/SAC_finetuning/sim_controller.py @@ -0,0 +1,54 @@ +import os +import argparse +import importlib +import numpy as np +import matplotlib.pyplot as plt + +from double_pendulum.model.symbolic_plant import SymbolicDoublePendulum +from double_pendulum.model.model_parameters import model_parameters +from double_pendulum.simulation.simulation import Simulator +from double_pendulum.utils.csv_trajectory import save_trajectory +from double_pendulum.utils.plotting import plot_timeseries + + + + +def simulate_controller(controller, save_dir, mpar, dt, t_final, t0, x0, goal, integrator, controller_name=""): + if not os.path.exists(save_dir): + os.makedirs(save_dir) + + plant = SymbolicDoublePendulum(model_pars=mpar) + sim = Simulator(plant=plant) + + T, X, U = sim.simulate_and_animate( + t0=t0, + x0=x0, + tf=t_final, + dt=dt, + controller=controller, + integrator=integrator, + save_video=True, + video_name=os.path.join(save_dir, "sim_video.gif"), + plot_horizontal_line=True, + horizontal_line_height=0.9 * (mpar.l[0] + mpar.l[1]), + scale=0.25, + ) + + save_trajectory(os.path.join(save_dir, "sim_swingup.csv"), T=T, X_meas=X, U_con=U) + + plot_timeseries( + T, + X, + U, + X_meas=sim.meas_x_values, + pos_y_lines=[-np.pi, 0.0, np.pi], + vel_y_lines=[0.0], + tau_y_lines=[-mpar.tl[1], 0.0, mpar.tl[1]], + save_to=os.path.join(save_dir, "timeseries"), + show=False, + scale=0.5, + ) + + if os.path.exists(f"readmes/{controller_name}.md"): + os.system(f"cp readmes/{controller_name}.md {save_dir}/README.md") + diff --git a/examples/reinforcement_learning/evolsac/SAC_finetuning/simulator.py b/examples/reinforcement_learning/evolsac/SAC_finetuning/simulator.py new file mode 100644 index 00000000..416bffaa --- /dev/null +++ b/examples/reinforcement_learning/evolsac/SAC_finetuning/simulator.py @@ -0,0 +1,130 @@ +from copy import deepcopy + +import numpy as np + +from double_pendulum.model.model_parameters import model_parameters +from double_pendulum.simulation.simulation import Simulator + + +def get_par_list(x0, min_rel, max_rel, n): + if x0 != 0: + if n % 2 == 0: + n = n + 1 + li = np.linspace(min_rel, max_rel, n) + else: + li = np.linspace(0, max_rel, n) + par_list = li * x0 + return par_list + + +class CustomSimulator(Simulator): + def __init__(self, plant, robustness,max_torque, robot): + self.base_plant = deepcopy(plant) + design = "design_C.1" + model = "model_1.1" + # robot = "acrobot" + + model_par_path = ( + "../../../../data/system_identification/identified_parameters/" + + design + + "/" + + model + + "/model_parameters.yml" + ) + mpar = model_parameters(filepath=model_par_path) + # SHOULD USE MAX_TORQUE FROM TRAIN_SAC + torque_limit = [0.0, max_torque] if robot == "acrobot" 
else [max_torque,0.0] + mpar.set_torque_limit(torque_limit) + + self.mpar = mpar + self.robustness = robustness + self.max_torque = max_torque + super().__init__(plant=plant) + + def reset(self): + """ + Reset the Simulator + Resets + - the internal data recorder + - the filter + arguments + - the process noise + - the measurement parameters + - the motor parameters + - perturbations + """ + super().reset() + + self.plant = deepcopy(self.base_plant) + + if np.random.rand() < self.robustness: + N_var = 21 + mpar_vars = [ + "Ir", + "m1r1", + "I1", + "b1", + "cf1", + "m2r2", + "m2", + "I2", + "b2", + "cf2", + ] + + Ir_var_list = np.linspace(0.0, 1e-4, N_var) + m1r1_var_list = get_par_list( + self.mpar.m[0] * self.mpar.r[0], 0.75, 1.25, N_var + ) + I1_var_list = get_par_list(self.mpar.I[0], 0.75, 1.25, N_var) + b1_var_list = np.linspace(-0.1, 0.1, N_var) + cf1_var_list = np.linspace(-0.2, 0.2, N_var) + m2r2_var_list = get_par_list( + self.mpar.m[1] * self.mpar.r[1], 0.75, 1.25, N_var + ) + m2_var_list = get_par_list(self.mpar.m[1], 0.75, 1.25, N_var) + I2_var_list = get_par_list(self.mpar.I[1], 0.75, 1.25, N_var) + b2_var_list = np.linspace(-0.1, 0.1, N_var) + cf2_var_list = np.linspace(-0.2, 0.2, N_var) + + modelpar_var_lists = { + "Ir": Ir_var_list, + "m1r1": m1r1_var_list, + "I1": I1_var_list, + "b1": b1_var_list, + "cf1": cf1_var_list, + "m2r2": m2r2_var_list, + "m2": m2_var_list, + "I2": I2_var_list, + "b2": b2_var_list, + "cf2": cf2_var_list, + } + + mp = np.random.choice(mpar_vars) + var = np.random.choice(modelpar_var_lists[mp]) + + if mp == "Ir": + self.plant.Ir = var + elif mp == "m1r1": + m1 = self.mpar.m[0] + r1 = var / m1 + self.plant.m[0] = m1 + self.plant.com[0] = r1 + elif mp == "I1": + self.plant.I[0] = var + elif mp == "b1": + self.plant.b[0] = var + elif mp == "cf1": + self.plant.coulomb_fric[0] = var + elif mp == "m2r2": + m2 = self.mpar.m[1] + r2 = var / m2 + self.plant.m[1] = m2 + self.plant.com[1] = r2 + elif mp == "m2": + self.plant.m[1] = var + elif mp == "I2": + self.plant.I[1] = var + elif mp == "b2": + self.plant.b[1] = var + elif mp == "cf2": + self.plant.coulomb_fric[1] = var diff --git a/examples/reinforcement_learning/evolsac/SAC_finetuning/test_sac_swing_up.py b/examples/reinforcement_learning/evolsac/SAC_finetuning/test_sac_swing_up.py new file mode 100644 index 00000000..dc78550b --- /dev/null +++ b/examples/reinforcement_learning/evolsac/SAC_finetuning/test_sac_swing_up.py @@ -0,0 +1,92 @@ +import os + +import numpy as np +from controller import SACController +from stable_baselines3 import SAC + +from double_pendulum.model.model_parameters import model_parameters +from double_pendulum.model.symbolic_plant import SymbolicDoublePendulum +from double_pendulum.simulation.gym_env import double_pendulum_dynamics_func +from double_pendulum.simulation.simulation import Simulator +from double_pendulum.utils.plotting import plot_timeseries + +# hyperparameters +robot = "acrobot" + +design = "design_C.1" +model = "model_1.0" +max_torque = 3.5 +scaling_state = True + +if robot == "pendubot": + torque_limit = [max_torque, 0.0] + active_act = 0 + +elif robot == "acrobot": + torque_limit = [0.0, max_torque] + # model_path = "models/max_torque_1.5_robustness_0.2" + model_path = "acrobot_score_0500" + +# import model parameter +model_par_path = ( + "../../../data/system_identification/identified_parameters/" + + design + + "/" + + model + + "/model_parameters.yml" +) +mpar = model_parameters(filepath=model_par_path) + +mpar.set_torque_limit(torque_limit) + +# simulation 
parameters +dt = 0.01 +t_final = 10.0 +integrator = "runge_kutta" +goal = [np.pi, 0.0, 0.0, 0.0] + +plant = SymbolicDoublePendulum(model_pars=mpar) + +sim = Simulator(plant=plant) + +# initialize double pendulum dynamics +dynamics_func = double_pendulum_dynamics_func( + simulator=sim, + dt=dt, + integrator=integrator, + robot=robot, + state_representation=2, + scaling=scaling_state, + max_velocity=50, + torque_limit=torque_limit, +) + +model = SAC.load(model_path) + +# initialize sac controller +controller = SACController(model, dynamics_func, include_time=False) +controller.init() + +# start simulation +T, X, U = sim.simulate_and_animate( + t0=0.0, + x0=[0.0, 0.0, 0.0, 0.0], + tf=t_final, + dt=dt, + controller=controller, + integrator=integrator, + save_video=True, +) + + +# plot time series +# plot_timeseries( +# T, +# X, +# U, +# X_meas=sim.meas_x_values, +# pos_y_lines=[np.pi], +# tau_y_lines=[-torque_limit[active_act], torque_limit[active_act]], +# save_to=os.path.join("./", "timeseries"), +# show=False, +# ) diff --git a/examples/reinforcement_learning/evolsac/SAC_finetuning/wrappers.py b/examples/reinforcement_learning/evolsac/SAC_finetuning/wrappers.py new file mode 100644 index 00000000..780a0b8e --- /dev/null +++ b/examples/reinforcement_learning/evolsac/SAC_finetuning/wrappers.py @@ -0,0 +1,87 @@ +import gymnasium as gym +from collections import deque +import numpy as np + + +class StateActionHistoryWrapper(gym.Wrapper): + def __init__(self, env, history_length=4): + super().__init__(env) + self.history_length = history_length + self.state_history = np.zeros((history_length, *env.observation_space.shape)) + self.action_history = np.zeros((history_length, *env.action_space.shape)) + + state_space = env.observation_space + action_space = env.action_space + low = np.concatenate( + [state_space.low] * history_length + [action_space.low] * history_length + ) + high = np.concatenate( + [state_space.high] * history_length + [action_space.high] * history_length + ) + self.observation_space = gym.spaces.Box(low=low, high=high, dtype=np.float32) + + def reset(self, **kwargs): + observation = self.env.reset() + self.state_history.fill(0) + self.state_history[-1] = observation[0] + self.action_history.fill(0) + return self._get_observation(), {} + + def step(self, action): + observation, reward, terminated, truncated, info = self.env.step(action) + self.state_history[0:-1] = self.state_history[1:] + self.state_history[-1] = observation + self.action_history[0:-1] = self.action_history[1:] + self.action_history[-1] = action + return self._get_observation(), reward, terminated, truncated, info + + def _get_observation(self): + state_flat = self.state_history.reshape((-1)) + action_flat = self.action_history.reshape((-1)) + return np.concatenate([state_flat, action_flat]) + + +# adds noise to velocity +class PerturbStateWrapper(gym.ObservationWrapper): + def __init__(self, env, perturb_std=0.02): + super().__init__(env) + self.std = perturb_std + + def observation(self, observation): + observation = np.atleast_2d(observation) + observation[:, 2:] = ( + observation[:, 2:] + np.random.randn(*observation[:, 2:].shape) * self.std + ) + return observation + + +# probably useless since it's already perturbed by SAC_main_training +class PerturbActionWrapper(gym.ActionWrapper): + def __init__(self, env, perturb_std=0.01): + super().__init__(env) + self.noise = lambda: np.random.randn(*env.observation_space.shape) * perturb_std + + def observation(self, observation): + return observation + self.noise() 
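+    # NOTE: as written, this class subclasses gym.ActionWrapper but only defines observation();
+    # ActionWrapper applies noise through an action(self, action) override, so this wrapper does
+    # not actually perturb anything in its current form.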
+ + +class TimeAwareWrapper(gym.ObservationWrapper): + def __init__(self, env, dt): + super().__init__(env) + self.dt = dt + observation_space = env.observation_space + low = np.append(observation_space.low, 0.0) + high = np.append(observation_space.high, 1.0) + self.observation_space = gym.spaces.Box(low, high, dtype=np.float32) + self.t = 0 + + def observation(self, observation): + return np.append(observation, self.t / 10) + + def step(self, action): + self.t += self.dt + return super().step(action) + + def reset(self, **kwargs): + self.t = 0 + return super().reset(**kwargs) diff --git a/examples/reinforcement_learning/evolsac/SAC_main_training/environment.py b/examples/reinforcement_learning/evolsac/SAC_main_training/environment.py new file mode 100644 index 00000000..fcf2bc5d --- /dev/null +++ b/examples/reinforcement_learning/evolsac/SAC_main_training/environment.py @@ -0,0 +1,109 @@ +import gymnasium as gym +import numpy as np + +class CustomCustomEnv(gym.Env): + def __init__( + self, + dynamics_func, + reset_func, + obs_space=gym.spaces.Box( + np.array([-1.0, -1.0, -1.0, -1.0]), np.array([1.0, 1.0, 1.0, 1.0]) + ), + act_space=gym.spaces.Box(np.array([-1.0, -1.0]), np.array([1.0, 1.0])), + max_episode_steps=1000, + scaling=True, + terminates=True, + ): + self.dynamics_func = dynamics_func + self.reset_func = reset_func + self.observation_space = obs_space + self.action_space = act_space + self.max_episode_steps = max_episode_steps + + self.previous_action = 0 + self.terminates = terminates + + self.observation = self.reset_func() + self.step_counter = 0 + self.stabilisation_mode = False + self.y = [0,0] + self.update_y() + self.scaling = scaling + + l1 = self.dynamics_func.simulator.plant.l[0] + l2 = self.dynamics_func.simulator.plant.l[1] + self.max_height = l1 + l2 + + if self.dynamics_func.robot == "acrobot": + self.control_line = 0.75 * self.max_height + elif self.dynamics_func.robot == "pendubot": + self.control_line = 0.7 * self.max_height + + self.old_obs = None + + def step(self, action): + self.old_obs = np.copy(self.observation) + self.observation = self.dynamics_func( + self.observation, action, scaling=self.scaling + ) + + self.update_y() + self.stabilisation_mode = self.y[1] >= self.control_line + terminated = self.terminated_func() + reward = self.reward_func(terminated, action) + info = {} + truncated = False + self.step_counter += 1 + if self.step_counter >= self.max_episode_steps: + truncated = True + self.previous_action = action[0] + return self.observation, reward, terminated, truncated, info + + def reset(self, seed=None, options=None): + super().reset(seed=seed) + self.observation = self.reset_func() + self.step_counter = 0 + info = {} + self.previous_action = 0 + self.stabilisation_mode = False + self.old_obs = np.copy(self.observation) + return self.observation, info + + def render(self, mode="human"): + pass + + def reward_func(self, terminated, action): + raise NotImplementedError("You have to define the reward function") + + def terminated_func(self): + if self.terminates: + # Checks if we're in stabilisation mode and the ee has fallen below the control line + if self.stabilisation_mode and self.y[1] < self.control_line: + return True + return False + + # Update the y coordinate of the first joint and the end effector + def update_y(self): + theta1, theta2, _, _ = self.dynamics_func.unscale_state(self.observation) + + link_end_points = self.dynamics_func.simulator.plant.forward_kinematics( + [theta1, theta2] + ) + self.y[0] = link_end_points[0][1] + 
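+        # y[1] is the end-effector height; step() compares it against control_line
+        # to decide whether the stabilisation-mode reward is used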
self.y[1] = link_end_points[1][1] + + def gravitational_reward(self): + x = self.dynamics_func.unscale_state(self.observation) + V = self.dynamics_func.simulator.plant.potential_energy(x) + return V + + def V(self): + return self.gravitational_reward() + + def kinetic_reward(self): + x = self.dynamics_func.unscale_state(self.observation) + T = self.dynamics_func.simulator.plant.kinetic_energy(x) + return T + + def T(self): + return self.kinetic_reward() diff --git a/examples/reinforcement_learning/evolsac/SAC_main_training/magic.py b/examples/reinforcement_learning/evolsac/SAC_main_training/magic.py new file mode 100644 index 00000000..fb787bfe --- /dev/null +++ b/examples/reinforcement_learning/evolsac/SAC_main_training/magic.py @@ -0,0 +1,250 @@ +import os +import shutil + +import numpy as np +import pandas +from double_pendulum.analysis.leaderboard import leaderboard_scores +from double_pendulum.controller.evolsac.evolsac_controller import EvolSACController +from double_pendulum.model.model_parameters import model_parameters +from sim_controller import simulate_controller +from stable_baselines3.common.callbacks import BaseCallback + + +def load_controller(dynamics_func, model, window_size, include_time): + name = "evolsac" + leaderboard_config = { + "csv_path": name + "/sim_swingup.csv", + "name": name, + "simple_name": name, + "short_description": "SAC finetuning for both swingup and stabilisation", + "readme_path": f"readmes/{name}.md", + "username": "MarcoCali0", + } + controller = EvolSACController( + model=model, + dynamics_func=dynamics_func, + window_size=window_size, + include_time=include_time, + ) + controller.init() + return controller, leaderboard_config + + +def magic_score( + dynamics_func, + model, + folder, + folder_id, + window_size, + max_torque, + include_time, + index=None, +): + design = "design_C.1" + integrator = "runge_kutta" + dt = 0.002 + t0 = 0.0 + t_final = 10.0 + x0 = [0.0, 0.0, 0.0, 0.0] + goal = [np.pi, 0.0, 0.0, 0.0] + + if folder == "acrobot": + model_par_path = ( + f"../../../../data/system_identification/identified_parameters/" + + design + + "/" + + "model_1.1" + + "/model_parameters.yml" + ) + mpar = model_parameters(filepath=model_par_path) + mpar.set_torque_limit([0.0, max_torque]) + + else: # robot = "pendubot" + model_par_path = ( + f"../../../../data/system_identification/identified_parameters/" + + design + + "/" + + "model_1.1" + + "/model_parameters.yml" + ) + mpar = model_parameters(filepath=model_par_path) + mpar.set_torque_limit([max_torque, 0.0]) + + controller, leaderboard_config = load_controller( + dynamics_func, model, window_size, include_time + ) + + data_dir = f"data_{folder}/{folder_id}" + + if not os.path.exists(data_dir): + os.makedirs(data_dir) + + controller_name = f"evolsac_{folder}" + save_dir = ( + f"{data_dir}/{controller_name}" + if index is None + else f"{data_dir}/{controller_name}/{index}" + ) + if index is not None: + leaderboard_config["csv_path"] = f"{controller_name}/{index}/sim_swingup.csv" + save_to = os.path.join(save_dir, "leaderboard_entry.csv") + + simulate_controller( + controller, save_dir, mpar, dt, t_final, t0, x0, goal, integrator + ) + + conf = leaderboard_config + + conf["csv_path"] = os.path.join(data_dir, leaderboard_config["csv_path"]) + data_paths = {} + data_paths[leaderboard_config["name"]] = conf + + leaderboard_scores( + data_paths=data_paths, + save_to=save_to, + mpar=mpar, + # weights={"swingup_time": 0.5, "max_tau": 0.1, "energy": 0.0, "integ_tau": 0.4, "tau_cost": 0.0, "tau_smoothness": 
0.0}, + weights={ + "swingup_time": 1.0, + "max_tau": 0.0, + "energy": 1.0, + "integ_tau": 0.0, + "tau_cost": 1.0, + "tau_smoothness": 1.0, + "velocity_cost": 1.0, + }, + normalize={ + "swingup_time": 20.0, + "max_tau": 1.0, # not used + "energy": 60.0, + "integ_tau": 1.0, # not used + "tau_cost": 20.0, + "tau_smoothness": 0.1, + "velocity_cost": 400, + }, + link_base="", + score_version="v2", + ) + df = pandas.read_csv(save_to) + score = np.array(df["RealAI Score"])[0] + print("RealAI Score = ", score) + return score + + +def copy_files(src_folder, dest_folder): + if not os.path.exists(dest_folder): + os.makedirs(dest_folder) + + for filename in os.listdir(src_folder): + src_file = os.path.join(src_folder, filename) + dest_file = os.path.join(dest_folder, filename) + + if os.path.isfile(src_file): + shutil.copy2(src_file, dest_file) + + +class MagicCallback(BaseCallback): + def __init__( + self, + path, + folder_id, + dynamics_func, + robot, + window_size, + max_torque, + include_time, + ): + super().__init__(False) + self.path = f"{path}{folder_id}" + if not os.path.exists(path): + os.makedirs(path, exist_ok=True) + self.best = -np.inf + self.folder_id = folder_id + self.dynamics_func = dynamics_func + self.robot = robot + self.window_size = window_size + self.max_torque = max_torque + self.include_time = include_time + + def _on_step(self) -> bool: + score = magic_score( + self.dynamics_func, + self.model, + self.robot, + self.folder_id, + self.window_size, + self.max_torque, + self.include_time, + ) + if score >= self.best: + self.best = score + self.model.save(f"{self.path}/best_model") + copy_files(f"./data/{self.folder_id}/evolsac/", self.path) + return True + +import concurrent.futures + + +class BruteMagicCallback(BaseCallback): + def __init__( + self, + path, + folder_id, + dynamics_func, + robot, + window_size, + max_torque, + include_time, + ): + super().__init__(False) + self.path = f"{path}{folder_id}" + if not os.path.exists(path): + os.makedirs(path, exist_ok=True) + self.folder_id = folder_id + self.dynamics_func = dynamics_func + self.robot = robot + self.window_size = window_size + self.max_torque = max_torque + self.include_time = include_time + self.iteration = 0 + self.executor = concurrent.futures.ThreadPoolExecutor() + + def _on_step(self) -> bool: + self.iteration += 1 + self.executor.submit(lambda: async_store(self.iteration, self)) + return True + + +import tempfile + +from stable_baselines3 import SAC + + +def deepcopy_model(model): + with tempfile.TemporaryDirectory() as tmpdirname: + model_path = os.path.join(tmpdirname, "temp_model") + model.save(model_path) + copied_model = SAC.load(model_path) + return copied_model + + + +def async_store(iteration, callback: BruteMagicCallback): + model = deepcopy_model(callback.model) + _ = magic_score( + callback.dynamics_func, + model, + callback.robot, + callback.folder_id, + callback.window_size, + callback.max_torque, + callback.include_time, + index=iteration, + ) + if not os.path.exists(f"{callback.path}/{iteration}"): + os.makedirs(f"{callback.path}/{iteration}", exist_ok=True) + model.save(f"{callback.path}/{iteration}/best_model") + copy_files( + f"./data/{callback.folder_id}/evolsac/{iteration}", + f"{os.path.join(callback.path, str(iteration))}", + ) diff --git a/examples/reinforcement_learning/evolsac/SAC_main_training/main.py b/examples/reinforcement_learning/evolsac/SAC_main_training/main.py new file mode 100644 index 00000000..2721296b --- /dev/null +++ 
b/examples/reinforcement_learning/evolsac/SAC_main_training/main.py @@ -0,0 +1,214 @@ +import os +import sys + +import gymnasium as gym +import numpy as np +import stable_baselines3 +import torch +from environment import CustomCustomEnv +from magic import MagicCallback, BruteMagicCallback +from simulator import CustomSimulator +from stable_baselines3 import SAC +from stable_baselines3.common.callbacks import EvalCallback +from stable_baselines3.common.env_util import make_vec_env +from stable_baselines3.sac.policies import MlpPolicy +from wrappers import * + +from double_pendulum.model.model_parameters import model_parameters +from double_pendulum.model.symbolic_plant import SymbolicDoublePendulum +from double_pendulum.simulation.gym_env import double_pendulum_dynamics_func +from double_pendulum.simulation.simulation import Simulator + +np.random.seed(0) +torch.manual_seed(0) +torch.random.manual_seed(0) +torch.backends.cudnn.deterministic = True +stable_baselines3.common.utils.set_random_seed(0) + + +class MyEnv(CustomCustomEnv): + def reward_func(self, terminated, action): + _, theta2, omega1, omega2 = self.dynamics_func.unscale_state(self.observation) + costheta2 = np.cos(theta2) + + a = action[0] + delta_action = np.abs(a - self.previous_action) + lambda_delta = 0.05 + lambda_action = 0.02 + lambda_velocities = 0.01 + if not terminated: + if self.stabilisation_mode: + reward = ( + self.V() + + 2 * (1 + costheta2) ** 2 + - self.T() + - 5 * lambda_action * np.square(a) + - 3 * lambda_delta * delta_action + ) + else: + reward = ( + self.V() + - lambda_action * np.square(a) + - 2 * lambda_velocities * (omega1**2 + omega2**2) + - 3 * lambda_delta * delta_action + ) + else: + reward = -1.0 + return reward + + +assert ( + len(sys.argv) >= 3 +), "Please provide: [max torque] [robustness] [window_size (0 = no window)] [include_time]" +max_torque = float(sys.argv[1]) +robustness = float(sys.argv[2]) +WINDOW_SIZE = int(sys.argv[3]) +INCLUDE_TIME = bool(int(sys.argv[4])) +FOLDER_ID = f"{os.path.basename(__file__)}-{max_torque}-{robustness}-{WINDOW_SIZE}-{int(INCLUDE_TIME)}" +TERMINATION = False + +# define robot variation +robot = "pendubot" + +# setting log path for the training +log_dir = f"./log_data_{robot}/SAC_training/{FOLDER_ID}" +if not os.path.exists(log_dir): + os.makedirs(log_dir) + +design = "design_C.1" +model = "model_1.1" +model_par_path = ( + "../../../../data/system_identification/identified_parameters/" + + design + + "/" + + model + + "/model_parameters.yml" +) + +# model and reward parameter +max_velocity = 50 +torque_limit = [max_torque, 0] if robot == "pendubot" else [0, max_torque] + +mpar = model_parameters(filepath=model_par_path) +mpar.set_torque_limit(torque_limit) +dt = 0.01 +integrator = "runge_kutta" + +plant = SymbolicDoublePendulum(model_pars=mpar) +simulator = CustomSimulator(plant=plant, robustness=robustness, max_torque=max_torque) +eval_simulator = Simulator(plant=plant) + +# learning environment parameters +state_representation = 2 + +obs_space = gym.spaces.Box(np.array([-1.0] * 4), np.array([1.0] * 4)) +act_space = gym.spaces.Box(np.array([-1]), np.array([1])) +max_steps = 10 / dt + +############################################################################### + +n_envs = 50 +training_steps = 30_000_000_000_000 +verbose = 1 +eval_freq = 10_000 +n_eval_episodes = 1 +# a patto che i reward istantanei siano piccoli +# 0.01 -> 1500000 -> 7 +# 0.003 -> 1500000 -> 46 +# 0.001 -> 1500000 -> 38 +# 0.0003 -> 1500000 -> 19 +learning_rate = 0.001 + 
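+# English translation of the Italian note above: "provided that the instantaneous rewards are small"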
+############################################################################### + +# initialize double pendulum dynamics +dynamics_func = double_pendulum_dynamics_func( + simulator=simulator, + dt=dt, + integrator=integrator, + robot=robot, + state_representation=state_representation, + max_velocity=max_velocity, + torque_limit=torque_limit, +) + +eval_dynamics_func = double_pendulum_dynamics_func( + simulator=eval_simulator, + dt=dt, + integrator=integrator, + robot=robot, + state_representation=state_representation, + max_velocity=max_velocity, + torque_limit=torque_limit, +) + + +def zero_reset_func(): + observation = [-1.0, -1.0, 0.0, 0.0] + return observation + + +############################################################################### + + +def wrap(env): + if INCLUDE_TIME: + env = TimeAwareWrapper(env, dt=dt) + if WINDOW_SIZE > 0: + env = StateActionHistoryWrapper(env, history_length=WINDOW_SIZE) + return env + + +envs = make_vec_env( + env_id=MyEnv, + wrapper_class=wrap, + n_envs=n_envs, + env_kwargs={ + "dynamics_func": dynamics_func, + "reset_func": zero_reset_func, + "terminates": TERMINATION, + "obs_space": obs_space, + "act_space": act_space, + "max_episode_steps": max_steps, + }, +) + +eval_env = wrap( + MyEnv( + dynamics_func=eval_dynamics_func, + reset_func=zero_reset_func, + obs_space=obs_space, + act_space=act_space, + max_episode_steps=max_steps, + terminates=TERMINATION, + ) +) + +eval_callback = EvalCallback( + eval_env, + callback_after_eval=BruteMagicCallback( + f"./models_{robot}/", + folder_id=FOLDER_ID, + dynamics_func=eval_dynamics_func, + robot=robot, + window_size=WINDOW_SIZE, + max_torque=max_torque, + include_time=INCLUDE_TIME, + ), + best_model_save_path=os.path.join(log_dir, "best_model"), + log_path=log_dir, + eval_freq=eval_freq, + verbose=verbose, + n_eval_episodes=n_eval_episodes, +) + +############################################################################### + +agent = SAC( + MlpPolicy, + envs, + verbose=verbose, + learning_rate=learning_rate, +) + +agent.learn(total_timesteps=training_steps, callback=eval_callback) diff --git a/examples/reinforcement_learning/evolsac/SAC_main_training/sim_controller.py b/examples/reinforcement_learning/evolsac/SAC_main_training/sim_controller.py new file mode 100644 index 00000000..f8cc2666 --- /dev/null +++ b/examples/reinforcement_learning/evolsac/SAC_main_training/sim_controller.py @@ -0,0 +1,50 @@ +import os +import numpy as np + +from double_pendulum.model.symbolic_plant import SymbolicDoublePendulum +from double_pendulum.simulation.simulation import Simulator +from double_pendulum.utils.csv_trajectory import save_trajectory +from double_pendulum.utils.plotting import plot_timeseries + + + + +def simulate_controller(controller, save_dir, mpar, dt, t_final, t0, x0, goal, integrator, controller_name=""): + if not os.path.exists(save_dir): + os.makedirs(save_dir) + + plant = SymbolicDoublePendulum(model_pars=mpar) + sim = Simulator(plant=plant) + + T, X, U = sim.simulate_and_animate( + t0=t0, + x0=x0, + tf=t_final, + dt=dt, + controller=controller, + integrator=integrator, + save_video=True, + video_name=os.path.join(save_dir, "sim_video.gif"), + plot_horizontal_line=True, + horizontal_line_height=0.9 * (mpar.l[0] + mpar.l[1]), + scale=0.25, + ) + + save_trajectory(os.path.join(save_dir, "sim_swingup.csv"), T=T, X_meas=X, U_con=U) + + plot_timeseries( + T, + X, + U, + X_meas=sim.meas_x_values, + pos_y_lines=[-np.pi, 0.0, np.pi], + vel_y_lines=[0.0], + tau_y_lines=[-mpar.tl[1], 0.0, mpar.tl[1]], + 
save_to=os.path.join(save_dir, "timeseries"), + show=False, + scale=0.5, + ) + + if os.path.exists(f"readmes/{controller_name}.md"): + os.system(f"cp readmes/{controller_name}.md {save_dir}/README.md") + diff --git a/examples/reinforcement_learning/evolsac/SAC_main_training/simulator.py b/examples/reinforcement_learning/evolsac/SAC_main_training/simulator.py new file mode 100644 index 00000000..e8b61f3c --- /dev/null +++ b/examples/reinforcement_learning/evolsac/SAC_main_training/simulator.py @@ -0,0 +1,129 @@ +from copy import deepcopy + +import numpy as np +from double_pendulum.model.model_parameters import model_parameters +from double_pendulum.simulation.simulation import Simulator + + +def get_par_list(x0, min_rel, max_rel, n): + if x0 != 0: + if n % 2 == 0: + n = n + 1 + li = np.linspace(min_rel, max_rel, n) + else: + li = np.linspace(0, max_rel, n) + par_list = li * x0 + return par_list + + +class CustomSimulator(Simulator): + def __init__(self, plant, robustness, max_torque): + self.base_plant = deepcopy(plant) + design = "design_C.1" + model = "model_1.1" + robot = "acrobot" + + model_par_path = ( + "../../../../data/system_identification/identified_parameters/" + + design + + "/" + + model + + "/model_parameters.yml" + ) + mpar = model_parameters(filepath=model_par_path) + # SHOULD USE MAX_TORQUE FROM TRAIN_SAC + torque_limit = [0.0, max_torque] if robot == "acrobot" else [max_torque,0.0] + mpar.set_torque_limit(torque_limit) + + self.mpar = mpar + self.robustness = robustness + self.max_torque = max_torque + super().__init__(plant=plant) + + def reset(self): + """ + Reset the Simulator + Resets + - the internal data recorder + - the filter + arguments + - the process noise + - the measurement parameters + - the motor parameters + - perturbations + """ + super().reset() + + self.plant = deepcopy(self.base_plant) + + if np.random.rand() < self.robustness: + N_var = 21 + mpar_vars = [ + "Ir", + "m1r1", + "I1", + "b1", + "cf1", + "m2r2", + "m2", + "I2", + "b2", + "cf2", + ] + + Ir_var_list = np.linspace(0.0, 1e-4, N_var) + m1r1_var_list = get_par_list( + self.mpar.m[0] * self.mpar.r[0], 0.75, 1.25, N_var + ) + I1_var_list = get_par_list(self.mpar.I[0], 0.75, 1.25, N_var) + b1_var_list = np.linspace(-0.1, 0.1, N_var) + cf1_var_list = np.linspace(-0.2, 0.2, N_var) + m2r2_var_list = get_par_list( + self.mpar.m[1] * self.mpar.r[1], 0.75, 1.25, N_var + ) + m2_var_list = get_par_list(self.mpar.m[1], 0.75, 1.25, N_var) + I2_var_list = get_par_list(self.mpar.I[1], 0.75, 1.25, N_var) + b2_var_list = np.linspace(-0.1, 0.1, N_var) + cf2_var_list = np.linspace(-0.2, 0.2, N_var) + + modelpar_var_lists = { + "Ir": Ir_var_list, + "m1r1": m1r1_var_list, + "I1": I1_var_list, + "b1": b1_var_list, + "cf1": cf1_var_list, + "m2r2": m2r2_var_list, + "m2": m2_var_list, + "I2": I2_var_list, + "b2": b2_var_list, + "cf2": cf2_var_list, + } + + mp = np.random.choice(mpar_vars) + var = np.random.choice(modelpar_var_lists[mp]) + + if mp == "Ir": + self.plant.Ir = var + elif mp == "m1r1": + m1 = self.mpar.m[0] + r1 = var / m1 + self.plant.m[0] = m1 + self.plant.com[0] = r1 + elif mp == "I1": + self.plant.I[0] = var + elif mp == "b1": + self.plant.b[0] = var + elif mp == "cf1": + self.plant.coulomb_fric[0] = var + elif mp == "m2r2": + m2 = self.mpar.m[1] + r2 = var / m2 + self.plant.m[1] = m2 + self.plant.com[1] = r2 + elif mp == "m2": + self.plant.m[1] = var + elif mp == "I2": + self.plant.I[1] = var + elif mp == "b2": + self.plant.b[1] = var + elif mp == "cf2": + self.plant.coulomb_fric[1] = var diff --git 
a/examples/reinforcement_learning/evolsac/SAC_main_training/test_sac_swing_up.py b/examples/reinforcement_learning/evolsac/SAC_main_training/test_sac_swing_up.py new file mode 100644 index 00000000..654b4b8c --- /dev/null +++ b/examples/reinforcement_learning/evolsac/SAC_main_training/test_sac_swing_up.py @@ -0,0 +1,92 @@ +import os + +import numpy as np +from controller import SACController +from stable_baselines3 import SAC + +from double_pendulum.model.model_parameters import model_parameters +from double_pendulum.model.symbolic_plant import SymbolicDoublePendulum +from double_pendulum.simulation.gym_env import double_pendulum_dynamics_func +from double_pendulum.simulation.simulation import Simulator +from double_pendulum.utils.plotting import plot_timeseries + +# hyperparameters +robot = "acrobot" + +design = "design_C.1" +model = "model_1.1" +max_torque = 3 +scaling_state = True + +if robot == "pendubot": + torque_limit = [max_torque, 0.0] + active_act = 0 + +elif robot == "acrobot": + torque_limit = [0.0, max_torque] + # model_path = "models/max_torque_1.5_robustness_0.2" + model_path = "acrobot_score_0500" + +# import model parameter +model_par_path = ( + "../../../../data/system_identification/identified_parameters/" + + design + + "/" + + model + + "/model_parameters.yml" +) +mpar = model_parameters(filepath=model_par_path) + +mpar.set_torque_limit(torque_limit) + +# simulation parameters +dt = 0.01 +t_final = 10.0 +integrator = "runge_kutta" +goal = [np.pi, 0.0, 0.0, 0.0] + +plant = SymbolicDoublePendulum(model_pars=mpar) + +sim = Simulator(plant=plant) + +# initialize double pendulum dynamics +dynamics_func = double_pendulum_dynamics_func( + simulator=sim, + dt=dt, + integrator=integrator, + robot=robot, + state_representation=2, + scaling=scaling_state, + max_velocity=50, + torque_limit=torque_limit, +) + +model = SAC.load(model_path) + +# initialize sac controller +controller = SACController(model, dynamics_func, include_time=False) +controller.init() + +# start simulation +T, X, U = sim.simulate_and_animate( + t0=0.0, + x0=[0.0, 0.0, 0.0, 0.0], + tf=t_final, + dt=dt, + controller=controller, + integrator=integrator, + save_video=True, +) + + +# plot time series +# plot_timeseries( +# T, +# X, +# U, +# X_meas=sim.meas_x_values, +# pos_y_lines=[np.pi], +# tau_y_lines=[-torque_limit[active_act], torque_limit[active_act]], +# save_to=os.path.join("./", "timeseries"), +# show=False, +# ) diff --git a/examples/reinforcement_learning/evolsac/SAC_main_training/wrappers.py b/examples/reinforcement_learning/evolsac/SAC_main_training/wrappers.py new file mode 100644 index 00000000..780a0b8e --- /dev/null +++ b/examples/reinforcement_learning/evolsac/SAC_main_training/wrappers.py @@ -0,0 +1,87 @@ +import gymnasium as gym +from collections import deque +import numpy as np + + +class StateActionHistoryWrapper(gym.Wrapper): + def __init__(self, env, history_length=4): + super().__init__(env) + self.history_length = history_length + self.state_history = np.zeros((history_length, *env.observation_space.shape)) + self.action_history = np.zeros((history_length, *env.action_space.shape)) + + state_space = env.observation_space + action_space = env.action_space + low = np.concatenate( + [state_space.low] * history_length + [action_space.low] * history_length + ) + high = np.concatenate( + [state_space.high] * history_length + [action_space.high] * history_length + ) + self.observation_space = gym.spaces.Box(low=low, high=high, dtype=np.float32) + + def reset(self, **kwargs): + observation = 
self.env.reset() + self.state_history.fill(0) + self.state_history[-1] = observation[0] + self.action_history.fill(0) + return self._get_observation(), {} + + def step(self, action): + observation, reward, terminated, truncated, info = self.env.step(action) + self.state_history[0:-1] = self.state_history[1:] + self.state_history[-1] = observation + self.action_history[0:-1] = self.action_history[1:] + self.action_history[-1] = action + return self._get_observation(), reward, terminated, truncated, info + + def _get_observation(self): + state_flat = self.state_history.reshape((-1)) + action_flat = self.action_history.reshape((-1)) + return np.concatenate([state_flat, action_flat]) + + +# adds noise to velocity +class PerturbStateWrapper(gym.ObservationWrapper): + def __init__(self, env, perturb_std=0.02): + super().__init__(env) + self.std = perturb_std + + def observation(self, observation): + observation = np.atleast_2d(observation) + observation[:, 2:] = ( + observation[:, 2:] + np.random.randn(*observation[:, 2:].shape) * self.std + ) + return observation + + +# probably useless since it's already perturbed by SAC_main_training +class PerturbActionWrapper(gym.ActionWrapper): + def __init__(self, env, perturb_std=0.01): + super().__init__(env) + self.noise = lambda: np.random.randn(*env.observation_space.shape) * perturb_std + + def observation(self, observation): + return observation + self.noise() + + +class TimeAwareWrapper(gym.ObservationWrapper): + def __init__(self, env, dt): + super().__init__(env) + self.dt = dt + observation_space = env.observation_space + low = np.append(observation_space.low, 0.0) + high = np.append(observation_space.high, 1.0) + self.observation_space = gym.spaces.Box(low, high, dtype=np.float32) + self.t = 0 + + def observation(self, observation): + return np.append(observation, self.t / 10) + + def step(self, action): + self.t += self.dt + return super().step(action) + + def reset(self, **kwargs): + self.t = 0 + return super().reset(**kwargs) diff --git a/examples/reinforcement_learning/evolsac/SNES_finetuning/environment.py b/examples/reinforcement_learning/evolsac/SNES_finetuning/environment.py new file mode 100644 index 00000000..fcf2bc5d --- /dev/null +++ b/examples/reinforcement_learning/evolsac/SNES_finetuning/environment.py @@ -0,0 +1,109 @@ +import gymnasium as gym +import numpy as np + +class CustomCustomEnv(gym.Env): + def __init__( + self, + dynamics_func, + reset_func, + obs_space=gym.spaces.Box( + np.array([-1.0, -1.0, -1.0, -1.0]), np.array([1.0, 1.0, 1.0, 1.0]) + ), + act_space=gym.spaces.Box(np.array([-1.0, -1.0]), np.array([1.0, 1.0])), + max_episode_steps=1000, + scaling=True, + terminates=True, + ): + self.dynamics_func = dynamics_func + self.reset_func = reset_func + self.observation_space = obs_space + self.action_space = act_space + self.max_episode_steps = max_episode_steps + + self.previous_action = 0 + self.terminates = terminates + + self.observation = self.reset_func() + self.step_counter = 0 + self.stabilisation_mode = False + self.y = [0,0] + self.update_y() + self.scaling = scaling + + l1 = self.dynamics_func.simulator.plant.l[0] + l2 = self.dynamics_func.simulator.plant.l[1] + self.max_height = l1 + l2 + + if self.dynamics_func.robot == "acrobot": + self.control_line = 0.75 * self.max_height + elif self.dynamics_func.robot == "pendubot": + self.control_line = 0.7 * self.max_height + + self.old_obs = None + + def step(self, action): + self.old_obs = np.copy(self.observation) + self.observation = self.dynamics_func( + 
self.observation, action, scaling=self.scaling + ) + + self.update_y() + self.stabilisation_mode = self.y[1] >= self.control_line + terminated = self.terminated_func() + reward = self.reward_func(terminated, action) + info = {} + truncated = False + self.step_counter += 1 + if self.step_counter >= self.max_episode_steps: + truncated = True + self.previous_action = action[0] + return self.observation, reward, terminated, truncated, info + + def reset(self, seed=None, options=None): + super().reset(seed=seed) + self.observation = self.reset_func() + self.step_counter = 0 + info = {} + self.previous_action = 0 + self.stabilisation_mode = False + self.old_obs = np.copy(self.observation) + return self.observation, info + + def render(self, mode="human"): + pass + + def reward_func(self, terminated, action): + raise NotImplementedError("You have to define the reward function") + + def terminated_func(self): + if self.terminates: + # Checks if we're in stabilisation mode and the ee has fallen below the control line + if self.stabilisation_mode and self.y[1] < self.control_line: + return True + return False + + # Update the y coordinate of the first joint and the end effector + def update_y(self): + theta1, theta2, _, _ = self.dynamics_func.unscale_state(self.observation) + + link_end_points = self.dynamics_func.simulator.plant.forward_kinematics( + [theta1, theta2] + ) + self.y[0] = link_end_points[0][1] + self.y[1] = link_end_points[1][1] + + def gravitational_reward(self): + x = self.dynamics_func.unscale_state(self.observation) + V = self.dynamics_func.simulator.plant.potential_energy(x) + return V + + def V(self): + return self.gravitational_reward() + + def kinetic_reward(self): + x = self.dynamics_func.unscale_state(self.observation) + T = self.dynamics_func.simulator.plant.kinetic_energy(x) + return T + + def T(self): + return self.kinetic_reward() diff --git a/examples/reinforcement_learning/evolsac/SNES_finetuning/magic.py b/examples/reinforcement_learning/evolsac/SNES_finetuning/magic.py new file mode 100644 index 00000000..3888e9cd --- /dev/null +++ b/examples/reinforcement_learning/evolsac/SNES_finetuning/magic.py @@ -0,0 +1,254 @@ +import os +import shutil + +import numpy as np +import pandas +from double_pendulum.analysis.leaderboard import leaderboard_scores +from double_pendulum.controller.evolsac.evolsac_controller import EvolSACController +from double_pendulum.model.model_parameters import model_parameters +from sim_controller import simulate_controller +from stable_baselines3.common.callbacks import BaseCallback + + +def load_controller(dynamics_func, model, window_size, include_time, evaluating): + name = "sac" + leaderboard_config = { + "csv_path": name + "/sim_swingup.csv", + "name": name, + "simple_name": "sac", + "short_description": "SAC for both swingup and stabilisation", + "readme_path": f"readmes/{name}.md", + "username": "MarcoCali0", + } + controller = EvolSACController( + model=model, + dynamics_func=dynamics_func, + window_size=window_size, + include_time=include_time, + evaluating=evaluating, + SNES_phase=True, + ) + controller.init() + return controller, leaderboard_config + + +def magic_score( + dynamics_func, + model, + folder, + folder_id, + window_size, + max_torque, + include_time, + index=None, + evaluating=True, +): + design = "design_C.1" + integrator = "runge_kutta" + dt = 0.002 + t0 = 0.0 + t_final = 10.0 + x0 = [0.0, 0.0, 0.0, 0.0] + goal = [np.pi, 0.0, 0.0, 0.0] + + if folder == "acrobot": + model_par_path = ( + 
"../../../../data/system_identification/identified_parameters/" + + design + + "/" + + "model_1.1" + + "/model_parameters.yml" + ) + mpar = model_parameters(filepath=model_par_path) + mpar.set_torque_limit([0.0, max_torque]) + + else: # robot = "pendubot" + design = "design_C.1" + model_par_path = ( + "../../../../data/system_identification/identified_parameters/" + + design + + "/" + + "model_1.1" + + "/model_parameters.yml" + ) + mpar = model_parameters(filepath=model_par_path) + mpar.set_torque_limit([max_torque, 0.0]) + + controller, leaderboard_config = load_controller( + dynamics_func, model, window_size, include_time, evaluating=evaluating + ) + + data_dir = f"data_{folder}/{folder_id}" + + if not os.path.exists(data_dir): + os.makedirs(data_dir) + + controller_name = f"evolsac_{folder}" + save_dir = ( + f"{data_dir}/{controller_name}" + if index is None + else f"{data_dir}/{controller_name}/{index}" + ) + if index is not None: + leaderboard_config["csv_path"] = f"{controller_name}/{index}/sim_swingup.csv" + save_to = os.path.join(save_dir, "leaderboard_entry.csv") + + simulate_controller( + controller, save_dir, mpar, dt, t_final, t0, x0, goal, integrator + ) + + conf = leaderboard_config + + conf["csv_path"] = os.path.join(data_dir, leaderboard_config["csv_path"]) + data_paths = {} + data_paths[leaderboard_config["name"]] = conf + + leaderboard_scores( + data_paths=data_paths, + save_to=save_to, + mpar=mpar, + # weights={"swingup_time": 0.5, "max_tau": 0.1, "energy": 0.0, "integ_tau": 0.4, "tau_cost": 0.0, "tau_smoothness": 0.0}, + weights={ + "swingup_time": 1.0, + "max_tau": 0.0, + "energy": 1.0, + "integ_tau": 0.0, + "tau_cost": 1.0, + "tau_smoothness": 1.0, + "velocity_cost": 1.0, + }, + normalize={ + "swingup_time": 20.0, + "max_tau": 1.0, # not used + "energy": 60.0, + "integ_tau": 1.0, # not used + "tau_cost": 20.0, + "tau_smoothness": 0.1, + "velocity_cost": 400, + }, + link_base="", + score_version="v2", + ) + df = pandas.read_csv(save_to) + score = np.array(df["RealAI Score"])[0] + print("RealAI Score = ", score) + return score + + +def copy_files(src_folder, dest_folder): + if not os.path.exists(dest_folder): + os.makedirs(dest_folder) + + for filename in os.listdir(src_folder): + src_file = os.path.join(src_folder, filename) + dest_file = os.path.join(dest_folder, filename) + + if os.path.isfile(src_file): + shutil.copy2(src_file, dest_file) + + +class MagicCallback(BaseCallback): + def __init__( + self, + path, + folder_id, + dynamics_func, + robot, + window_size, + max_torque, + include_time, + ): + super().__init__(False) + self.path = f"{path}{folder_id}" + if not os.path.exists(path): + os.makedirs(path, exist_ok=True) + self.best = -np.inf + self.folder_id = folder_id + self.dynamics_func = dynamics_func + self.robot = robot + self.window_size = window_size + self.max_torque = max_torque + self.include_time = include_time + + def _on_step(self) -> bool: + score = magic_score( + self.dynamics_func, + self.model, + self.robot, + self.folder_id, + self.window_size, + self.max_torque, + self.include_time, + ) + if score >= self.best: + self.best = score + self.model.save(f"{self.path}/best_model") + copy_files(f"./data/{self.folder_id}/evolsac/", self.path) + return True + +import concurrent.futures + + +class BruteMagicCallback(BaseCallback): + def __init__( + self, + path, + folder_id, + dynamics_func, + robot, + window_size, + max_torque, + include_time, + ): + super().__init__(False) + self.path = f"{path}{folder_id}" + if not os.path.exists(path): + 
os.makedirs(path, exist_ok=True) + self.folder_id = folder_id + self.dynamics_func = dynamics_func + self.robot = robot + self.window_size = window_size + self.max_torque = max_torque + self.include_time = include_time + self.iteration = 0 + self.executor = concurrent.futures.ThreadPoolExecutor() + + def _on_step(self) -> bool: + self.iteration += 1 + self.executor.submit(lambda: async_store(self.iteration, self)) + return True + + +import tempfile + +from stable_baselines3 import SAC + + +def deepcopy_model(model): + with tempfile.TemporaryDirectory() as tmpdirname: + model_path = os.path.join(tmpdirname, "temp_model") + model.save(model_path) + copied_model = SAC.load(model_path) + return copied_model + + + +def async_store(iteration, callback: BruteMagicCallback): + model = deepcopy_model(callback.model) + _ = magic_score( + callback.dynamics_func, + model, + callback.robot, + callback.folder_id, + callback.window_size, + callback.max_torque, + callback.include_time, + index=iteration, + ) + if not os.path.exists(f"{callback.path}/{iteration}"): + os.makedirs(f"{callback.path}/{iteration}", exist_ok=True) + model.save(f"{callback.path}/{iteration}/best_model") + copy_files( + f"./data/{callback.folder_id}/evolsac/{iteration}", + f"{os.path.join(callback.path, str(iteration))}", + ) diff --git a/examples/reinforcement_learning/evolsac/SNES_finetuning/main.py b/examples/reinforcement_learning/evolsac/SNES_finetuning/main.py new file mode 100644 index 00000000..c97f7ebd --- /dev/null +++ b/examples/reinforcement_learning/evolsac/SNES_finetuning/main.py @@ -0,0 +1,337 @@ +import os +import sys + +import gymnasium as gym +import numpy as np +import stable_baselines3 +import torch +from environment import CustomCustomEnv +from magic import MagicCallback, BruteMagicCallback +from simulator import CustomSimulator +from stable_baselines3 import SAC +from stable_baselines3.common.callbacks import EvalCallback +from stable_baselines3.common.env_util import make_vec_env +from stable_baselines3.sac.policies import MlpPolicy +from wrappers import * + +from double_pendulum.model.model_parameters import model_parameters +from double_pendulum.model.symbolic_plant import SymbolicDoublePendulum +from double_pendulum.simulation.gym_env import double_pendulum_dynamics_func +from double_pendulum.simulation.simulation import Simulator + +np.random.seed(0) +torch.manual_seed(0) +torch.random.manual_seed(0) +torch.backends.cudnn.deterministic = True +stable_baselines3.common.utils.set_random_seed(0) + + +class MyEnv(CustomCustomEnv): + def reward_func(self, terminated, action): + _, theta2, omega1, omega2 = self.dynamics_func.unscale_state(self.observation) + costheta2 = np.cos(theta2) + + a = action[0] + delta_action = np.abs(a - self.previous_action) + lambda_delta = 0.05 + lambda_action = 0.02 + lambda_velocities = 0.00005 + if not terminated: + if self.stabilisation_mode: + reward = ( + self.V() + + 2 * (1 + costheta2) ** 2 + - self.T() + - 5 * lambda_action * np.square(a) + - 3 * lambda_delta * delta_action + ) + else: + reward = ( + (1 - np.abs(a)) * self.V() + - lambda_action * np.square(a) + - 2 * lambda_velocities * (omega1**2 + omega2**2) + - 3 * lambda_delta * delta_action + ) + else: + reward = -1.0 + return reward + + +assert ( + len(sys.argv) >= 3 +), "Please provide: [max torque] [robustness] [window_size (0 = no window)] [include_time]" +max_torque = float(sys.argv[1]) +robustness = float(sys.argv[2]) +WINDOW_SIZE = int(sys.argv[3]) +INCLUDE_TIME = bool(int(sys.argv[4])) +robot = 
str(sys.argv[5]) +FOLDER_ID = f"{os.path.basename(__file__)}-{max_torque}-{robustness}-{WINDOW_SIZE}-{int(INCLUDE_TIME)}" +TERMINATION = False + +# setting log path for the training +log_dir = f"./log_data_{robot}/SAC_training/{FOLDER_ID}" +if not os.path.exists(log_dir): + os.makedirs(log_dir) + +design = "design_C.1" +model = "model_1.1" +model_par_path = ( + "../../../../data/system_identification/identified_parameters/" + + design + + "/" + + model + + "/model_parameters.yml" +) + +# model and reward parameter +max_velocity = 50 +torque_limit = [max_torque, 0] if robot == "pendubot" else [0, max_torque] + +mpar = model_parameters(filepath=model_par_path) +mpar.set_torque_limit(torque_limit) +dt = 0.01 +integrator = "runge_kutta" + +plant = SymbolicDoublePendulum(model_pars=mpar) +simulator = CustomSimulator(plant=plant, robustness=robustness, max_torque=max_torque, robot=robot) +eval_simulator = Simulator(plant=plant) + +# learning environment parameters +state_representation = 2 + +obs_space = gym.spaces.Box(np.array([-1.0] * 4), np.array([1.0] * 4)) +act_space = gym.spaces.Box(np.array([-1]), np.array([1])) +max_steps = 10 / dt + +############################################################################### + +n_envs = 1 +training_steps = 30_000_000_000_000 +verbose = 1 +eval_freq = 10_000 +n_eval_episodes = 1 +# a patto che i reward istantanei siano piccoli +# 0.01 -> 1500000 -> 7 +# 0.003 -> 1500000 -> 46 +# 0.001 -> 1500000 -> 38 +# 0.0003 -> 1500000 -> 19 +learning_rate = 0.001 + +############################################################################### + +# initialize double pendulum dynamics +dynamics_func = double_pendulum_dynamics_func( + simulator=simulator, + dt=dt, + integrator=integrator, + robot=robot, + state_representation=state_representation, + max_velocity=max_velocity, + torque_limit=torque_limit, +) + +eval_dynamics_func = double_pendulum_dynamics_func( + simulator=eval_simulator, + dt=dt, + integrator=integrator, + robot=robot, + state_representation=state_representation, + max_velocity=max_velocity, + torque_limit=torque_limit, +) + + +def zero_reset_func(): + observation = [-1.0, -1.0, 0.0, 0.0] + return observation + + +############################################################################### + + +def wrap(env): + if INCLUDE_TIME: + env = TimeAwareWrapper(env, dt=dt) + if WINDOW_SIZE > 0: + env = StateActionHistoryWrapper(env, history_length=WINDOW_SIZE) + return env + + +envs = make_vec_env( + env_id=MyEnv, + wrapper_class=wrap, + n_envs=n_envs, + env_kwargs={ + "dynamics_func": dynamics_func, + "reset_func": zero_reset_func, + "terminates": TERMINATION, + "obs_space": obs_space, + "act_space": act_space, + "max_episode_steps": max_steps, + }, +) + +eval_env = wrap( + MyEnv( + dynamics_func=eval_dynamics_func, + reset_func=zero_reset_func, + obs_space=obs_space, + act_space=act_space, + max_episode_steps=max_steps, + terminates=TERMINATION, + ) +) + +eval_callback = EvalCallback( + eval_env, + callback_after_eval=MagicCallback( + f"./models_{robot}/", + folder_id=FOLDER_ID, + dynamics_func=eval_dynamics_func, + robot=robot, + window_size=WINDOW_SIZE, + max_torque=max_torque, + include_time=INCLUDE_TIME, + ), + best_model_save_path=os.path.join(log_dir, "best_model"), + log_path=log_dir, + eval_freq=eval_freq, + verbose=verbose, + n_eval_episodes=n_eval_episodes, +) + +############################################################################### + +# agent = SAC( +# MlpPolicy, +# envs, +# verbose=verbose, +# learning_rate=learning_rate, +# ) + 
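+# Note (added comment): in this SNES fine-tuning script the plain SAC training entry point is
+# intentionally left commented out. Instead, a pre-trained agent is loaded below as
+# REFERENCE_AGENT, the parameters of its actor's latent_pi network are flattened into a single
+# vector, and that vector is evolved with SNES using the leaderboard score from magic_score as
+# the fitness.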
+# agent.learn(total_timesteps=training_steps, callback=eval_callback) + + +REFERENCE_AGENT = SAC( + MlpPolicy, + envs, + verbose=verbose, + learning_rate=learning_rate, +) +reference_agent_ = "/home/alberto_sinigaglia/test_dp_clone/leaderboard/pendubot/pendubot_best_model_0567_0800.zip" if robot == "pendubot" else "/home/alberto_sinigaglia/test_dp_clone/leaderboard/acrobot/acrobot_best_model_0504_0700.zip" +REFERENCE_AGENT.set_parameters(reference_agent_) + +from evotorch import Problem +from evotorch.algorithms.distributed.gaussian import SNES +from magic import deepcopy_model, magic_score + +WORKERS = 40 +i = 0 + +import uuid +def simulate(policy_params): + global i + i += 1 + IDX = i + agent = deepcopy_model(REFERENCE_AGENT) + with torch.no_grad(): + state_dict = agent.policy.actor.latent_pi.state_dict() + keys = list(state_dict.keys()) + split_sizes = [torch.numel(state_dict[key]) for key in keys] + params_split = torch.split(policy_params.clone().detach(), split_sizes) + state_dict.update( + { + key: param.reshape(state_dict[key].shape) + for key, param in zip(keys, params_split) + } + ) + agent.policy.actor.latent_pi.load_state_dict(state_dict) + + index = uuid.uuid4().hex + score = np.mean([ + magic_score( + dynamics_func, + agent, + robot, + "./result", + 0, + max_torque, + 0, + index=index, + evaluating=False + ) + for _ in range(2) + ]) + return score if not np.isnan(score) else 0.0 + + +# Set up the EvoTorch problem +problem = Problem( + "max", + simulate, + solution_length=len( + torch.cat( + [ + p.data.view(-1) + for p in REFERENCE_AGENT.policy.actor.latent_pi.parameters() + ] + ) + ), + num_actors=WORKERS, +) + +initial_solution = np.concatenate( + [ + p.data.cpu().numpy().flatten() + for p in REFERENCE_AGENT.policy.actor.latent_pi.parameters() + ] +) + +optimizer = SNES(problem, popsize=WORKERS, center_init=initial_solution, stdev_init=0.0075) +for generation in range(1000): + optimizer.step() + print( + f"Generation {generation}: Best reward so far: {optimizer.status['best'].evals}" + ) + + best_params = optimizer.status["best"].values + score = optimizer.status["best_eval"] + + # Update the policy's parameters + with torch.no_grad(): + state_dict = REFERENCE_AGENT.policy.actor.latent_pi.state_dict() + keys = list(state_dict.keys()) + split_sizes = [torch.numel(state_dict[key]) for key in keys] + params_split = torch.split(best_params.clone().detach(), split_sizes) + state_dict.update( + { + key: param.reshape(state_dict[key].shape) + for key, param in zip(keys, params_split) + } + ) + REFERENCE_AGENT.policy.actor.latent_pi.load_state_dict(state_dict) + + + real_score = magic_score( + dynamics_func, + REFERENCE_AGENT, + robot, + "./result", + 0, + max_torque, + 0, + index=f"TEST-{max_torque}-{generation}", + ) + + + + # Save the model with the best parameters + import os + + if not os.path.exists(f"savings_{max_torque}"): + os.makedirs(f"savings_{max_torque}", exist_ok=True) + REFERENCE_AGENT.save(f"savings_{max_torque}/{generation}/best_model-{score}-{real_score}.zip") + torch.save( + REFERENCE_AGENT.policy.actor.latent_pi.state_dict(), + f"savings_{max_torque}/{generation}/optimized_policy-{score}-{real_score}.pth", + ) diff --git a/examples/reinforcement_learning/evolsac/SNES_finetuning/sim_controller.py b/examples/reinforcement_learning/evolsac/SNES_finetuning/sim_controller.py new file mode 100644 index 00000000..639ce76f --- /dev/null +++ b/examples/reinforcement_learning/evolsac/SNES_finetuning/sim_controller.py @@ -0,0 +1,54 @@ +import os +from 
double_pendulum.model.symbolic_plant import SymbolicDoublePendulum +from double_pendulum.simulation.simulation import Simulator +from double_pendulum.utils.csv_trajectory import save_trajectory + + +def simulate_controller(controller, save_dir, mpar, dt, t_final, t0, x0, goal, integrator, controller_name=""): + if not os.path.exists(save_dir): + os.makedirs(save_dir) + + plant = SymbolicDoublePendulum(model_pars=mpar) + sim = Simulator(plant=plant) + + # T, X, U = sim.simulate_and_animate( + # t0=t0, + # x0=x0, + # tf=t_final, + # dt=dt, + # controller=controller, + # integrator=integrator, + # save_video=True, + # video_name=os.path.join(save_dir, "sim_video.gif"), + # plot_horizontal_line=True, + # horizontal_line_height=0.9 * (mpar.l[0] + mpar.l[1]), + # scale=0.25, + # ) + + T, X, U = sim.simulate( + t0=t0, + x0=x0, + tf=t_final, + dt=dt, + controller=controller, + integrator=integrator, + ) + + save_trajectory(os.path.join(save_dir, "sim_swingup.csv"), T=T, X_meas=X, U_con=U) + + # plot_timeseries( + # T, + # X, + # U, + # X_meas=sim.meas_x_values, + # pos_y_lines=[-np.pi, 0.0, np.pi], + # vel_y_lines=[0.0], + # tau_y_lines=[-mpar.tl[1], 0.0, mpar.tl[1]], + # save_to=os.path.join(save_dir, "timeseries"), + # show=False, + # scale=0.5, + # ) + + if os.path.exists(f"readmes/{controller_name}.md"): + os.system(f"cp readmes/{controller_name}.md {save_dir}/README.md") + diff --git a/examples/reinforcement_learning/evolsac/SNES_finetuning/simulator.py b/examples/reinforcement_learning/evolsac/SNES_finetuning/simulator.py new file mode 100644 index 00000000..193fb1e5 --- /dev/null +++ b/examples/reinforcement_learning/evolsac/SNES_finetuning/simulator.py @@ -0,0 +1,128 @@ +from copy import deepcopy + +import numpy as np + +from double_pendulum.model.model_parameters import model_parameters +from double_pendulum.simulation.simulation import Simulator + + +def get_par_list(x0, min_rel, max_rel, n): + if x0 != 0: + if n % 2 == 0: + n = n + 1 + li = np.linspace(min_rel, max_rel, n) + else: + li = np.linspace(0, max_rel, n) + par_list = li * x0 + return par_list + + +class CustomSimulator(Simulator): + def __init__(self, plant, robustness, max_torque, robot): + self.base_plant = deepcopy(plant) + design = "design_C.1" + model = "model_1.1" + + model_par_path = ( + "../../../../data/system_identification/identified_parameters/" + + design + + "/" + + model + + "/model_parameters.yml" + ) + mpar = model_parameters(filepath=model_par_path) + torque_limit = [max_torque, 0] if robot == "pendubot" else [0, max_torque] + mpar.set_torque_limit(torque_limit) + + self.mpar = mpar + self.robustness = robustness + self.max_torque = max_torque + super().__init__(plant=plant) + + def reset(self): + """ + Reset the Simulator + Resets + - the internal data recorder + - the filter + arguments + - the process noise + - the measurement parameters + - the motor parameters + - perturbations + """ + super().reset() + + self.plant = deepcopy(self.base_plant) + + if np.random.rand() < self.robustness: + N_var = 21 + mpar_vars = [ + "Ir", + "m1r1", + "I1", + "b1", + "cf1", + "m2r2", + "m2", + "I2", + "b2", + "cf2", + ] + + Ir_var_list = np.linspace(0.0, 1e-4, N_var) + m1r1_var_list = get_par_list( + self.mpar.m[0] * self.mpar.r[0], 0.75, 1.25, N_var + ) + I1_var_list = get_par_list(self.mpar.I[0], 0.75, 1.25, N_var) + b1_var_list = np.linspace(-0.1, 0.1, N_var) + cf1_var_list = np.linspace(-0.2, 0.2, N_var) + m2r2_var_list = get_par_list( + self.mpar.m[1] * self.mpar.r[1], 0.75, 1.25, N_var + ) + m2_var_list = 
get_par_list(self.mpar.m[1], 0.75, 1.25, N_var) + I2_var_list = get_par_list(self.mpar.I[1], 0.75, 1.25, N_var) + b2_var_list = np.linspace(-0.1, 0.1, N_var) + cf2_var_list = np.linspace(-0.2, 0.2, N_var) + + modelpar_var_lists = { + "Ir": Ir_var_list, + "m1r1": m1r1_var_list, + "I1": I1_var_list, + "b1": b1_var_list, + "cf1": cf1_var_list, + "m2r2": m2r2_var_list, + "m2": m2_var_list, + "I2": I2_var_list, + "b2": b2_var_list, + "cf2": cf2_var_list, + } + + mp = np.random.choice(mpar_vars) + var = np.random.choice(modelpar_var_lists[mp]) + + if mp == "Ir": + self.plant.Ir = var + elif mp == "m1r1": + m1 = self.mpar.m[0] + r1 = var / m1 + self.plant.m[0] = m1 + self.plant.com[0] = r1 + elif mp == "I1": + self.plant.I[0] = var + elif mp == "b1": + self.plant.b[0] = var + elif mp == "cf1": + self.plant.coulomb_fric[0] = var + elif mp == "m2r2": + m2 = self.mpar.m[1] + r2 = var / m2 + self.plant.m[1] = m2 + self.plant.com[1] = r2 + elif mp == "m2": + self.plant.m[1] = var + elif mp == "I2": + self.plant.I[1] = var + elif mp == "b2": + self.plant.b[1] = var + elif mp == "cf2": + self.plant.coulomb_fric[1] = var diff --git a/examples/reinforcement_learning/evolsac/SNES_finetuning/wrappers.py b/examples/reinforcement_learning/evolsac/SNES_finetuning/wrappers.py new file mode 100644 index 00000000..51535c97 --- /dev/null +++ b/examples/reinforcement_learning/evolsac/SNES_finetuning/wrappers.py @@ -0,0 +1,87 @@ +import gymnasium as gym +from collections import deque +import numpy as np + + +class StateActionHistoryWrapper(gym.Wrapper): + def __init__(self, env, history_length=4): + super().__init__(env) + self.history_length = history_length + self.state_history = np.zeros((history_length, *env.observation_space.shape)) + self.action_history = np.zeros((history_length, *env.action_space.shape)) + + state_space = env.observation_space + action_space = env.action_space + low = np.concatenate( + [state_space.low] * history_length + [action_space.low] * history_length + ) + high = np.concatenate( + [state_space.high] * history_length + [action_space.high] * history_length + ) + self.observation_space = gym.spaces.Box(low=low, high=high, dtype=np.float32) + + def reset(self, **kwargs): + observation = self.env.reset() + self.state_history.fill(0) + self.state_history[-1] = observation[0] + self.action_history.fill(0) + return self._get_observation(), {} + + def step(self, action): + observation, reward, terminated, truncated, info = self.env.step(action) + self.state_history[0:-1] = self.state_history[1:] + self.state_history[-1] = observation + self.action_history[0:-1] = self.action_history[1:] + self.action_history[-1] = action + return self._get_observation(), reward, terminated, truncated, info + + def _get_observation(self): + state_flat = self.state_history.reshape((-1)) + action_flat = self.action_history.reshape((-1)) + return np.concatenate([state_flat, action_flat]) + + +# adds noise to velocity +class PerturbStateWrapper(gym.ObservationWrapper): + def __init__(self, env, perturb_std=0.02): + super().__init__(env) + self.std = perturb_std + + def observation(self, observation): + observation = np.atleast_2d(observation) + observation[:, 2:] = ( + observation[:, 2:] + np.random.randn(*observation[:, 2:].shape) * self.std + ) + return observation + + +# probably useless since it's already perturbed by SAC +class PerturbActionWrapper(gym.ActionWrapper): + def __init__(self, env, perturb_std=0.01): + super().__init__(env) + self.noise = lambda: np.random.randn(*env.observation_space.shape) * 
perturb_std + + def observation(self, observation): + return observation + self.noise() + + +class TimeAwareWrapper(gym.ObservationWrapper): + def __init__(self, env, dt): + super().__init__(env) + self.dt = dt + observation_space = env.observation_space + low = np.append(observation_space.low, 0.0) + high = np.append(observation_space.high, 1.0) + self.observation_space = gym.spaces.Box(low, high, dtype=np.float32) + self.t = 0 + + def observation(self, observation): + return np.append(observation, self.t / 10) + + def step(self, action): + self.t += self.dt + return super().step(action) + + def reset(self, **kwargs): + self.t = 0 + return super().reset(**kwargs) diff --git a/experiments/design_C.1/saclqr.py b/experiments/design_C.1/saclqr.py index bac86bae..0bef0c23 100644 --- a/experiments/design_C.1/saclqr.py +++ b/experiments/design_C.1/saclqr.py @@ -29,14 +29,14 @@ active_act = 0 scaling = False load_path = "../data/controller_parameters/design_C.1/model_1.1/pendubot/lqr" - model_path = "../data/policies/design_C.1/model_1.0/pendubot/SAC/best_model.zip" + model_path = "../data/policies/design_C.1/model_1.0/pendubot/SAC_main_training/best_model.zip" elif robot == "acrobot": torque_limit = [0.5, 5.0] active_act = 1 scaling = True load_path = "../data/controller_parameters/design_C.1/model_1.1/acrobot/lqr" - model_path = "../data/policies/design_C.1/model_1.0/acrobot/SAC/sac_model.zip" + model_path = "../data/policies/design_C.1/model_1.0/acrobot/SAC_main_training/sac_model.zip" ## set model and controller parameters model_par_path = ( diff --git a/leaderboard/acrobot/robustness_v2/con_evolsac.py b/leaderboard/acrobot/robustness_v2/con_evolsac.py new file mode 100644 index 00000000..1002978e --- /dev/null +++ b/leaderboard/acrobot/robustness_v2/con_evolsac.py @@ -0,0 +1,62 @@ +import stable_baselines3 +from double_pendulum.controller.evolsac.evolsac_controller import EvolSACController +from double_pendulum.model.symbolic_plant import SymbolicDoublePendulum +from double_pendulum.simulation.gym_env import double_pendulum_dynamics_func +from double_pendulum.simulation.simulation import Simulator +from sim_parameters import design, integrator, model, mpar, robot +from stable_baselines3 import SAC + +assert stable_baselines3.__version__ == "2.3.2" + +name = "evolsac" +leaderboard_config = { + "csv_path": name + "/sim_swingup.csv", + "name": name, + "simple_name": "evolsac", + "short_description": "Evolutionary SAC for both swingup and stabilisation", + "readme_path": f"readmes/{name}.md", + "username": "AlbertoSinigaglia", +} + +# All of the variables below are now imported from sim_parameters.py +# robot = "acrobot" +# design = "design_C.1" +# model = "model_1.0" +# mpar = model_parameters(filepath=model_par_path) +# mpar.set_torque_limit(torque_limit) + +# integrator = "runge_kutta" + +max_torque = 3.0 +max_velocity = 50.0 +torque_limit = [max_torque, 0.0] if robot == "pendubot" else [0.0, max_torque] +mpar.set_torque_limit(torque_limit) + +print(f"Loading {robot} trained model...") + +model_path = "../../../data/policies/design_C.1/model_1.0/acrobot/evolsac/model.zip" +# Simulation parameters +dt = 0.01 + +# learning environment parameters +state_representation = 2 + +plant = SymbolicDoublePendulum(model_pars=mpar) +print("Loading model parameters...") +simulator = Simulator(plant=plant) + +dynamics_func = double_pendulum_dynamics_func( + simulator=simulator, + dt=dt, + integrator=integrator, + robot=robot, + state_representation=state_representation, + max_velocity=max_velocity, + 
torque_limit=torque_limit, +) + +sac_model = SAC.load(model_path) +controller = EvolSACController( + model=sac_model, dynamics_func=dynamics_func, window_size=0, include_time=False +) +controller.init() diff --git a/leaderboard/acrobot/robustness_v2/readmes/evolsac.md b/leaderboard/acrobot/robustness_v2/readmes/evolsac.md new file mode 100644 index 00000000..06d0c201 --- /dev/null +++ b/leaderboard/acrobot/robustness_v2/readmes/evolsac.md @@ -0,0 +1,11 @@ +# Evolutionary SAC + +## Trajectory Learning, Optimization and Stabilization + +This controller uses a trajectory dictated by the policy learned in the following way: + + 1. SAC training with loose surrogate reward + 2. SAC training with stricter surrogate reward + 3. SNES training with challenge reward + injected noise in the action + +The controller uses the final policy network diff --git a/leaderboard/acrobot/robustness_v2/sim_parameters.py b/leaderboard/acrobot/robustness_v2/sim_parameters.py index aa6e2142..ef0b6407 100644 --- a/leaderboard/acrobot/robustness_v2/sim_parameters.py +++ b/leaderboard/acrobot/robustness_v2/sim_parameters.py @@ -15,7 +15,6 @@ ) mpar = model_parameters(filepath=model_par_path) mpar.set_torque_limit([0.0, 6.0]) -# print(mpar) mpar.set_motor_inertia(0.0) mpar.set_damping([0.0, 0.0]) mpar.set_cfric([0.0, 0.0]) @@ -26,4 +25,4 @@ t0 = 0.0 t_final = 10.0 x0 = [0.0, 0.0, 0.0, 0.0] -goal = [np.pi, 0.0, 0.0, 0.0] +goal = [np.pi, 0.0, 0.0, 0.0] \ No newline at end of file diff --git a/leaderboard/acrobot/simulation_v2/con_evolsac.py b/leaderboard/acrobot/simulation_v2/con_evolsac.py new file mode 100644 index 00000000..efa0afcf --- /dev/null +++ b/leaderboard/acrobot/simulation_v2/con_evolsac.py @@ -0,0 +1,60 @@ +import stable_baselines3 +from double_pendulum.controller.evolsac.evolsac_controller import EvolSACController +from double_pendulum.model.symbolic_plant import SymbolicDoublePendulum +from double_pendulum.simulation.gym_env import double_pendulum_dynamics_func +from double_pendulum.simulation.simulation import Simulator +from sim_parameters import design, integrator, model, mpar, robot +from stable_baselines3 import SAC + +assert stable_baselines3.__version__ == "2.3.2" + +name = "evolsac" +leaderboard_config = { + "csv_path": name + "/sim_swingup.csv", + "name": name, + "simple_name": "evolsac", + "short_description": "Evolutionary SAC for both swingup and stabilisation", + "readme_path": f"readmes/{name}.md", + "username": "AlbertoSinigaglia", +} + +# All of the variables below are now imported from sim_parameters.py +# robot = "acrobot" +# design = "design_C.1" +# model = "model_1.0" + +# integrator = "runge_kutta" + +max_torque = 3.0 +max_velocity = 50.0 +torque_limit = [max_torque, 0.0] if robot == "pendubot" else [0.0, max_torque] +mpar.set_torque_limit(torque_limit) + +print(f"Loading {robot} trained model...") + +model_path = "../../../data/policies/design_C.1/model_1.0/acrobot/evolsac/model.zip" +# Simulation parameters +dt = 0.01 + +# learning environment parameters +state_representation = 2 + +plant = SymbolicDoublePendulum(model_pars=mpar) +print("Loading model parameters...") +simulator = Simulator(plant=plant) + +dynamics_func = double_pendulum_dynamics_func( + simulator=simulator, + dt=dt, + integrator=integrator, + robot=robot, + state_representation=state_representation, + max_velocity=max_velocity, + torque_limit=torque_limit, +) + +sac_model = SAC.load(model_path) +controller = EvolSACController( + model=sac_model, dynamics_func=dynamics_func, window_size=0, include_time=False +) 
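+# Note (added comment): at deployment the controller uses no observation history
+# (window_size=0) and no time feature, so the policy sees only the current scaled 4-D state.
+# Minimal local rollout sketch (commented out; the 10 s horizon is an assumption, and
+# controller.init() below must run first):
+#   T, X, U = simulator.simulate(t0=0.0, x0=[0.0, 0.0, 0.0, 0.0], tf=10.0, dt=dt,
+#                                controller=controller, integrator=integrator)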
+controller.init() diff --git a/leaderboard/acrobot/simulation_v2/readmes/evolsac.md b/leaderboard/acrobot/simulation_v2/readmes/evolsac.md new file mode 100644 index 00000000..06d0c201 --- /dev/null +++ b/leaderboard/acrobot/simulation_v2/readmes/evolsac.md @@ -0,0 +1,11 @@ +# Evolutionary SAC + +## Trajectory Learning, Optimization and Stabilization + +This controller uses a trajectory dictated by the policy learned in the following way: + + 1. SAC training with loose surrogate reward + 2. SAC training with stricter surrogate reward + 3. SNES training with challenge reward + injected noise in the action + +The controller uses the final policy network diff --git a/leaderboard/acrobot/simulation_v2/sim_parameters.py b/leaderboard/acrobot/simulation_v2/sim_parameters.py index cb3bfdc1..ef0b6407 100644 --- a/leaderboard/acrobot/simulation_v2/sim_parameters.py +++ b/leaderboard/acrobot/simulation_v2/sim_parameters.py @@ -25,4 +25,4 @@ t0 = 0.0 t_final = 10.0 x0 = [0.0, 0.0, 0.0, 0.0] -goal = [np.pi, 0.0, 0.0, 0.0] +goal = [np.pi, 0.0, 0.0, 0.0] \ No newline at end of file diff --git a/leaderboard/pendubot/robustness_v2/con_evolsac.py b/leaderboard/pendubot/robustness_v2/con_evolsac.py new file mode 100644 index 00000000..3b18050a --- /dev/null +++ b/leaderboard/pendubot/robustness_v2/con_evolsac.py @@ -0,0 +1,63 @@ +import stable_baselines3 +from double_pendulum.controller.evolsac.evolsac_controller import EvolSACController +from double_pendulum.model.symbolic_plant import SymbolicDoublePendulum +from double_pendulum.simulation.gym_env import double_pendulum_dynamics_func +from double_pendulum.simulation.simulation import Simulator +from sim_parameters import design, integrator, model, mpar, robot +from stable_baselines3 import SAC + +assert stable_baselines3.__version__ == "2.3.2" + +name = "evolsac" +leaderboard_config = { + "csv_path": name + "/sim_swingup.csv", + "name": name, + "simple_name": "evolsac", + "short_description": "Evolutionary SAC for both swingup and stabilisation", + "readme_path": f"readmes/{name}.md", + "username": "AlbertoSinigaglia", +} + +# All of the variables below are now imported from sim_parameters.py +# robot = "pendubot" +# design = "design_C.1" +# model = "model_1.0" +# mpar = model_parameters(filepath=model_par_path) +# mpar.set_torque_limit(torque_limit) + +# integrator = "runge_kutta" + +max_torque = 3.0 +max_velocity = 50.0 +torque_limit = [max_torque, 0.0] if robot == "pendubot" else [0.0, max_torque] +mpar.set_torque_limit(torque_limit) + +print(f"Loading {robot} trained model...") + +model_path = "../../../data/policies/design_C.1/model_1.0/pendubot/evolsac/model.zip" + +# Simulation parameters +dt = 0.01 + +# learning environment parameters +state_representation = 2 + +plant = SymbolicDoublePendulum(model_pars=mpar) +print("Loading model parameters...") +simulator = Simulator(plant=plant) + +dynamics_func = double_pendulum_dynamics_func( + simulator=simulator, + dt=dt, + integrator=integrator, + robot=robot, + state_representation=state_representation, + max_velocity=max_velocity, + torque_limit=torque_limit, +) + +sac_model = SAC.load(model_path) +controller = EvolSACController( + model=sac_model, dynamics_func=dynamics_func, window_size=0, include_time=False +) +controller.init() diff --git a/leaderboard/pendubot/robustness_v2/readmes/evolsac.md b/leaderboard/pendubot/robustness_v2/readmes/evolsac.md new file mode 100644 index 00000000..06d0c201 --- /dev/null +++ b/leaderboard/pendubot/robustness_v2/readmes/evolsac.md @@ -0,0 +1,11 @@ +# Evolutionary 
SAC + +## Trajectory Learning, Optimization and Stabilization + +This controller uses a trajectory dictated by the policy learned in the following way: + + 1. SAC training with loose surrogate reward + 2. SAC training with stricter surrogate reward + 3. SNES training with challenge reward + injected noise in the action + +The controller uses the final policy network diff --git a/leaderboard/pendubot/robustness_v2/sim_parameters.py b/leaderboard/pendubot/robustness_v2/sim_parameters.py index 5743e83d..d834c78a 100644 --- a/leaderboard/pendubot/robustness_v2/sim_parameters.py +++ b/leaderboard/pendubot/robustness_v2/sim_parameters.py @@ -15,7 +15,6 @@ ) mpar = model_parameters(filepath=model_par_path) mpar.set_torque_limit([6.0, 0.0]) -# print(mpar) mpar.set_motor_inertia(0.0) mpar.set_damping([0.0, 0.0]) mpar.set_cfric([0.0, 0.0]) @@ -26,4 +25,4 @@ t0 = 0.0 t_final = 10.0 x0 = [0.0, 0.0, 0.0, 0.0] -goal = [np.pi, 0.0, 0.0, 0.0] +goal = [np.pi, 0.0, 0.0, 0.0] \ No newline at end of file diff --git a/leaderboard/pendubot/simulation_v2/con_evolsac.py b/leaderboard/pendubot/simulation_v2/con_evolsac.py new file mode 100644 index 00000000..47840d68 --- /dev/null +++ b/leaderboard/pendubot/simulation_v2/con_evolsac.py @@ -0,0 +1,62 @@ +import stable_baselines3 +from double_pendulum.controller.evolsac.evolsac_controller import EvolSACController +from double_pendulum.model.symbolic_plant import SymbolicDoublePendulum +from double_pendulum.simulation.gym_env import double_pendulum_dynamics_func +from double_pendulum.simulation.simulation import Simulator +from sim_parameters import design, integrator, model, mpar, robot +from stable_baselines3 import SAC + +assert stable_baselines3.__version__ == "2.3.2" + +name = "evolsac" +leaderboard_config = { + "csv_path": name + "/sim_swingup.csv", + "name": name, + "simple_name": "evolsac", + "short_description": "Evolutionary SAC for both swingup and stabilisation", + "readme_path": f"readmes/{name}.md", + "username": "AlbertoSinigaglia", +} + +# All of the variables below are now imported from sim_parameters.py +# robot = "pendubot" +# design = "design_C.1" +# model = "model_1.0" +# mpar = model_parameters(filepath=model_par_path) +# mpar.set_torque_limit(torque_limit) + +# integrator = "runge_kutta" + +max_torque = 3.0 +max_velocity = 50.0 +torque_limit = [max_torque, 0.0] if robot == "pendubot" else [0.0, max_torque] +mpar.set_torque_limit(torque_limit) + +print(f"Loading {robot} trained model...") + +model_path = "../../../data/policies/design_C.1/model_1.0/pendubot/evolsac/model.zip" +# Simulation parameters +dt = 0.01 + +# learning environment parameters +state_representation = 2 + +plant = SymbolicDoublePendulum(model_pars=mpar) +print("Loading model parameters...") +simulator = Simulator(plant=plant) + +dynamics_func = double_pendulum_dynamics_func( + simulator=simulator, + dt=dt, + integrator=integrator, + robot=robot, + state_representation=state_representation, + max_velocity=max_velocity, + torque_limit=torque_limit, +) + +sac_model = SAC.load(model_path) +controller = EvolSACController( + model=sac_model, dynamics_func=dynamics_func, window_size=0, include_time=False +) +controller.init() diff --git a/leaderboard/pendubot/simulation_v2/readmes/evolsac.md b/leaderboard/pendubot/simulation_v2/readmes/evolsac.md new file mode 100644 index 00000000..06d0c201 --- /dev/null +++ b/leaderboard/pendubot/simulation_v2/readmes/evolsac.md @@ -0,0 +1,11 @@ +# Evolutionary SAC + +## Trajectory Learning, Optimization and Stabilization + +This controller uses a 
trajectory dictated by the policy learned in the following way: + + 1. SAC training with loose surrogate reward + 2. SAC training with stricter surrogate reward + 3. SNES training with challenge reward + injected noise in the action + +The controller uses the final policy network diff --git a/leaderboard/pendubot/simulation_v2/sim_parameters.py b/leaderboard/pendubot/simulation_v2/sim_parameters.py index de357f10..d834c78a 100644 --- a/leaderboard/pendubot/simulation_v2/sim_parameters.py +++ b/leaderboard/pendubot/simulation_v2/sim_parameters.py @@ -25,4 +25,4 @@ t0 = 0.0 t_final = 10.0 x0 = [0.0, 0.0, 0.0, 0.0] -goal = [np.pi, 0.0, 0.0, 0.0] +goal = [np.pi, 0.0, 0.0, 0.0] \ No newline at end of file diff --git a/src/python/double_pendulum/controller/evolsac/__init__.py b/src/python/double_pendulum/controller/evolsac/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/python/double_pendulum/controller/evolsac/evolsac_controller.py b/src/python/double_pendulum/controller/evolsac/evolsac_controller.py new file mode 100644 index 00000000..93c7d0ab --- /dev/null +++ b/src/python/double_pendulum/controller/evolsac/evolsac_controller.py @@ -0,0 +1,66 @@ +import numpy as np +import torch +from double_pendulum.controller.abstract_controller import AbstractController + + +class EvolSACController(AbstractController): + def __init__( + self, + model, + dynamics_func, + scaling=True, + include_time=True, + window_size=0, + SNES_phase=False, + evaluating=True, + ): + super().__init__() + self.model = model + self.dynamics_func = dynamics_func + self.scaling = scaling + self.window_size = window_size + self.include_time = include_time + self.old_state = [ + [0] * (5 if include_time else 4) for _ in range(self.window_size) + ] + self.old_action = [[0.0] for _ in range(self.window_size)] + self.timestep = 0 + self.SNES_phase = SNES_phase + self.evaluating = evaluating + + def get_state(self, obs, time): + if self.window_size > 0: + return np.concatenate( + [np.reshape(self.old_state, (-1)), np.reshape(self.old_action, (-1))] + ) + else: + if self.include_time: + return list(obs) + [time / 10] + else: + return obs + + def update_old_state(self, obs, t): + if self.include_time: + self.old_state = self.old_state[1:] + [list(obs) + [t / 10]] + else: + self.old_state = self.old_state[1:] + [list(obs)] + + def update_old_action(self, action): + self.old_action = self.old_action[1:] + [action[0]] + + def get_control_output_(self, x, t=None): + self.timestep += 1 + obs = self.dynamics_func.normalize_state(x) + self.update_old_state(obs, t) + action = self.model.predict(self.get_state(obs, t), deterministic=True) + + # add gaussian noise to action + if not self.evaluating and self.SNES_phase: + a = action[0][0] + a = torch.atanh(torch.tensor(a)) + a += np.random.randn() * 0.1 + a = torch.tanh(a).numpy() + action[0][0] = np.clip(a, a_min=-1, a_max=+1) + + self.update_old_action(action) + return self.dynamics_func.unscale_action(action) diff --git a/src/python/setup.py b/src/python/setup.py index ea4f3177..40a9f200 100644 --- a/src/python/setup.py +++ b/src/python/setup.py @@ -1,4 +1,4 @@ -from setuptools import setup, find_packages +from setuptools import find_packages, setup setup( name="DoublePendulum", @@ -58,6 +58,7 @@ "flax", "tqdm", "cloudpickle==3.0.0", + "evotorch", ], }, classifiers=[