Commit
Merge pull request #32 from kaseris/dev
Dev
kaseris committed Nov 30, 2023
2 parents e5d24f6 + a4a2410 commit 707732e
Showing 13 changed files with 305 additions and 28 deletions.
42 changes: 42 additions & 0 deletions configs/lstm_regressor_1024x1024.yaml
@@ -0,0 +1,42 @@
dataset:
name: 'NTURGBDDataset'
args:
missing_files_dir: 'data/missing'
label_file: 'data/labels.txt'
max_context_window: 10
max_number_of_bodies: 1
transforms:
name: 'MinMaxScaleTransform'
args:
feature_scale: [0.0, 1.0]
max_duration: 300
n_joints: 25

# Set the train data percentage
train_data_percentage: 0.8

model:
name: 'SimpleLSTMRegressor'
args:
hidden_size: 1024
num_layers: 2
linear_out: 1024
reduction: 'mean'
batch_first: true
n_joints: 25
n_dims: 3

runner:
args:
val_batch_size: 32
train_batch_size: 32
block_size: 8
device: 'cuda'
logger:
name: 'TensorboardLogger'
args:
save_dir: 'runs'
checkpoint_dir: '/home/kaseris/Documents/checkpoints_forecasting'
n_epochs: 10
lr: 0.00001
log_gradient_info: true
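For orientation, the `Environment` class added below in `src/skelcast/core/environment.py` loads this file with `yaml.safe_load`; a minimal sketch (not part of this commit) of reading the same keys:

import yaml

# Load the experiment configuration; the keys mirror the YAML above.
with open('configs/lstm_regressor_1024x1024.yaml', 'r') as f:
    config = yaml.safe_load(f)

print(config['model']['name'])                       # 'SimpleLSTMRegressor'
print(config['runner']['args']['train_batch_size'])  # 32
print(config['train_data_percentage'])               # 0.8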
Empty file added src/skelcast/core/__init__.py
Empty file.
144 changes: 144 additions & 0 deletions src/skelcast/core/environment.py
@@ -0,0 +1,144 @@
import os
import logging

import randomname
import yaml

import torch
import torch.optim as optim
from torch.utils.data import random_split

from skelcast.models import MODELS
from skelcast.data import DATASETS
from skelcast.data import TRANSFORMS
from skelcast.logger import LOGGERS

from skelcast.experiments.runner import Runner

torch.manual_seed(133742069)

class Environment:
"""
The Environment class is designed to set up and manage the environment for training machine learning models.
It includes methods for building models, datasets, loggers, and runners based on specified configurations.
Attributes:
_experiment_name (str): A randomly generated name for the experiment.
checkpoint_dir (str): Directory path for storing model checkpoints.
data_dir (str): Directory path where the dataset is located.
config (dict, optional): Configuration settings for the model, dataset, logger, and runner.
_model (object, optional): The instantiated machine learning model.
_dataset (object, optional): The complete dataset.
_train_dataset (object, optional): The training subset of the dataset.
_val_dataset (object, optional): The validation subset of the dataset.
_runner (object, optional): The training runner.
_logger (object, optional): The logger for recording experiment results.
Methods:
experiment_name: Property that returns the experiment name.
build_from_file(config_path): Parses the configuration file and builds the dataset, model, logger, and runner.
run(): Starts the training process, either from scratch or by resuming from the latest checkpoint.
Usage:
1. Initialize the Environment with data and checkpoint directories.
2. Call `build_from_file` with the path to a configuration file.
3. Use `run` to start the training process.
Note:
This class is highly dependent on external modules and configurations. Ensure that all required modules
and configurations are properly set up before using this class.
"""
def __init__(self, data_dir: str = '/home/kaseris/Documents/data_ntu_rbgd',
checkpoint_dir = '/home/kaseris/Documents/checkpoints_forecasting') -> None:
self._experiment_name = randomname.get_name()
self.checkpoint_dir = checkpoint_dir
self.data_dir = data_dir
self.config = None
self._model = None
self._dataset = None
self._train_dataset = None
self._val_dataset = None
self._runner = None
self._logger = None

@property
def experiment_name(self) -> str:
return self._experiment_name

def build_from_file(self, config_path: str) -> None:
config = self._parse_file(config_path)
self.config = config
logging.log(logging.INFO, f'Building environment from {config_path}.')
self._build_dataset()
self._build_model()
self._build_logger()
self._build_runner()

def _build_model(self) -> None:
logging.log(logging.INFO, 'Building model.')
model_config = self.config['model']
_name = model_config.get('name')
_args = model_config.get('args')
self._model = MODELS.get_module(_name)(**_args)
logging.log(logging.INFO, f'Model creation complete.')

def _build_dataset(self) -> None:
logging.log(logging.INFO, 'Building dataset.')
dataset_config = self.config['dataset']
_name = dataset_config.get('name')
_args = dataset_config.get('args')
_transforms_cfg = dataset_config.get('args').get('transforms')
_transforms = TRANSFORMS.get_module(_transforms_cfg.get('name'))(**_transforms_cfg.get('args'))
_args['transforms'] = _transforms
self._dataset = DATASETS.get_module(_name)(self.data_dir, **_args)
# Split the dataset
_train_len = int(self.config['train_data_percentage'] * len(self._dataset))
self._train_dataset, self._val_dataset = random_split(self._dataset, [_train_len, len(self._dataset) - _train_len])
logging.log(logging.INFO, f'Train set size: {len(self._train_dataset)}')

def _build_logger(self) -> None:
logging.log(logging.INFO, 'Building logger.')
logger_config = self.config['runner']['args'].get('logger')
logdir = os.path.join(logger_config['args']['save_dir'], self.experiment_name)
self._logger = LOGGERS.get_module(logger_config['name'])(logdir)
logging.log(logging.INFO, f'Logging to {logdir}.')

def _build_runner(self) -> None:
logging.log(logging.INFO, 'Building runner.')
runner_config = self.config['runner']
_args = runner_config.get('args')
_args['logger'] = self._logger
_args['optimizer'] = optim.AdamW(self._model.parameters(), lr=_args.get('lr'))
_args['train_set'] = self._train_dataset
_args['val_set'] = self._val_dataset
_args['model'] = self._model
_args['checkpoint_dir'] = os.path.join(self.checkpoint_dir, self._experiment_name)
self._runner = Runner(**_args)
self._runner.setup()
logging.log(logging.INFO, 'Runner setup complete.')

    def _create_checkpoint_dir(self) -> None:
        experiment_checkpoint_dir = os.path.join(self.checkpoint_dir, self._experiment_name)
        if os.path.exists(experiment_checkpoint_dir):
            raise ValueError(f'Checkpoint directory {experiment_checkpoint_dir} already exists.')
        else:
            logging.log(logging.INFO, f'Creating checkpoint directory: {experiment_checkpoint_dir}.')
            os.mkdir(experiment_checkpoint_dir)

    def _parse_file(self, fname: str) -> dict:
with open(fname, 'r') as f:
config = yaml.safe_load(f)
return config

def run(self) -> None:
        # If this experiment has no checkpoint directory yet, create one and train from
        # scratch via `self._runner.fit()`; otherwise resume from the latest checkpoint
        # via `self._runner.resume(...)`.
if not os.path.exists(os.path.join(self.checkpoint_dir, self._experiment_name)):
self._create_checkpoint_dir()
return self._runner.fit()
else:
return self._runner.resume(os.path.join(self.checkpoint_dir, self._experiment_name))
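Per the docstring, the intended workflow is: construct an `Environment`, point it at a config file, then call `run()`. A minimal sketch (not part of this commit), with placeholder data and checkpoint paths:

from skelcast.core.environment import Environment

# Paths are placeholders; the defaults in __init__ point at the author's machine.
env = Environment(data_dir='/path/to/nturgbd_data',
                  checkpoint_dir='/path/to/checkpoints')
env.build_from_file('configs/lstm_regressor_1024x1024.yaml')
env.run()  # fits from scratch, or resumes if the experiment's checkpoint dir already exists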
41 changes: 41 additions & 0 deletions src/skelcast/core/registry.py
@@ -0,0 +1,41 @@
class Registry:
def __init__(self):
self._module_dict = dict()

def register_module(self, cls=None, module_name=None):
"""
A decorator to register a module.
Args:
- cls (class, optional): The class to be registered.
- module_name (str, optional): The name under which the class will be registered.
Defaults to the class name if not provided.
"""

def _register(cls):
nonlocal module_name
if module_name is None:
module_name = cls.__name__
if module_name in self._module_dict:
raise KeyError(f"{module_name} is already registered in {self.__class__.__name__}")
self._module_dict[module_name] = cls
return cls

if cls is not None:
return _register(cls)
else:
return _register

def get_module(self, module_name):
"""
Retrieves a class by its registered name.
Args:
- module_name (str): The name of the module to retrieve.
"""
if module_name not in self._module_dict:
raise KeyError(f"{module_name} is not registered in {self.__class__.__name__}")
return self._module_dict[module_name]

def __str__(self):
return str(self._module_dict)
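A small sketch of how the registry is meant to be used (the `MODELS` registry and classes here are illustrative, not taken from this commit):

from skelcast.core.registry import Registry

MODELS = Registry()

@MODELS.register_module()          # registers under the class name by default
class MyModel:
    pass

@MODELS.register_module(module_name='alias')   # or under an explicit name
class OtherModel:
    pass

assert MODELS.get_module('MyModel') is MyModel
assert MODELS.get_module('alias') is OtherModel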
8 changes: 8 additions & 0 deletions src/skelcast/data/__init__.py
@@ -0,0 +1,8 @@
from skelcast.core.registry import Registry

DATASETS = Registry()
COLLATE_FUNCS = Registry()
TRANSFORMS = Registry()

from .dataset import NTURGBDCollateFn, NTURGBDDataset
from .transforms import MinMaxScaleTransform
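Note the ordering here: the registries are created before the submodule imports at the bottom, so `dataset.py` and `transforms.py` can decorate their classes at import time. Importing the package is therefore enough to populate the registries; a rough sketch (not part of this commit):

from skelcast.data import DATASETS, COLLATE_FUNCS, TRANSFORMS

# The imports at the bottom of __init__.py ran the registration decorators,
# so name-based lookups should now resolve to the classes.
DatasetCls = DATASETS.get_module('NTURGBDDataset')
CollateCls = COLLATE_FUNCS.get_module('NTURGBDCollateFn')
TransformCls = TRANSFORMS.get_module('MinMaxScaleTransform')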
3 changes: 3 additions & 0 deletions src/skelcast/data/dataset.py
@@ -15,6 +15,7 @@
should_blacklist,
)

from skelcast.data import DATASETS, COLLATE_FUNCS

def read_skeleton_file(
file_path, save_skelxyz=True, save_rgbxy=True, save_depthxy=True
@@ -102,6 +103,7 @@ def nturbgd_collate_fn_with_overlapping_context_window(batch: List[NTURGBDSample
return NTURGBDSample(x=batch_x, y=batch_y, label=batch_label)


@COLLATE_FUNCS.register_module()
class NTURGBDCollateFn:
"""
Custom collate function for batched variable-length sequences.
@@ -159,6 +161,7 @@ def get_windows(self, x):
return input_windows_tensor, target_labels_tensor


@DATASETS.register_module()
class NTURGBDDataset(Dataset):
def __init__(
self,
14 changes: 13 additions & 1 deletion src/skelcast/data/transforms.py
@@ -1,10 +1,14 @@
import torch
from typing import Tuple

from skelcast.data import TRANSFORMS


@TRANSFORMS.register_module()
class MinMaxScaleTransform:

def __init__(self, feature_scale: Tuple[float, float]) -> None:
assert isinstance(feature_scale, tuple), '`feature_scale` must be a tuple.'
        assert isinstance(feature_scale, (tuple, list)), '`feature_scale` must be a tuple or a list.'
self.min_, self.max_ = feature_scale

def __call__(self, x: torch.Tensor) -> torch.Tensor:
@@ -25,3 +29,11 @@ def __call__(self, x: torch.Tensor) -> torch.Tensor:
x[..., axis] = (x[..., axis] - min_vals[axis]) * scale[axis] + self.min_

return x

@property
def min(self) -> float:
return self.min_

@property
def max(self) -> float:
return self.max_
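A brief sketch of applying the transform (not part of this commit); the `(frames, joints, 3)` tensor shape is an assumption for illustration:

import torch

from skelcast.data.transforms import MinMaxScaleTransform

transform = MinMaxScaleTransform(feature_scale=(0.0, 1.0))
x = torch.randn(300, 25, 3)   # e.g. a skeleton sequence: frames x joints x (x, y, z)
x_scaled = transform(x)       # each coordinate axis is independently scaled into [0, 1]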
59 changes: 33 additions & 26 deletions src/skelcast/experiments/runner.py
@@ -19,40 +19,45 @@ class Runner:
It uses datasets for training and validation, and includes functionality for batch processing,
gradient logging, and checkpoint management.
Attributes:
train_set (Dataset): The dataset for training.
val_set (Dataset): The dataset for validation.
train_batch_size (int): Batch size for the training dataset.
val_batch_size (int): Batch size for the validation dataset.
block_size (int): Block size used for collating batch data.
model (SkelcastModule): The model to be trained and validated.
optimizer (torch.optim.Optimizer): Optimizer for model training.
n_epochs (int): Number of epochs to train the model.
device (str): The device ('cpu' or 'cuda') on which to run the model.
checkpoint_dir (str): Directory to save checkpoints.
checkpoint_frequency (int): Frequency (in epochs) at which to save checkpoints.
logger (BaseLogger): Logger for recording training and validation metrics.
log_gradient_info (bool): Flag to determine if gradient information is logged.
Args:
---
- `train_set` (Dataset): The dataset for training.
- `val_set` (Dataset): The dataset for validation.
- `train_batch_size` (int): Batch size for the training dataset.
- `val_batch_size` (int): Batch size for the validation dataset.
- `block_size` (int): Block size used for collating batch data.
- `model` (SkelcastModule): The model to be trained and validated.
- `optimizer` (torch.optim.Optimizer): Optimizer for model training.
- `n_epochs` (int): Number of epochs to train the model.
- `device` (str): The device ('cpu' or 'cuda') on which to run the model.
- `checkpoint_dir` (str): Directory to save checkpoints.
- `checkpoint_frequency` (int): Frequency (in epochs) at which to save checkpoints.
- `logger` (BaseLogger): Logger for recording training and validation metrics.
- `log_gradient_info` (bool): Flag to determine if gradient information is logged.
Methods:
setup(): Prepares the runner for training and validation.
fit(): Starts the training process from epoch 0.
resume(checkpoint_path): Resumes training from a saved checkpoint.
training_step(train_batch): Executes a single training step.
validation_step(val_batch): Executes a single validation step.
_run_epochs(start_epoch): Runs training and validation for specified epochs.
_run_phase(phase, epoch): Runs a training or validation phase for a single epoch.
_log_epoch_loss(phase, epoch): Logs the loss for a completed epoch.
_restore_state(checkpoint): Restores the state of the model and optimizer from a checkpoint.
_compile_results(): Compiles and returns training and validation results.
---
- `setup()`: Prepares the runner for training and validation.
- `fit()`: Starts the training process from epoch 0.
- `resume(checkpoint_path)`: Resumes training from a saved checkpoint.
- `training_step(train_batch)`: Executes a single training step.
- `validation_step(val_batch)`: Executes a single validation step.
- `_run_epochs(start_epoch)`: Runs training and validation for specified epochs.
- `_run_phase(phase, epoch)`: Runs a training or validation phase for a single epoch.
- `_log_epoch_loss(phase, epoch)`: Logs the loss for a completed epoch.
- `_restore_state(checkpoint)`: Restores the state of the model and optimizer from a checkpoint.
- `_compile_results()`: Compiles and returns training and validation results.
Note:
---
- This class requires a properly formatted SkelcastModule model and corresponding datasets.
- The checkpoint directory must exist before initializing the Runner.
- Logging and checkpointing are optional and can be configured as needed.
Raises:
AssertionError: If the checkpoint directory does not exist.
---
`AssertionError`: If the checkpoint directory does not exist.
"""
def __init__(self,
train_set: Dataset,
@@ -62,6 +67,7 @@ def __init__(self,
block_size: int,
model: SkelcastModule,
optimizer: torch.optim.Optimizer = None,
lr: float = 1e-4,
n_epochs: int = 10,
device: str = 'cpu',
checkpoint_dir: str = None,
@@ -77,11 +83,12 @@
self.train_loader = DataLoader(dataset=self.train_set, batch_size=self.train_batch_size, shuffle=True, collate_fn=self._collate_fn)
self.val_loader = DataLoader(dataset=self.val_set, batch_size=self.val_batch_size, shuffle=False, collate_fn=self._collate_fn)
self.model = model
self.lr = lr

if optimizer is not None:
self.optimizer = optimizer
else:
self.optimizer = optim.AdamW(self.model.parameters(), lr=1e-5)
self.optimizer = optim.AdamW(self.model.parameters(), lr=lr)

self.training_loss_history = []
self.training_loss_per_step = []
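The new `lr` argument lets the configured learning rate reach the default optimizer: previously the fallback `AdamW` was hard-coded to `1e-5`, while now it honors `lr` whenever no optimizer is passed in. A sketch of the resulting behaviour (not a verbatim excerpt from the class):

import torch.optim as optim

def resolve_optimizer(model, optimizer=None, lr: float = 1e-4):
    # Mirrors the Runner fallback: use the supplied optimizer if any,
    # otherwise build AdamW with the configured learning rate.
    if optimizer is not None:
        return optimizer
    return optim.AdamW(model.parameters(), lr=lr)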
5 changes: 5 additions & 0 deletions src/skelcast/logger/__init__.py
@@ -0,0 +1,5 @@
from skelcast.core.registry import Registry

LOGGERS = Registry()

from .tensorboard_logger import TensorboardLogger