Training on Custom Dataset triggers error: CUDA error: device-side assert triggered #97

Open

elcronos opened this issue Apr 30, 2021

I'm using your Python package to adversarially train a ResNet50 network on the CelebA dataset. My code looks something like this:

import torch
torch.manual_seed(42)
import torchvision
from torchvision import transforms
from torch import nn
from robustness.datasets import CelebA
from robustness.model_utils import make_and_restore_model
from cox.utils import Parameters
from cox import store
from robustness import model_utils, datasets, train, defaults

transform = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ToTensor(),
    transforms.RandomErasing()
])

celeba_train = torchvision.datasets.CelebA('./celeba', split='train', target_type='identity', transform=transform)
celeba_val = torchvision.datasets.CelebA('./celeba', split='valid', target_type='identity', transform=transform)

train_loader = torch.utils.data.DataLoader(celeba_train, batch_size=16, shuffle=True, num_workers=4)
val_loader = torch.utils.data.DataLoader(celeba_val, batch_size=16, shuffle=True, num_workers=4)

ds = CelebA('./celeba')
m, _ = make_and_restore_model(arch='resnet50', pytorch_pretrained=False, dataset=ds)
# Create a cox store for logging
OUT_DIR = './outputs'
out_store = store.Store(OUT_DIR)

train_kwargs = {
    'out_dir': "train_out",
    'adv_train': 1,
    'constraint': '2',
    'eps': 0.01,
    'attack_lr': 0.005,
    'attack_steps': 40,
    'epochs': 100
}

train_args = Parameters(train_kwargs)

# Fill whatever parameters are missing from the defaults
train_args = defaults.check_and_fill_args(train_args,
                        defaults.TRAINING_ARGS, CelebA)
train_args = defaults.check_and_fill_args(train_args,
                        defaults.PGD_ARGS, CelebA)

# Train a model
train.train_model(train_args, m, (train_loader, val_loader), store=out_store)

My definition of the CelebA dataset class in robustness/datasets.py is:

class CelebA(DataSet):
    def __init__(self, data_path,**kwargs):
        self.num_classes = 10177

        ds_kwargs = {
            'num_classes': self.num_classes,
            'mean': torch.tensor([0.5061, 0.4254, 0.3828]),
            'std': torch.tensor([0.2658, 0.2452, 0.2413]),
            'custom_class': datasets.CelebA,
            'transform_train': da.TRAIN_TRANSFORMS_IMAGENET,
            'transform_test': da.TEST_TRANSFORMS_IMAGENET
        }
        super(CelebA, self).__init__('celeba', data_path, **ds_kwargs)

    def get_model(self, arch, pretrained=False):
        return imagenet_models.__dict__[arch](num_classes=self.num_classes,
                                              pretrained=pretrained)

Once I run my code, it starts training as expected, but after a while I get this error:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-4-cade5f159ff1> in <module>
     18 
     19 # Train a model
---> 20 train.train_model(train_args, m, (train_loader, val_loader), store=out_store)

~/anaconda3/envs/pytorch-flash/lib/python3.9/site-packages/robustness/train.py in train_model(args, model, loaders, checkpoint, dp_device_ids, store, update_params, disable_no_grad)
    309     for epoch in range(start_epoch, args.epochs):
    310         # train for one epoch
--> 311         train_prec1, train_loss = _model_loop(args, 'train', train_loader, 
    312                 model, opt, epoch, args.adv_train, writer)
    313         last_epoch = (epoch == (args.epochs - 1))

~/anaconda3/envs/pytorch-flash/lib/python3.9/site-packages/robustness/train.py in _model_loop(args, loop_type, loader, model, opt, epoch, adv, writer)
    445        # measure data loading time
    446         target = target.cuda(non_blocking=True)
--> 447         output, final_inp = model(inp, target=target, make_adv=adv,
    448                                   **attack_kwargs)
    449         loss = train_criterion(output, target)

~/anaconda3/envs/pytorch-flash/lib/python3.9/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    887             result = self._slow_forward(*input, **kwargs)
    888         else:
--> 889             result = self.forward(*input, **kwargs)
    890         for hook in itertools.chain(
    891                 _global_forward_hooks.values(),

~/anaconda3/envs/pytorch-flash/lib/python3.9/site-packages/torch/nn/parallel/data_parallel.py in forward(self, *inputs, **kwargs)
    163 
    164         if len(self.device_ids) == 1:
--> 165             return self.module(*inputs[0], **kwargs[0])
    166         replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
    167         outputs = self.parallel_apply(replicas, inputs, kwargs)

~/anaconda3/envs/pytorch-flash/lib/python3.9/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    887             result = self._slow_forward(*input, **kwargs)
    888         else:
--> 889             result = self.forward(*input, **kwargs)
    890         for hook in itertools.chain(
    891                 _global_forward_hooks.values(),

~/anaconda3/envs/pytorch-flash/lib/python3.9/site-packages/robustness/attacker.py in forward(self, inp, target, make_adv, with_latent, fake_relu, no_relu, with_image, **attacker_kwargs)
    311             prev_training = bool(self.training)
    312             self.eval()
--> 313             adv = self.attacker(inp, target, **attacker_kwargs)
    314             if prev_training:
    315                 self.train()

~/anaconda3/envs/pytorch-flash/lib/python3.9/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    887             result = self._slow_forward(*input, **kwargs)
    888         else:
--> 889             result = self.forward(*input, **kwargs)
    890         for hook in itertools.chain(
    891                 _global_forward_hooks.values(),

~/anaconda3/envs/pytorch-flash/lib/python3.9/site-packages/robustness/attacker.py in forward(self, x, target, constraint, eps, step_size, iterations, random_start, random_restarts, do_tqdm, targeted, custom_loss, should_normalize, orig_input, use_best, return_image, est_grad, mixed_precision, *_)
    247             adv_ret = to_ret
    248         else:
--> 249             adv_ret = get_adv_examples(x)
    250 
    251         return adv_ret

~/anaconda3/envs/pytorch-flash/lib/python3.9/site-packages/robustness/attacker.py in get_adv_examples(x)
    202                         x.grad.zero_()
    203                     elif (est_grad is None):
--> 204                         grad, = ch.autograd.grad(m * loss, [x])
    205                     else:
    206                         f = lambda _x, _y: m * calc_loss(step.to_image(_x), _y)[0]

~/anaconda3/envs/pytorch-flash/lib/python3.9/site-packages/torch/autograd/__init__.py in grad(outputs, inputs, grad_outputs, retain_graph, create_graph, only_inputs, allow_unused)
    221         retain_graph = create_graph
    222 
--> 223     return Variable._execution_engine.run_backward(
    224         outputs, grad_outputs_, retain_graph, create_graph,
    225         inputs, allow_unused, accumulate_grad=False)

RuntimeError: CUDA error: device-side assert triggered
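
For what it's worth, device-side asserts are reported asynchronously, so the Python stack above may not point at the op that actually failed. To get a more precise traceback I can force blocking kernel launches at the very top of the notebook, before anything touches the GPU:

import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'  # must be set before the first CUDA call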

In addition, when I try to change the batch_size, I get this error:

RuntimeError: cuDNN error: CUDNN_STATUS_MAPPING_ERROR

Any insight into why this is happening and how to solve it?
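
One thing I still need to rule out is an out-of-range target label: if I'm reading identity_CelebA.txt correctly, torchvision's CelebA identity targets are 1-indexed (1 to 10177), so with num_classes = 10177 the largest label would fall outside [0, num_classes - 1] and could trigger exactly this kind of device-side assert in the loss. A minimal check along those lines (just a sketch, the batch size and worker count are arbitrary) would be:

import torch
import torchvision
from torchvision import transforms

# Re-create the training split and scan every identity label once.
ds = torchvision.datasets.CelebA('./celeba', split='train', target_type='identity',
                                 transform=transforms.ToTensor())
loader = torch.utils.data.DataLoader(ds, batch_size=256, num_workers=4)

lo, hi = float('inf'), float('-inf')
for _, target in loader:
    lo = min(lo, target.min().item())
    hi = max(hi, target.max().item())

# For num_classes = 10177 the labels must lie in [0, 10176].
print(f'identity labels range from {lo} to {hi}')

If the maximum does come back as 10177, shifting every target down by one (for example with target_transform=lambda t: t - 1 on both splits) should bring the labels back into the range the loss expects.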
