We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
I'm using your Python package to adversarially train a ResNet50 network on the CelebA dataset. My code looks something like this:
"""Adversarially train a ResNet-50 on CelebA identities using the robustness package."""
import torch

torch.manual_seed(42)

import torchvision
from torchvision import transforms
from torch import nn
from robustness.datasets import CelebA
from robustness.model_utils import make_and_restore_model
from cox.utils import Parameters
from cox import store
from robustness import model_utils, datasets, train, defaults

transform = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ToTensor(),
    transforms.RandomErasing(),
])

# BUG FIX: torchvision's CelebA identity labels are 1-indexed (1..10177),
# but nn.CrossEntropyLoss requires class indices in [0, num_classes - 1].
# A target equal to num_classes (10177) trips the CUDA device-side assert
# seen partway through training (it only fires once such a batch appears).
# Shift every identity label down by one so targets are 0-indexed.
shift_label = lambda t: t - 1

celeba_train = torchvision.datasets.CelebA(
    './celeba', split='train', target_type='identity',
    transform=transform, target_transform=shift_label)
celeba_val = torchvision.datasets.CelebA(
    './celeba', split='valid', target_type='identity',
    transform=transform, target_transform=shift_label)

train_loader = torch.utils.data.DataLoader(
    celeba_train, batch_size=16, shuffle=True, num_workers=4)
val_loader = torch.utils.data.DataLoader(
    celeba_val, batch_size=16, shuffle=True, num_workers=4)

# robustness DataSet wrapper: supplies normalization stats and the model factory.
ds = CelebA('./celeba')
m, _ = make_and_restore_model(arch='resnet50', pytorch_pretrained=False, dataset=ds)

# Create a cox store for logging.
OUT_DIR = './outputs'
out_store = store.Store(OUT_DIR)

# PGD adversarial-training hyperparameters (L2 constraint).
train_kwargs = {
    'out_dir': "train_out",
    'adv_train': 1,
    'constraint': '2',
    'eps': 0.01,
    'attack_lr': 0.005,
    'attack_steps': 40,
    'epochs': 100,
}
train_args = Parameters(train_kwargs)

# Fill whatever parameters are missing from the defaults.
train_args = defaults.check_and_fill_args(train_args, defaults.TRAINING_ARGS, CelebA)
train_args = defaults.check_and_fill_args(train_args, defaults.PGD_ARGS, CelebA)

# Train a model.
train.train_model(train_args, m, (train_loader, val_loader), store=out_store)
My definition for CelebA in robustness/datasets.py is:
class CelebA(DataSet):
    """CelebA identity classification wrapped as a robustness ``DataSet``.

    Uses torchvision's CelebA loader as the custom dataset class and reuses
    the ImageNet-style train/test transforms from the robustness package.
    """

    def __init__(self, data_path, **kwargs):
        # 10,177 distinct celebrity identities in CelebA.
        self.num_classes = 10177
        dataset_kwargs = dict(
            num_classes=self.num_classes,
            # Per-channel normalization statistics for CelebA images.
            mean=torch.tensor([0.5061, 0.4254, 0.3828]),
            std=torch.tensor([0.2658, 0.2452, 0.2413]),
            custom_class=datasets.CelebA,
            transform_train=da.TRAIN_TRANSFORMS_IMAGENET,
            transform_test=da.TEST_TRANSFORMS_IMAGENET,
        )
        super(CelebA, self).__init__('celeba', data_path, **dataset_kwargs)

    def get_model(self, arch, pretrained=False):
        """Build an ImageNet-family architecture sized for CelebA's classes."""
        return imagenet_models.__dict__[arch](
            num_classes=self.num_classes, pretrained=pretrained)
Once I run my code it starts training as expected but then after a while I get this error:
--------------------------------------------------------------------------- RuntimeError Traceback (most recent call last) <ipython-input-4-cade5f159ff1> in <module> 18 19 # Train a model ---> 20 train.train_model(train_args, m, (train_loader, val_loader), store=out_store) ~/anaconda3/envs/pytorch-flash/lib/python3.9/site-packages/robustness/train.py in train_model(args, model, loaders, checkpoint, dp_device_ids, store, update_params, disable_no_grad) 309 for epoch in range(start_epoch, args.epochs): 310 # train for one epoch --> 311 train_prec1, train_loss = _model_loop(args, 'train', train_loader, 312 model, opt, epoch, args.adv_train, writer) 313 last_epoch = (epoch == (args.epochs - 1)) ~/anaconda3/envs/pytorch-flash/lib/python3.9/site-packages/robustness/train.py in _model_loop(args, loop_type, loader, model, opt, epoch, adv, writer) 445 # measure data loading time 446 target = target.cuda(non_blocking=True) --> 447 output, final_inp = model(inp, target=target, make_adv=adv, 448 **attack_kwargs) 449 loss = train_criterion(output, target) ~/anaconda3/envs/pytorch-flash/lib/python3.9/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs) 887 result = self._slow_forward(*input, **kwargs) 888 else: --> 889 result = self.forward(*input, **kwargs) 890 for hook in itertools.chain( 891 _global_forward_hooks.values(), ~/anaconda3/envs/pytorch-flash/lib/python3.9/site-packages/torch/nn/parallel/data_parallel.py in forward(self, *inputs, **kwargs) 163 164 if len(self.device_ids) == 1: --> 165 return self.module(*inputs[0], **kwargs[0]) 166 replicas = self.replicate(self.module, self.device_ids[:len(inputs)]) 167 outputs = self.parallel_apply(replicas, inputs, kwargs) ~/anaconda3/envs/pytorch-flash/lib/python3.9/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs) 887 result = self._slow_forward(*input, **kwargs) 888 else: --> 889 result = self.forward(*input, **kwargs) 890 for hook in itertools.chain( 891 
_global_forward_hooks.values(), ~/anaconda3/envs/pytorch-flash/lib/python3.9/site-packages/robustness/attacker.py in forward(self, inp, target, make_adv, with_latent, fake_relu, no_relu, with_image, **attacker_kwargs) 311 prev_training = bool(self.training) 312 self.eval() --> 313 adv = self.attacker(inp, target, **attacker_kwargs) 314 if prev_training: 315 self.train() ~/anaconda3/envs/pytorch-flash/lib/python3.9/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs) 887 result = self._slow_forward(*input, **kwargs) 888 else: --> 889 result = self.forward(*input, **kwargs) 890 for hook in itertools.chain( 891 _global_forward_hooks.values(), ~/anaconda3/envs/pytorch-flash/lib/python3.9/site-packages/robustness/attacker.py in forward(self, x, target, constraint, eps, step_size, iterations, random_start, random_restarts, do_tqdm, targeted, custom_loss, should_normalize, orig_input, use_best, return_image, est_grad, mixed_precision, *_) 247 adv_ret = to_ret 248 else: --> 249 adv_ret = get_adv_examples(x) 250 251 return adv_ret ~/anaconda3/envs/pytorch-flash/lib/python3.9/site-packages/robustness/attacker.py in get_adv_examples(x) 202 x.grad.zero_() 203 elif (est_grad is None): --> 204 grad, = ch.autograd.grad(m * loss, [x]) 205 else: 206 f = lambda _x, _y: m * calc_loss(step.to_image(_x), _y)[0] ~/anaconda3/envs/pytorch-flash/lib/python3.9/site-packages/torch/autograd/__init__.py in grad(outputs, inputs, grad_outputs, retain_graph, create_graph, only_inputs, allow_unused) 221 retain_graph = create_graph 222 --> 223 return Variable._execution_engine.run_backward( 224 outputs, grad_outputs_, retain_graph, create_graph, 225 inputs, allow_unused, accumulate_grad=False) RuntimeError: CUDA error: device-side assert triggered
In addition, when I try to change the batch_size I get the error:
RuntimeError: cuDNN error: CUDNN_STATUS_MAPPING_ERROR
Any insight into why this is happening and how to solve it?
The text was updated successfully, but these errors were encountered:
No branches or pull requests
I'm using your Python package to adversarially train a ResNet50 network on the CelebA dataset. My code looks something like this:
My definition for CelebA in robustness/datasets.py is:
Once I run my code it starts training as expected but then after a while I get this error:
In addition, when I try to change the batch_size I get the error:
Any insight into why this is happening and how to solve it?
The text was updated successfully, but these errors were encountered: