diff --git a/train_rpnet.py b/train_rpnet.py
index 2251e97..0c4be1c 100644
--- a/train_rpnet.py
+++ b/train_rpnet.py
@@ -67,6 +67,9 @@ def adjust_learning_rate(lr, warmup_epoch, optimizer, epoch: int, step: int, len
 
 
 def train(train_root, val_root, batch_size, output, device, wr2_pretrained):
+    if RANK in {-1, 0} and not os.path.exists(output):
+        os.makedirs(output)
+
     LOGGER.info("=> Create Model")
     # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     model = RPNet(device=device, wr2_pretrained=wr2_pretrained).to(device)
@@ -97,6 +100,8 @@ def train(train_root, val_root, batch_size, output, device, wr2_pretrained):
 
     LOGGER.info("=> Start training")
     t0 = time.time()
+    amp = True
+    scaler = torch.cuda.amp.GradScaler(enabled=amp)
 
     # DDP mode
     cuda = device.type != 'cpu'
@@ -120,17 +125,16 @@ def train(train_root, val_root, batch_size, output, device, wr2_pretrained):
 
             images = images.to(device)
             targets = targets.to(device)
 
-            outputs = model(images)
-
-            loss = criterion(outputs, targets)
-            # if RANK != -1:
-            #     loss *= WORLD_SIZE  # gradient averaged between devices in DDP mode
-            loss.backward()
+            with torch.cuda.amp.autocast(amp):
+                outputs = model(images)
+                loss = criterion(outputs, targets)
+            scaler.scale(loss).backward()
 
             if epoch <= warmup_epoch:
                 adjust_learning_rate(learn_rate, warmup_epoch, optimizer, epoch - 1, idx, len(train_dataloader))
-            optimizer.step()
+            scaler.step(optimizer)  # optimizer.step
+            scaler.update()
             optimizer.zero_grad()
 
             if RANK in {-1, 0}:
@@ -157,14 +161,11 @@ def train(train_root, val_root, batch_size, output, device, wr2_pretrained):
                 ap, acc = ccpd_evaluator.result()
                 LOGGER.info(f"AP:{ap * 100:.3f} ACC: {acc * 100:.3f}")
         scheduler.step()
+        torch.cuda.empty_cache()
 
     LOGGER.info(f'\n{epochs} epochs completed in {(time.time() - t0) / 3600:.3f} hours.')
 
 def main(opt):
-    output = opt.output
-    if not os.path.exists(output):
-        os.makedirs(output)
-
     # DDP mode
     device = select_device(opt.device, batch_size=opt.batch_size)
     if LOCAL_RANK != -1:
@@ -176,10 +177,9 @@ def main(opt):
         device = torch.device('cuda', LOCAL_RANK)
         dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo")
-    # init_seeds(opt.seed + 1 + RANK, deterministic=True)
     init_seeds(opt.seed + 1 + RANK, deterministic=False)
     # LOGGER.info(f"LOCAL_RANK: {LOCAL_RANK} RANK: {RANK} WORLD_SIZE: {WORLD_SIZE}")
 
-    train(opt.train_root, opt.val_root, opt.batch_size, output, device, opt.wr2_pretrained)
+    train(opt.train_root, opt.val_root, opt.batch_size, opt.output, device, opt.wr2_pretrained)
 
 
 if __name__ == '__main__':
diff --git a/train_wr2.py b/train_wr2.py
index 99d5d18..63457b0 100644
--- a/train_wr2.py
+++ b/train_wr2.py
@@ -66,6 +66,9 @@ def adjust_learning_rate(lr, warmup_epoch, optimizer, epoch: int, step: int, len
 
 
 def train(train_root, val_root, batch_size, output, device):
+    if RANK in {-1, 0} and not os.path.exists(output):
+        os.makedirs(output)
+
     LOGGER.info("=> Create Model")
     # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     model = wR2(num_classes=4).to(device)
@@ -96,6 +99,8 @@ def train(train_root, val_root, batch_size, output, device):
 
     LOGGER.info("=> Start training")
     t0 = time.time()
+    amp = True
+    scaler = torch.cuda.amp.GradScaler(enabled=amp)
 
     # DDP mode
     cuda = device.type != 'cpu'
@@ -119,17 +124,16 @@ def train(train_root, val_root, batch_size, output, device):
 
             images = images.to(device)
             targets = targets.to(device)
 
-            outputs = model(images)
-
-            loss = criterion(outputs, targets)
-            # if RANK != -1:
-            #     loss *= WORLD_SIZE  # gradient averaged between devices in DDP mode
-            loss.backward()
+            with torch.cuda.amp.autocast(amp):
+                outputs = model(images)
+                loss = criterion(outputs, targets)
+            scaler.scale(loss).backward()
 
             if epoch <= warmup_epoch:
                 adjust_learning_rate(learn_rate, warmup_epoch, optimizer, epoch - 1, idx, len(train_dataloader))
-            optimizer.step()
+            scaler.step(optimizer)  # optimizer.step
+            scaler.update()
             optimizer.zero_grad()
 
             if RANK in {-1, 0}:
@@ -156,14 +160,11 @@ def train(train_root, val_root, batch_size, output, device):
                 ap, _ = ccpd_evaluator.result()
                 LOGGER.info(f"AP: {ap * 100:.3f}")
         scheduler.step()
+        torch.cuda.empty_cache()
 
     LOGGER.info(f'\n{epochs} epochs completed in {(time.time() - t0) / 3600:.3f} hours.')
 
 def main(opt):
-    output = opt.output
-    if not os.path.exists(output):
-        os.makedirs(output)
-
     # DDP mode
     device = select_device(opt.device, batch_size=opt.batch_size)
     if LOCAL_RANK != -1:
@@ -177,7 +178,7 @@ def main(opt):
     init_seeds(opt.seed + 1 + RANK, deterministic=True)
     # LOGGER.info(f"LOCAL_RANK: {LOCAL_RANK} RANK: {RANK} WORLD_SIZE: {WORLD_SIZE}")
 
-    train(opt.train_root, opt.val_root, opt.batch_size, output, device)
+    train(opt.train_root, opt.val_root, opt.batch_size, opt.output, device)
 
 
 if __name__ == '__main__':
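
Both training scripts switch the step to PyTorch automatic mixed precision (torch.cuda.amp). For reference, below is a minimal, self-contained sketch of the autocast + GradScaler pattern the hunks above follow; the tiny model, optimizer, and random data are placeholders for illustration only, not code from this repository.

    import torch

    # Minimal sketch of the autocast + GradScaler training step (assumed illustrative names).
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = torch.nn.Linear(10, 2).to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    criterion = torch.nn.CrossEntropyLoss()

    amp = device.type == 'cuda'                       # mixed precision only applies on CUDA
    scaler = torch.cuda.amp.GradScaler(enabled=amp)   # becomes a no-op when enabled=False

    for _ in range(10):                               # stand-in for iterating a DataLoader
        images = torch.randn(4, 10, device=device)
        targets = torch.randint(0, 2, (4,), device=device)

        with torch.cuda.amp.autocast(enabled=amp):    # forward pass runs in float16 where safe
            outputs = model(images)
            loss = criterion(outputs, targets)

        scaler.scale(loss).backward()                 # scale loss so float16 grads don't underflow
        scaler.step(optimizer)                        # unscales grads; skips the step on inf/NaN
        scaler.update()                               # adjusts the scale factor for the next step
        optimizer.zero_grad()

Calling scaler.update() right after scaler.step(optimizer), as both files do, keeps the loss-scale bookkeeping consistent from one iteration to the next.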