- r"""PyTorch Detection Training.
- To run in a multi-gpu environment, use the distributed launcher::
- python -m torch.distributed.launch --nproc_per_node=$NGPU --use_env \
- train.py ... --world-size $NGPU
- The default hyperparameters are tuned for training on 8 gpus and 2 images per gpu.
- --lr 0.02 --batch-size 2 --world-size 8
- If you use different number of gpus, the learning rate should be changed to 0.02/8*$NGPU.
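For example, with 4 GPUs: --lr 0.01 (= 0.02 / 8 * 4) --world-size 4.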

On top of that, for training Faster/Mask R-CNN, the default hyperparameters are

    --epochs 26 --lr-steps 16 22 --aspect-ratio-group-factor 3

Also, if you train Keypoint R-CNN, the default hyperparameters are

    --epochs 46 --lr-steps 36 43 --aspect-ratio-group-factor 3

Because the number of images is smaller in the person-keypoint subset of COCO,
the number of epochs should be adapted so that we have the same number of iterations.
"""
import datetime
import os
import time

import torch
import torch.utils.data
from torch import nn
import torchvision
import torchvision.models.detection
import torchvision.models.detection.mask_rcnn

from coco_utils import get_coco, get_coco_kp
from group_by_aspect_ratio import GroupedBatchSampler, create_aspect_ratio_groups
from engine import train_one_epoch, evaluate
import utils
import transforms as T

# ### OUR CODE ###
# import tensorboard and W&B
import tensorboard
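# NOTE: this appears to assume `tensorboard` resolves to a project-local helper
# module that exposes a `name` attribute and accepts an `args` attribute (both
# are used in main() below), rather than the standalone tensorboard package.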
import wandb
# ### END OF OUR CODE ###


def get_dataset(name, image_set, transform, data_path):
    paths = {
        "coco": (data_path, get_coco, 91),
        "coco_kp": (data_path, get_coco_kp, 2)
    }
    p, ds_fn, num_classes = paths[name]

    ds = ds_fn(p, image_set=image_set, transforms=transform)
    return ds, num_classes


def get_transform(train):
    transforms = []
    transforms.append(T.ToTensor())
    if train:
        transforms.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(transforms)


def main(args):
    utils.init_distributed_mode(args)
    print(args)

    # apply logging only in the main process
    # ### OUR CODE ###
    if utils.is_main_process():
        # pass the argparse config with the hyperparameters
        tensorboard.args = vars(args)
        # init wandb using the config and experiment name,
        # with tensorboard sync enabled
        wandb.init(config=vars(args), name=tensorboard.name, sync_tensorboard=True)
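        # With sync_tensorboard=True, wandb patches tensorboard's SummaryWriter
        # so that metrics written to tensorboard are mirrored to the W&B run.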
    # ### END OF OUR CODE ###

    device = torch.device(args.device)

    # Data loading code
    print("Loading data")

    dataset, num_classes = get_dataset(args.dataset, "train", get_transform(train=True), args.data_path)
    dataset_test, _ = get_dataset(args.dataset, "val", get_transform(train=False), args.data_path)

    print("Creating data loaders")
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
        test_sampler = torch.utils.data.distributed.DistributedSampler(dataset_test)
    else:
        train_sampler = torch.utils.data.RandomSampler(dataset)
        test_sampler = torch.utils.data.SequentialSampler(dataset_test)
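
    # Batching images of similar aspect ratio together reduces the amount of
    # padding needed when images are collated into a single tensor.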
    if args.aspect_ratio_group_factor >= 0:
        group_ids = create_aspect_ratio_groups(dataset, k=args.aspect_ratio_group_factor)
        train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size)
    else:
        train_batch_sampler = torch.utils.data.BatchSampler(
            train_sampler, args.batch_size, drop_last=True)
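
    # utils.collate_fn (from the reference utils) zips the batch so images and
    # targets stay as tuples; detection images vary in size, so they cannot be
    # stacked into one tensor.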
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_sampler=train_batch_sampler, num_workers=args.workers,
        collate_fn=utils.collate_fn)

    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=1,
        sampler=test_sampler, num_workers=args.workers,
        collate_fn=utils.collate_fn)
- print("Creating model")
- model = torchvision.models.detection.__dict__[args.model](num_classes=num_classes,
- pretrained=args.pretrained)
- model.to(device)
- model_without_ddp = model
- if args.distributed:
- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
- model_without_ddp = model.module
- params = [p for p in model.parameters() if p.requires_grad]
- optimizer = torch.optim.SGD(
- params, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
- # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)
- lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)
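    # MultiStepLR multiplies the learning rate by lr-gamma at each epoch listed
    # in lr-steps; the defaults (16, 22) match the schedule in the docstring.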

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1

    if args.test_only:
        evaluate(model, data_loader_test, device=device)
        return
- print("Start training")
- start_time = time.time()
- for epoch in range(args.start_epoch, args.epochs):
- if args.distributed:
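            # DistributedSampler seeds its shuffle with the epoch number, so
            # set_epoch ensures each epoch sees a different permutation.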
            train_sampler.set_epoch(epoch)
        train_one_epoch(model, optimizer, data_loader, device, epoch, args.print_freq)
        lr_scheduler.step()
        if args.output_dir:
            utils.save_on_master({
                'model': model_without_ddp.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'args': args,
                'epoch': epoch},
                os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))

        # evaluate after every epoch
        evaluate(model, data_loader_test, device=device)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(
        description=__doc__)

    parser.add_argument('--data-path', default='/datasets01/COCO/022719/', help='dataset path')
    parser.add_argument('--dataset', default='coco', help='dataset name')
    parser.add_argument('--model', default='maskrcnn_resnet50_fpn', help='model name')
    parser.add_argument('--device', default='cuda', help='device')
    parser.add_argument('-b', '--batch-size', default=2, type=int,
                        help='images per GPU; the total batch size is $NGPU x batch_size')
    parser.add_argument('--epochs', default=26, type=int, metavar='N',
                        help='number of total epochs to run')
    parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
                        help='number of data loading workers (default: 4)')
    parser.add_argument('--lr', default=0.02, type=float,
                        help='initial learning rate; 0.02 is the default value for training '
                             'on 8 GPUs with 2 images per GPU')
    parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                        help='momentum')
    parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
                        metavar='W', help='weight decay (default: 1e-4)',
                        dest='weight_decay')
    parser.add_argument('--lr-step-size', default=8, type=int, help='decrease lr every step-size epochs')
    parser.add_argument('--lr-steps', default=[16, 22], nargs='+', type=int, help='epochs at which to decrease lr')
    parser.add_argument('--lr-gamma', default=0.1, type=float, help='decrease lr by a factor of lr-gamma')
    parser.add_argument('--print-freq', default=20, type=int, help='print frequency')
    parser.add_argument('--output-dir', default='.', help='path where to save')
    parser.add_argument('--resume', default='', help='resume from checkpoint')
    parser.add_argument('--start_epoch', default=0, type=int, help='start epoch')
    parser.add_argument('--aspect-ratio-group-factor', default=3, type=int)
    parser.add_argument(
        "--test-only",
        dest="test_only",
        help="Only test the model",
        action="store_true",
    )
    parser.add_argument(
        "--pretrained",
        dest="pretrained",
        help="Use pre-trained models from the modelzoo",
        action="store_true",
    )

    # distributed training parameters
    parser.add_argument('--world-size', default=1, type=int,
                        help='number of distributed processes')
    parser.add_argument('--dist-url', default='env://', help='url used to set up distributed training')

    args = parser.parse_args()

    if args.output_dir:
        utils.mkdir(args.output_dir)

    main(args)