engine.py

import math
import sys
import time

import torch
import torchvision.models.detection.mask_rcnn

from coco_utils import get_coco_api_from_dataset
from coco_eval import CocoEvaluator
import utils

# ### OUR CODE ###
# let's import our tensorboard helper module
# (it provides `logger`, `global_iter` and `COCO_INSTANCE_CATEGORY_NAMES`)
import tensorboard
# ### END OF OUR CODE ###
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)

        # apply logging only in the main process
        # ### OUR CODE ###
        if utils.is_main_process():
            # let's track the losses here by adding scalars
            tensorboard.logger.add_scalar_dict(
                # passing the dictionary of losses (pairs - loss_key: loss_value)
                loss_dict,
                # passing the global step (number of iterations)
                global_step=tensorboard.global_iter,
                # adding the tag to combine plots in a subgroup
                tag="loss"
            )
            # incrementing the global step (number of iterations)
            tensorboard.global_iter += 1
        # ### END OF OUR CODE ###

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    return metric_logger
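
# --- Illustrative sketch, not part of the original file ---
# train_one_epoch() assumes a small local module named `tensorboard` that is
# not shown in this listing. Judging from the calls above, it has to expose a
# `logger` with an `add_scalar_dict()` method and a module-level `global_iter`
# counter. A minimal sketch of that scalar-logging side, wrapping
# torch.utils.tensorboard.SummaryWriter (everything beyond those three names
# is an assumption), could live in a separate tensorboard.py:
#
#     from torch.utils.tensorboard import SummaryWriter
#
#     class _Logger(SummaryWriter):
#         def add_scalar_dict(self, scalar_dict, global_step=None, tag=None):
#             # write each loss_key: loss_value pair as its own scalar,
#             # optionally prefixed with a common tag (e.g. "loss/loss_classifier")
#             for key, value in scalar_dict.items():
#                 if tag is not None:
#                     key = "{}/{}".format(tag, key)
#                 self.add_scalar(key, value, global_step)
#
#     logger = _Logger()  # writes to ./runs/ by default
#     global_iter = 0     # shared iteration counter, incremented in engine.py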
def _get_iou_types(model):
    model_without_ddp = model
    if isinstance(model, torch.nn.parallel.DistributedDataParallel):
        model_without_ddp = model.module
    iou_types = ["bbox"]
    if isinstance(model_without_ddp, torchvision.models.detection.MaskRCNN):
        iou_types.append("segm")
    if isinstance(model_without_ddp, torchvision.models.detection.KeypointRCNN):
        iou_types.append("keypoints")
    return iou_types
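
# --- Illustrative example, not part of the original file ---
# _get_iou_types() derives the COCO metrics from the model class; e.g. for a
# Mask R-CNN both box and mask IoU types are evaluated:
#
#     >>> m = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=False)
#     >>> _get_iou_types(m)
#     ['bbox', 'segm']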
@torch.no_grad()
def evaluate(model, data_loader, device):
    n_threads = torch.get_num_threads()
    # FIXME remove this and make paste_masks_in_image run on the GPU
    torch.set_num_threads(1)
    cpu_device = torch.device("cpu")
    model.eval()
    metric_logger = utils.MetricLogger(delimiter="  ")
    header = 'Test:'

    coco = get_coco_api_from_dataset(data_loader.dataset)
    iou_types = _get_iou_types(model)
    coco_evaluator = CocoEvaluator(coco, iou_types)

    # we changed these two lines a bit to get the iteration number
    # and to keep the first image tensor of each batch
    for i, (images, targets) in enumerate(metric_logger.log_every(data_loader, 100, header)):
        img = images[0]
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        torch.cuda.synchronize()
        model_time = time.time()
        outputs = model(images)

        # apply logging only in the main process
        # ### OUR CODE ###
        if utils.is_main_process():
            # let's track bounding box and label predictions for the first 50 images,
            # as we hardly want to track all validation images,
            # but we do want to see how the predicted boxes and labels
            # change over the course of training
            if i < 50:
                # let's track images with their predicted bounding boxes
                tensorboard.logger.add_image_with_boxes(
                    # adding the pred_images tag to combine images in one subgroup
                    "pred_images/PD-{}".format(i),
                    # passing the image tensor
                    img,
                    # passing the predicted bounding boxes
                    outputs[0]["boxes"].cpu(),
                    # mapping & passing the predicted labels
                    labels=[
                        tensorboard.COCO_INSTANCE_CATEGORY_NAMES[label]
                        for label in outputs[0]["labels"].cpu().numpy()
                    ],
                )
        # ### END OF OUR CODE ###

        outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs]
        model_time = time.time() - model_time

        res = {target["image_id"].item(): output for target, output in zip(targets, outputs)}
        evaluator_time = time.time()
        coco_evaluator.update(res)
        evaluator_time = time.time() - evaluator_time
        metric_logger.update(model_time=model_time, evaluator_time=evaluator_time)

    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    coco_evaluator.synchronize_between_processes()

    # accumulate predictions from all images
    # main process check added for multi-GPU training (torch.distributed)
    if utils.is_main_process():
        coco_evaluator.accumulate()
        coco_evaluator.summarize()
    torch.set_num_threads(n_threads)
    return coco_evaluator
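
# --- Illustrative sketch, not part of the original file ---
# A minimal driver for the two functions above, in the style of the torchvision
# detection reference scripts (`model`, `dataset`, `dataset_test`, `num_epochs`
# and the hyperparameters are placeholder assumptions):
#
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     model.to(device)
#
#     params = [p for p in model.parameters() if p.requires_grad]
#     optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
#     lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)
#
#     data_loader = torch.utils.data.DataLoader(
#         dataset, batch_size=2, shuffle=True, collate_fn=utils.collate_fn)
#     data_loader_test = torch.utils.data.DataLoader(
#         dataset_test, batch_size=1, shuffle=False, collate_fn=utils.collate_fn)
#
#     for epoch in range(num_epochs):
#         train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
#         lr_scheduler.step()
#         evaluate(model, data_loader_test, device=device)
#
# Note on the image logging in evaluate(): the call
# tensorboard.logger.add_image_with_boxes(tag, img_tensor, box_tensor, labels=...)
# matches the signature of SummaryWriter.add_image_with_boxes(), which suggests
# `logger` is (or wraps) a SummaryWriter and `COCO_INSTANCE_CATEGORY_NAMES` is
# the usual list of COCO class names indexed by label id. That method has been
# deprecated in newer PyTorch releases, so with a recent version you may need to
# draw the boxes yourself (e.g. with torchvision.utils.draw_bounding_boxes) and
# log the result via add_image() instead.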