From a3491d289373367036aa90dfdad0e96ec1767086 Mon Sep 17 00:00:00 2001
From: Vasileios Vryniotis
Date: Fri, 16 Oct 2020 19:35:20 +0100
Subject: [PATCH 1/8] Vectorize operations across all feature levels.

---
 .gitignore                                    |  1 +
 ...ter.test_retinanet_resnet50_fpn_expect.pkl | Bin 1357 -> 1357 bytes
 torchvision/models/detection/retinanet.py     | 84 ++++++------------
 3 files changed, 30 insertions(+), 55 deletions(-)

diff --git a/.gitignore b/.gitignore
index 4ed0749da06..c332dda4f4a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,5 +22,6 @@ htmlcov
 gen.yml
 .mypy_cache
 .vscode/
+.idea/
 *.orig
 *-checkpoint.ipynb
\ No newline at end of file

diff --git a/test/expect/ModelTester.test_retinanet_resnet50_fpn_expect.pkl b/test/expect/ModelTester.test_retinanet_resnet50_fpn_expect.pkl
index 5b5079f20f882e18dccadc790c6cab0c5cb88cb2..59cae788ac44a4766df5bd387a1daeb6947d2381 100644
GIT binary patch
[binary delta payload omitted]

diff --git a/torchvision/models/detection/retinanet.py b/torchvision/models/detection/retinanet.py
index 770c5dcb1b0..32ecf687d1b 100644
--- a/torchvision/models/detection/retinanet.py
+++ b/torchvision/models/detection/retinanet.py
@@ -410,74 +410,48 @@ def compute_loss(self, targets, head_outputs, anchors):
 
     def postprocess_detections(self, head_outputs, anchors, image_shapes):
         # type: (Dict[str, Tensor], List[Tensor], List[Tuple[int, int]]) -> List[Dict[str, Tensor]]
         # TODO: Merge this with roi_heads.RoIHeads.postprocess_detections ?
-
         class_logits = head_outputs.pop('cls_logits')
         box_regression = head_outputs.pop('bbox_regression')
         other_outputs = head_outputs
 
-        device = class_logits.device
         num_classes = class_logits.shape[-1]
 
         scores = torch.sigmoid(class_logits)
 
-        # create labels for each score
-        labels = torch.arange(num_classes, device=device)
-        labels = labels.view(1, -1).expand_as(scores)
-
         detections = torch.jit.annotate(List[Dict[str, Tensor]], [])
 
-        for index, (box_regression_per_image, scores_per_image, labels_per_image, anchors_per_image, image_shape) in \
-                enumerate(zip(box_regression, scores, labels, anchors, image_shapes)):
+        for index, (box_regression_per_image, scores_per_image, anchors_per_image, image_shape) in \
+                enumerate(zip(box_regression, scores, anchors, image_shapes)):
+            # remove low scoring boxes
+            scores_per_image = scores_per_image.flatten()
+            keep_idxs = scores_per_image > self.score_thresh
+            scores_per_image = scores_per_image[keep_idxs]
+            topk_idxs = torch.where(keep_idxs)[0]
+
+            # keep only topk scoring predictions
+            num_topk = min(self.detections_per_img, topk_idxs.size(0))
+            scores_per_image, idxs = scores_per_image.topk(num_topk)
+            topk_idxs = topk_idxs[idxs]
 
-            boxes_per_image = self.box_coder.decode_single(box_regression_per_image, anchors_per_image)
+            anchor_idxs = topk_idxs // num_classes
+            labels_per_image = topk_idxs % num_classes
+
+            boxes_per_image = self.box_coder.decode_single(box_regression_per_image[anchor_idxs],
+                                                           anchors_per_image[anchor_idxs])
             boxes_per_image = box_ops.clip_boxes_to_image(boxes_per_image, image_shape)
 
-            other_outputs_per_image = [(k, v[index]) for k, v in other_outputs.items()]
-
-            image_boxes = []
-            image_scores = []
-            image_labels = []
-            image_other_outputs = torch.jit.annotate(Dict[str, List[Tensor]], {})
-
-            for class_index in range(num_classes):
-                # remove low scoring boxes
-                inds = torch.gt(scores_per_image[:, class_index], self.score_thresh)
-                boxes_per_class, scores_per_class, labels_per_class = \
-                    boxes_per_image[inds], scores_per_image[inds, class_index], labels_per_image[inds, class_index]
-                other_outputs_per_class = [(k, v[inds]) for k, v in other_outputs_per_image]
-
-                # remove empty boxes
-                keep = box_ops.remove_small_boxes(boxes_per_class, min_size=1e-2)
-                boxes_per_class, scores_per_class, labels_per_class = \
-                    boxes_per_class[keep], scores_per_class[keep], labels_per_class[keep]
-                other_outputs_per_class = [(k, v[keep]) for k, v in other_outputs_per_class]
-
-                # non-maximum suppression, independently done per class
-                keep = box_ops.nms(boxes_per_class, scores_per_class, self.nms_thresh)
-
-                # keep only topk scoring predictions
-                keep = keep[:self.detections_per_img]
-                boxes_per_class, scores_per_class, labels_per_class = \
-                    boxes_per_class[keep], scores_per_class[keep], labels_per_class[keep]
-                other_outputs_per_class = [(k, v[keep]) for k, v in other_outputs_per_class]
-
-                image_boxes.append(boxes_per_class)
-                image_scores.append(scores_per_class)
-                image_labels.append(labels_per_class)
-
-                for k, v in other_outputs_per_class:
-                    if k not in image_other_outputs:
-                        image_other_outputs[k] = []
-                    image_other_outputs[k].append(v)
-
-            detections.append({
-                'boxes': torch.cat(image_boxes, dim=0),
-                'scores': torch.cat(image_scores, dim=0),
-                'labels': torch.cat(image_labels, dim=0),
-            })
-
-            for k, v in image_other_outputs.items():
-                detections[-1].update({k: torch.cat(v, dim=0)})
+            # non-maximum suppression
+            keep = box_ops.batched_nms(boxes_per_image, scores_per_image, labels_per_image, self.nms_thresh)
+
+            det = {
+                'boxes': boxes_per_image[keep],
+                'scores': scores_per_image[keep],
+                'labels': labels_per_image[keep],
+            }
+            for k, v in other_outputs.items():
+                det[k] = v[index][keep]
+
+            detections.append(det)
 
         return detections
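Note on the vectorized lookup introduced above: flattening the per-image (num_anchors, num_classes) score matrix row-major means a flat index i corresponds to anchor i // num_classes and class i % num_classes, which is what lets a single topk call replace the per-class loop. A minimal sketch of the arithmetic (shapes are illustrative, not taken from the patch):

    import torch

    num_anchors, num_classes = 6, 3
    scores = torch.rand(num_anchors, num_classes).flatten()

    keep_idxs = scores > 0.05                  # boolean mask over flat scores
    topk_idxs = torch.where(keep_idxs)[0]      # flat indices that survive

    anchor_idxs = topk_idxs // num_classes     # recover the anchor index
    labels = topk_idxs % num_classes           # recover the class index

    # sanity check: (anchor, label) maps back to the flat index
    assert torch.equal(anchor_idxs * num_classes + labels, topk_idxs)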
From 1001612b5681aa9855ca28843798b2c8b6b9e0fd Mon Sep 17 00:00:00 2001
From: Vasileios Vryniotis
Date: Sun, 18 Oct 2020 13:03:10 +0100
Subject: [PATCH 2/8] Remove unnecessary other_outputs variable.

---
 torchvision/models/detection/retinanet.py | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/torchvision/models/detection/retinanet.py b/torchvision/models/detection/retinanet.py
index 32ecf687d1b..d316575a8b1 100644
--- a/torchvision/models/detection/retinanet.py
+++ b/torchvision/models/detection/retinanet.py
@@ -409,10 +409,9 @@ def compute_loss(self, targets, head_outputs, anchors):
 
     def postprocess_detections(self, head_outputs, anchors, image_shapes):
         # type: (Dict[str, Tensor], List[Tensor], List[Tuple[int, int]]) -> List[Dict[str, Tensor]]
-        # TODO: Merge this with roi_heads.RoIHeads.postprocess_detections ?
-        class_logits = head_outputs.pop('cls_logits')
-        box_regression = head_outputs.pop('bbox_regression')
-        other_outputs = head_outputs
+        # TODO: confirm that RetinaNet can't have other outputs like masks
+        class_logits = head_outputs['cls_logits']
+        box_regression = head_outputs['bbox_regression']
 
         num_classes = class_logits.shape[-1]
 
@@ -443,15 +442,11 @@ def postprocess_detections(self, head_outputs, anchors, image_shapes):
 
             # non-maximum suppression
             keep = box_ops.batched_nms(boxes_per_image, scores_per_image, labels_per_image, self.nms_thresh)
 
-            det = {
+            detections.append({
                 'boxes': boxes_per_image[keep],
                 'scores': scores_per_image[keep],
                 'labels': labels_per_image[keep],
-            }
-            for k, v in other_outputs.items():
-                det[k] = v[index][keep]
-
-            detections.append(det)
+            })
 
         return detections
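For context, box_ops.batched_nms (used above) suppresses boxes per class without a Python loop over classes. A sketch of the usual coordinate-offset trick behind it, assuming the torchvision semantics but not reproducing its exact implementation:

    import torch
    from torchvision.ops import nms

    def batched_nms_sketch(boxes, scores, labels, iou_thresh):
        # Shift each box by label * (max coordinate + 1): boxes of different
        # classes can then never overlap, so a single class-agnostic NMS pass
        # only suppresses within a class.
        if boxes.numel() == 0:
            return torch.empty((0,), dtype=torch.int64, device=boxes.device)
        offsets = labels.to(boxes) * (boxes.max() + 1)
        return nms(boxes + offsets[:, None], scores, iou_thresh)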
From 42d2661468a435981c063533eccddc0125a51630 Mon Sep 17 00:00:00 2001
From: Vasileios Vryniotis
Date: Sun, 18 Oct 2020 13:52:43 +0100
Subject: [PATCH 3/8] Split per feature level.

---
 ...ter.test_retinanet_resnet50_fpn_expect.pkl | Bin 1357 -> 1357 bytes
 torchvision/models/detection/retinanet.py     | 82 ++++++++++++------
 2 files changed, 55 insertions(+), 27 deletions(-)

diff --git a/test/expect/ModelTester.test_retinanet_resnet50_fpn_expect.pkl b/test/expect/ModelTester.test_retinanet_resnet50_fpn_expect.pkl
index 59cae788ac44a4766df5bd387a1daeb6947d2381..843a1d27d8f7a9cd32cf59d1b7d88bcc53918d88 100644
GIT binary patch
[binary delta payload omitted]

diff --git a/torchvision/models/detection/retinanet.py b/torchvision/models/detection/retinanet.py
index d316575a8b1..b5b3ea95077 100644
--- a/torchvision/models/detection/retinanet.py
+++ b/torchvision/models/detection/retinanet.py
@@ -409,43 +409,55 @@ def compute_loss(self, targets, head_outputs, anchors):
 
     def postprocess_detections(self, head_outputs, anchors, image_shapes):
-        # type: (Dict[str, Tensor], List[Tensor], List[Tuple[int, int]]) -> List[Dict[str, Tensor]]
-        # TODO: confirm that RetinaNet can't have other outputs like masks
+        # type: (Dict[str, List[Tensor]], List[List[Tensor]], List[Tuple[int, int]]) -> List[Dict[str, Tensor]]
         class_logits = head_outputs['cls_logits']
         box_regression = head_outputs['bbox_regression']
 
-        num_classes = class_logits.shape[-1]
-
-        scores = torch.sigmoid(class_logits)
+        num_images = len(image_shapes)
 
         detections = torch.jit.annotate(List[Dict[str, Tensor]], [])
 
-        for index, (box_regression_per_image, scores_per_image, anchors_per_image, image_shape) in \
-                enumerate(zip(box_regression, scores, anchors, image_shapes)):
-            # remove low scoring boxes
-            scores_per_image = scores_per_image.flatten()
-            keep_idxs = scores_per_image > self.score_thresh
-            scores_per_image = scores_per_image[keep_idxs]
-            topk_idxs = torch.where(keep_idxs)[0]
+        for index in range(num_images):
+            box_regression_per_image = [br[index] for br in box_regression]
+            logits_per_image = [cl[index] for cl in class_logits]
+            anchors_per_image, image_shape = anchors[index], image_shapes[index]
+
+            image_boxes = []
+            image_scores = []
+            image_labels = []
+
+            for box_regression_per_level, logits_per_level, anchors_per_level in \
+                    zip(box_regression_per_image, logits_per_image, anchors_per_image):
+                num_classes = logits_per_level.shape[-1]
 
-            # keep only topk scoring predictions
-            num_topk = min(self.detections_per_img, topk_idxs.size(0))
-            scores_per_image, idxs = scores_per_image.topk(num_topk)
-            topk_idxs = topk_idxs[idxs]
+                # remove low scoring boxes
+                scores_per_level = torch.sigmoid(logits_per_level).flatten()
+                keep_idxs = scores_per_level > self.score_thresh
+                scores_per_level = scores_per_level[keep_idxs]
+                topk_idxs = torch.where(keep_idxs)[0]
 
-            anchor_idxs = topk_idxs // num_classes
-            labels_per_image = topk_idxs % num_classes
+                # keep only topk scoring predictions
+                num_topk = min(self.detections_per_img, topk_idxs.size(0))
+                scores_per_level, idxs = scores_per_level.topk(num_topk)
+                topk_idxs = topk_idxs[idxs]
 
-            boxes_per_image = self.box_coder.decode_single(box_regression_per_image[anchor_idxs],
-                                                           anchors_per_image[anchor_idxs])
-            boxes_per_image = box_ops.clip_boxes_to_image(boxes_per_image, image_shape)
+                anchor_idxs = topk_idxs // num_classes
+                labels_per_level = topk_idxs % num_classes
 
-            # non-maximum suppression
-            keep = box_ops.batched_nms(boxes_per_image, scores_per_image, labels_per_image, self.nms_thresh)
+                boxes_per_level = self.box_coder.decode_single(box_regression_per_level[anchor_idxs],
+                                                               anchors_per_level[anchor_idxs])
+                boxes_per_level = box_ops.clip_boxes_to_image(boxes_per_level, image_shape)
+
+                # non-maximum suppression
+                keep = box_ops.batched_nms(boxes_per_level, scores_per_level, labels_per_level, self.nms_thresh)
+
+                image_boxes.append(boxes_per_level[keep])
+                image_scores.append(scores_per_level[keep])
+                image_labels.append(labels_per_level[keep])
 
             detections.append({
-                'boxes': boxes_per_image[keep],
-                'scores': scores_per_image[keep],
-                'labels': labels_per_image[keep],
+                'boxes': torch.cat(image_boxes, dim=0),
+                'scores': torch.cat(image_scores, dim=0),
+                'labels': torch.cat(image_labels, dim=0),
             })
 
         return detections
@@ -526,8 +538,24 @@ def forward(self, images, targets=None):
             # compute the losses
             losses = self.compute_loss(targets, head_outputs, anchors)
         else:
+            # recover level sizes
+            feature_sizes_per_level = [x.size(2) * x.size(3) for x in features]
+            HW = 0
+            for v in feature_sizes_per_level:
+                HW += v
+            HWA = head_outputs['cls_logits'].size(1)
+            A = HWA // HW
+            feature_sizes_per_level = [hw * A for hw in feature_sizes_per_level]
+
+            # split outputs per level
+            split_head_outputs: Dict[str, List[Tensor]] = {}
+            for k in head_outputs:
+                split_head_outputs[k] = [x.permute(1, 0, 2) for x in
+                                         head_outputs[k].permute(1, 0, 2).split_with_sizes(feature_sizes_per_level)]
+            split_anchors = [list(a.split_with_sizes(feature_sizes_per_level)) for a in anchors]
+
             # compute the detections
-            detections = self.postprocess_detections(head_outputs, anchors, images.image_sizes)
+            detections = self.postprocess_detections(split_head_outputs, split_anchors, images.image_sizes)
             detections = self.transform.postprocess(detections, images.image_sizes, original_image_sizes)
 
         if torch.jit.is_scripting():
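The forward() change above has to undo the head's flattening: each head output is shaped (N, HWA, K), where HW is the total number of feature-map cells across levels and A the number of anchors per cell, so A is recoverable as HWA // HW. A small sketch of the splitting logic with made-up sizes:

    import torch

    N, A, K = 2, 9, 91
    hw_per_level = [4 * 4, 2 * 2]              # cells per feature level
    HW = sum(hw_per_level)
    cls_logits = torch.rand(N, HW * A, K)      # (N, HWA, K), as in the head

    A_recovered = cls_logits.size(1) // HW
    sizes = [hw * A_recovered for hw in hw_per_level]

    # same permute/split_with_sizes dance as the patch: split dim 1 by
    # moving it to the front and back again
    per_level = [x.permute(1, 0, 2) for x in
                 cls_logits.permute(1, 0, 2).split_with_sizes(sizes)]
    assert [x.size(1) for x in per_level] == sizes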
From 61e0bf6a4db266c241b22e874cad8757f7bc31bb Mon Sep 17 00:00:00 2001
From: Vasileios Vryniotis
Date: Sun, 18 Oct 2020 15:09:07 +0100
Subject: [PATCH 4/8] Perform batched_nms across feature levels.

---
 ...ter.test_retinanet_resnet50_fpn_expect.pkl | Bin 1357 -> 1357 bytes
 torchvision/models/detection/retinanet.py     | 20 +++++++++++-------
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/test/expect/ModelTester.test_retinanet_resnet50_fpn_expect.pkl b/test/expect/ModelTester.test_retinanet_resnet50_fpn_expect.pkl
index 843a1d27d8f7a9cd32cf59d1b7d88bcc53918d88..548b0a22e1c1c46f1b9fc4d3a4b99a2d5ad542c0 100644
GIT binary patch
[binary delta payload omitted]

diff --git a/torchvision/models/detection/retinanet.py b/torchvision/models/detection/retinanet.py
index b5b3ea95077..d0081dd7751 100644
--- a/torchvision/models/detection/retinanet.py
+++ b/torchvision/models/detection/retinanet.py
@@ -447,17 +447,21 @@ def postprocess_detections(self, head_outputs, anchors, image_shapes):
                                                                anchors_per_level[anchor_idxs])
                 boxes_per_level = box_ops.clip_boxes_to_image(boxes_per_level, image_shape)
 
-                # non-maximum suppression
-                keep = box_ops.batched_nms(boxes_per_level, scores_per_level, labels_per_level, self.nms_thresh)
+                image_boxes.append(boxes_per_level)
+                image_scores.append(scores_per_level)
+                image_labels.append(labels_per_level)
 
-                image_boxes.append(boxes_per_level[keep])
-                image_scores.append(scores_per_level[keep])
-                image_labels.append(labels_per_level[keep])
+            image_boxes = torch.cat(image_boxes, dim=0)
+            image_scores = torch.cat(image_scores, dim=0)
+            image_labels = torch.cat(image_labels, dim=0)
+
+            # non-maximum suppression
+            keep = box_ops.batched_nms(image_boxes, image_scores, image_labels, self.nms_thresh)
 
             detections.append({
-                'boxes': torch.cat(image_boxes, dim=0),
-                'scores': torch.cat(image_scores, dim=0),
-                'labels': torch.cat(image_labels, dim=0),
+                'boxes': image_boxes[keep],
+                'scores': image_scores[keep],
+                'labels': image_labels[keep],
             })
 
         return detections
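The motivation for moving NMS after the level loop: the same object is often detected at two adjacent pyramid levels, and per-level NMS can never suppress such cross-level near-duplicates. A toy illustration with hand-picked boxes:

    import torch
    from torchvision.ops import batched_nms

    # near-identical detections of one object from two different levels
    boxes = torch.tensor([[10., 10., 50., 50.],    # level 1, score 0.9
                          [11., 11., 51., 51.]])   # level 2, score 0.8
    scores = torch.tensor([0.9, 0.8])
    labels = torch.tensor([0, 0])

    keep = batched_nms(boxes, scores, labels, iou_threshold=0.5)
    print(keep)  # tensor([0]) -- the cross-level duplicate is dropped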
From 5d58d1af7a046e96a7d430d12efddf11f5f80367 Mon Sep 17 00:00:00 2001
From: Vasileios Vryniotis
Date: Mon, 19 Oct 2020 10:18:49 +0100
Subject: [PATCH 5/8] Add extra parameter for limiting detections before and
 after nms.

---
 torchvision/models/detection/retinanet.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/torchvision/models/detection/retinanet.py b/torchvision/models/detection/retinanet.py
index d0081dd7751..b0086d0e2ea 100644
--- a/torchvision/models/detection/retinanet.py
+++ b/torchvision/models/detection/retinanet.py
@@ -291,6 +291,7 @@ class RetinaNet(nn.Module):
             considered as positive during training.
         bg_iou_thresh (float): maximum IoU between the anchor and the GT box so that they can be
             considered as negative during training.
+        topk_candidates (int): Number of best detections to keep before NMS.
 
     Example:
 
@@ -338,8 +339,9 @@ def __init__(self, backbone, num_classes,
                  proposal_matcher=None,
                  score_thresh=0.05,
                  nms_thresh=0.5,
-                 detections_per_img=300,
-                 fg_iou_thresh=0.5, bg_iou_thresh=0.4):
+                 detections_per_img=100,
+                 fg_iou_thresh=0.5, bg_iou_thresh=0.4,
+                 topk_candidates=1000):
         super().__init__()
 
         if not hasattr(backbone, "out_channels"):
@@ -382,6 +384,7 @@ def __init__(self, backbone, num_classes,
         self.score_thresh = score_thresh
         self.nms_thresh = nms_thresh
         self.detections_per_img = detections_per_img
+        self.topk_candidates = topk_candidates
 
         # used only on torchscript mode
         self._has_warned = False
@@ -436,7 +439,7 @@ def postprocess_detections(self, head_outputs, anchors, image_shapes):
                 topk_idxs = torch.where(keep_idxs)[0]
 
                 # keep only topk scoring predictions
-                num_topk = min(self.detections_per_img, topk_idxs.size(0))
+                num_topk = min(self.topk_candidates, topk_idxs.size(0))
                 scores_per_level, idxs = scores_per_level.topk(num_topk)
                 topk_idxs = topk_idxs[idxs]
 
@@ -457,6 +460,7 @@ def postprocess_detections(self, head_outputs, anchors, image_shapes):
 
             # non-maximum suppression
             keep = box_ops.batched_nms(image_boxes, image_scores, image_labels, self.nms_thresh)
+            keep = keep[:self.detections_per_img]
 
             detections.append({
                 'boxes': image_boxes[keep],
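The two limits act at different stages: topk_candidates caps how many boxes per feature level are even decoded and fed to NMS, while detections_per_img caps what survives NMS per image. A toy sketch with tiny values (the patch's defaults are 1000 and, after PATCH 6/8, 300):

    import torch

    topk_candidates, detections_per_img = 4, 2

    scores_per_level = torch.tensor([0.9, 0.1, 0.8, 0.7, 0.6, 0.5])
    # pre-NMS cap, applied per level: bounds the decode/NMS workload
    num_topk = min(topk_candidates, scores_per_level.size(0))
    scores_per_level, idxs = scores_per_level.topk(num_topk)

    # ... decode boxes, concatenate levels, batched_nms -> keep ...
    keep = torch.tensor([0, 2, 3])           # stand-in for an NMS result
    keep = keep[:detections_per_img]         # post-NMS cap, applied per image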
From b639ec01ac9fcd46fb33a360c289f234b591b12c Mon Sep 17 00:00:00 2001
From: Vasileios Vryniotis
Date: Mon, 19 Oct 2020 15:23:10 +0100
Subject: [PATCH 6/8] Restoring default threshold.

---
 torchvision/models/detection/retinanet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchvision/models/detection/retinanet.py b/torchvision/models/detection/retinanet.py
index b0086d0e2ea..94482b26baf 100644
--- a/torchvision/models/detection/retinanet.py
+++ b/torchvision/models/detection/retinanet.py
@@ -339,7 +339,7 @@ def __init__(self, backbone, num_classes,
                  proposal_matcher=None,
                  score_thresh=0.05,
                  nms_thresh=0.5,
-                 detections_per_img=100,
+                 detections_per_img=300,
                  fg_iou_thresh=0.5, bg_iou_thresh=0.4,
                  topk_candidates=1000):
         super().__init__()
From 14e39d2eab1bcbf09042f1d2d7fe2ae035fb9273 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Mon, 19 Oct 2020 21:40:56 +0100
Subject: [PATCH 7/8] Apply suggestions from code review

Co-authored-by: Francisco Massa
---
 torchvision/models/detection/retinanet.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/torchvision/models/detection/retinanet.py b/torchvision/models/detection/retinanet.py
index 94482b26baf..72de91d423b 100644
--- a/torchvision/models/detection/retinanet.py
+++ b/torchvision/models/detection/retinanet.py
@@ -558,9 +558,8 @@ def forward(self, images, targets=None):
             # split outputs per level
             split_head_outputs: Dict[str, List[Tensor]] = {}
             for k in head_outputs:
-                split_head_outputs[k] = [x.permute(1, 0, 2) for x in
-                                         head_outputs[k].permute(1, 0, 2).split_with_sizes(feature_sizes_per_level)]
-            split_anchors = [list(a.split_with_sizes(feature_sizes_per_level)) for a in anchors]
+                split_head_outputs[k] = list(head_outputs[k].split(feature_sizes_per_level, dim=1))
+            split_anchors = [list(a.split(feature_sizes_per_level)) for a in anchors]
 
             # compute the detections
             detections = self.postprocess_detections(split_head_outputs, split_anchors, images.image_sizes)

From 6266e1cf6fe8a24c0ed4c49af7cbe9f0f8ef8e55 Mon Sep 17 00:00:00 2001
From: Vasileios Vryniotis
Date: Mon, 19 Oct 2020 21:48:10 +0100
Subject: [PATCH 8/8] Renaming variable.

---
 torchvision/models/detection/retinanet.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/torchvision/models/detection/retinanet.py b/torchvision/models/detection/retinanet.py
index 72de91d423b..d128ecb5699 100644
--- a/torchvision/models/detection/retinanet.py
+++ b/torchvision/models/detection/retinanet.py
@@ -547,19 +547,19 @@ def forward(self, images, targets=None):
             losses = self.compute_loss(targets, head_outputs, anchors)
         else:
             # recover level sizes
-            feature_sizes_per_level = [x.size(2) * x.size(3) for x in features]
+            num_anchors_per_level = [x.size(2) * x.size(3) for x in features]
             HW = 0
-            for v in feature_sizes_per_level:
+            for v in num_anchors_per_level:
                 HW += v
             HWA = head_outputs['cls_logits'].size(1)
             A = HWA // HW
-            feature_sizes_per_level = [hw * A for hw in feature_sizes_per_level]
+            num_anchors_per_level = [hw * A for hw in num_anchors_per_level]
 
             # split outputs per level
             split_head_outputs: Dict[str, List[Tensor]] = {}
             for k in head_outputs:
-                split_head_outputs[k] = list(head_outputs[k].split(feature_sizes_per_level, dim=1))
-            split_anchors = [list(a.split(feature_sizes_per_level)) for a in anchors]
+                split_head_outputs[k] = list(head_outputs[k].split(num_anchors_per_level, dim=1))
+            split_anchors = [list(a.split(num_anchors_per_level)) for a in anchors]
 
             # compute the detections
             detections = self.postprocess_detections(split_head_outputs, split_anchors, images.image_sizes)
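The review simplification adopted in PATCH 7/8 rests on Tensor.split accepting a list of chunk sizes plus a dim argument, which makes the permute round-trip unnecessary. A quick equivalence check (sizes are made up):

    import torch

    x = torch.rand(2, 20, 4)                 # (N, HWA, K)
    sizes = [16, 4]                          # anchors per level

    # original formulation: move dim 1 to the front, split, move it back
    a = [t.permute(1, 0, 2) for t in x.permute(1, 0, 2).split_with_sizes(sizes)]
    # reviewed formulation: split directly along dim=1
    b = list(x.split(sizes, dim=1))

    assert all(torch.equal(u, v) for u, v in zip(a, b))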