From a3491d289373367036aa90dfdad0e96ec1767086 Mon Sep 17 00:00:00 2001
From: Vasileios Vryniotis
Date: Fri, 16 Oct 2020 19:35:20 +0100
Subject: [PATCH 1/8] Vectorize operations across all feature levels.

---
 .gitignore                                    |  1 +
 ...ter.test_retinanet_resnet50_fpn_expect.pkl | Bin 1357 -> 1357 bytes
 torchvision/models/detection/retinanet.py     | 84 ++++++------------
 3 files changed, 30 insertions(+), 55 deletions(-)

diff --git a/.gitignore b/.gitignore
index 4ed0749da06..c332dda4f4a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,5 +22,6 @@ htmlcov
 gen.yml
 .mypy_cache
 .vscode/
+.idea/
 *.orig
 *-checkpoint.ipynb
\ No newline at end of file

diff --git a/test/expect/ModelTester.test_retinanet_resnet50_fpn_expect.pkl b/test/expect/ModelTester.test_retinanet_resnet50_fpn_expect.pkl
index 5b5079f20f882e18dccadc790c6cab0c5cb88cb2..59cae788ac44a4766df5bd387a1daeb6947d2381 100644
GIT binary patch
[binary delta payload omitted]

diff --git a/torchvision/models/detection/retinanet.py b/torchvision/models/detection/retinanet.py
index 770c5dcb1b0..32ecf687d1b 100644
--- a/torchvision/models/detection/retinanet.py
+++ b/torchvision/models/detection/retinanet.py
@@ -410,74 +410,48 @@ def compute_loss(self, targets, head_outputs, anchors):
 
     def postprocess_detections(self, head_outputs, anchors, image_shapes):
         # type: (Dict[str, Tensor], List[Tensor], List[Tuple[int, int]]) -> List[Dict[str, Tensor]]
         # TODO: Merge this with roi_heads.RoIHeads.postprocess_detections ?
-
         class_logits = head_outputs.pop('cls_logits')
         box_regression = head_outputs.pop('bbox_regression')
         other_outputs = head_outputs
 
-        device = class_logits.device
         num_classes = class_logits.shape[-1]
 
         scores = torch.sigmoid(class_logits)
 
-        # create labels for each score
-        labels = torch.arange(num_classes, device=device)
-        labels = labels.view(1, -1).expand_as(scores)
-
         detections = torch.jit.annotate(List[Dict[str, Tensor]], [])
 
-        for index, (box_regression_per_image, scores_per_image, labels_per_image, anchors_per_image, image_shape) in \
-                enumerate(zip(box_regression, scores, labels, anchors, image_shapes)):
+        for index, (box_regression_per_image, scores_per_image, anchors_per_image, image_shape) in \
+                enumerate(zip(box_regression, scores, anchors, image_shapes)):
+            # remove low scoring boxes
+            scores_per_image = scores_per_image.flatten()
+            keep_idxs = scores_per_image > self.score_thresh
+            scores_per_image = scores_per_image[keep_idxs]
+            topk_idxs = torch.where(keep_idxs)[0]
+
+            # keep only topk scoring predictions
+            num_topk = min(self.detections_per_img, topk_idxs.size(0))
+            scores_per_image, idxs = scores_per_image.topk(num_topk)
+            topk_idxs = topk_idxs[idxs]
 
-            boxes_per_image = self.box_coder.decode_single(box_regression_per_image, anchors_per_image)
+            anchor_idxs = topk_idxs // num_classes
+            labels_per_image = topk_idxs % num_classes
+
+            boxes_per_image = self.box_coder.decode_single(box_regression_per_image[anchor_idxs],
+                                                           anchors_per_image[anchor_idxs])
             boxes_per_image = box_ops.clip_boxes_to_image(boxes_per_image, image_shape)
 
-            other_outputs_per_image = [(k, v[index]) for k, v in other_outputs.items()]
-
-            image_boxes = []
-            image_scores = []
-            image_labels = []
-            image_other_outputs = torch.jit.annotate(Dict[str, List[Tensor]], {})
-
-            for class_index in range(num_classes):
-                # remove low scoring boxes
-                inds = torch.gt(scores_per_image[:, class_index], self.score_thresh)
-                boxes_per_class, scores_per_class, labels_per_class = \
-                    boxes_per_image[inds], scores_per_image[inds, class_index], labels_per_image[inds, class_index]
-                other_outputs_per_class = [(k, v[inds]) for k, v in other_outputs_per_image]
-
-                # remove empty boxes
-                keep = box_ops.remove_small_boxes(boxes_per_class, min_size=1e-2)
-                boxes_per_class, scores_per_class, labels_per_class = \
-                    boxes_per_class[keep], scores_per_class[keep], labels_per_class[keep]
-                other_outputs_per_class = [(k, v[keep]) for k, v in other_outputs_per_class]
-
-                # non-maximum suppression, independently done per class
-                keep = box_ops.nms(boxes_per_class, scores_per_class, self.nms_thresh)
-
-                # keep only topk scoring predictions
-                keep = keep[:self.detections_per_img]
-                boxes_per_class, scores_per_class, labels_per_class = \
-                    boxes_per_class[keep], scores_per_class[keep], labels_per_class[keep]
-                other_outputs_per_class = [(k, v[keep]) for k, v in other_outputs_per_class]
-
-                image_boxes.append(boxes_per_class)
-                image_scores.append(scores_per_class)
-                image_labels.append(labels_per_class)
-
-                for k, v in other_outputs_per_class:
-                    if k not in image_other_outputs:
-                        image_other_outputs[k] = []
-                    image_other_outputs[k].append(v)
-
-            detections.append({
-                'boxes': torch.cat(image_boxes, dim=0),
-                'scores': torch.cat(image_scores, dim=0),
-                'labels': torch.cat(image_labels, dim=0),
-            })
-
-            for k, v in image_other_outputs.items():
-                detections[-1].update({k: torch.cat(v, dim=0)})
+            # non-maximum suppression
+            keep = box_ops.batched_nms(boxes_per_image, scores_per_image, labels_per_image, self.nms_thresh)
+
+            det = {
+                'boxes': boxes_per_image[keep],
+                'scores': scores_per_image[keep],
+                'labels': labels_per_image[keep],
+            }
+            for k, v in other_outputs.items():
+                det[k] = v[index][keep]
+
+            detections.append(det)
 
         return detections
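Note on the vectorized lookup introduced above: flattening the per-image (num_anchors, num_classes) score matrix row-major means a flat index i corresponds to anchor i // num_classes and class i % num_classes, which is what lets a single topk call replace the per-class loop. A minimal sketch of the arithmetic (shapes are illustrative, not taken from the patch):

    import torch

    num_anchors, num_classes = 6, 3
    scores = torch.rand(num_anchors, num_classes).flatten()

    keep_idxs = scores > 0.05                  # boolean mask over flat scores
    topk_idxs = torch.where(keep_idxs)[0]      # flat indices that survive

    anchor_idxs = topk_idxs // num_classes     # recover the anchor index
    labels = topk_idxs % num_classes           # recover the class index

    # sanity check: (anchor, label) maps back to the flat index
    assert torch.equal(anchor_idxs * num_classes + labels, topk_idxs)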
From 1001612b5681aa9855ca28843798b2c8b6b9e0fd Mon Sep 17 00:00:00 2001
From: Vasileios Vryniotis
Date: Sun, 18 Oct 2020 13:03:10 +0100
Subject: [PATCH 2/8] Remove unnecessary other_outputs variable.

---
 torchvision/models/detection/retinanet.py | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/torchvision/models/detection/retinanet.py b/torchvision/models/detection/retinanet.py
index 32ecf687d1b..d316575a8b1 100644
--- a/torchvision/models/detection/retinanet.py
+++ b/torchvision/models/detection/retinanet.py
@@ -409,10 +409,9 @@ def compute_loss(self, targets, head_outputs, anchors):
 
     def postprocess_detections(self, head_outputs, anchors, image_shapes):
         # type: (Dict[str, Tensor], List[Tensor], List[Tuple[int, int]]) -> List[Dict[str, Tensor]]
-        # TODO: Merge this with roi_heads.RoIHeads.postprocess_detections ?
-        class_logits = head_outputs.pop('cls_logits')
-        box_regression = head_outputs.pop('bbox_regression')
-        other_outputs = head_outputs
+        # TODO: confirm that RetinaNet can't have other outputs like masks
+        class_logits = head_outputs['cls_logits']
+        box_regression = head_outputs['bbox_regression']
 
         num_classes = class_logits.shape[-1]
 
@@ -443,15 +442,11 @@ def postprocess_detections(self, head_outputs, anchors, image_shapes):
 
             # non-maximum suppression
             keep = box_ops.batched_nms(boxes_per_image, scores_per_image, labels_per_image, self.nms_thresh)
 
-            det = {
+            detections.append({
                 'boxes': boxes_per_image[keep],
                 'scores': scores_per_image[keep],
                 'labels': labels_per_image[keep],
-            }
-            for k, v in other_outputs.items():
-                det[k] = v[index][keep]
-
-            detections.append(det)
+            })
 
         return detections
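For context, box_ops.batched_nms (used above) suppresses boxes per class without a Python loop over classes. A sketch of the usual coordinate-offset trick behind it, assuming the torchvision semantics but not reproducing its exact implementation:

    import torch
    from torchvision.ops import nms

    def batched_nms_sketch(boxes, scores, labels, iou_thresh):
        # Shift each box by label * (max coordinate + 1): boxes of different
        # classes can then never overlap, so a single class-agnostic NMS pass
        # only suppresses within a class.
        if boxes.numel() == 0:
            return torch.empty((0,), dtype=torch.int64, device=boxes.device)
        offsets = labels.to(boxes) * (boxes.max() + 1)
        return nms(boxes + offsets[:, None], scores, iou_thresh)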
From 42d2661468a435981c063533eccddc0125a51630 Mon Sep 17 00:00:00 2001
From: Vasileios Vryniotis
Date: Sun, 18 Oct 2020 13:52:43 +0100
Subject: [PATCH 3/8] Split per feature level.

---
 ...ter.test_retinanet_resnet50_fpn_expect.pkl | Bin 1357 -> 1357 bytes
 torchvision/models/detection/retinanet.py     | 82 ++++++++++++------
 2 files changed, 55 insertions(+), 27 deletions(-)

diff --git a/test/expect/ModelTester.test_retinanet_resnet50_fpn_expect.pkl b/test/expect/ModelTester.test_retinanet_resnet50_fpn_expect.pkl
index 59cae788ac44a4766df5bd387a1daeb6947d2381..843a1d27d8f7a9cd32cf59d1b7d88bcc53918d88 100644
GIT binary patch
[binary delta payload omitted]

diff --git a/torchvision/models/detection/retinanet.py b/torchvision/models/detection/retinanet.py
index d316575a8b1..b5b3ea95077 100644
--- a/torchvision/models/detection/retinanet.py
+++ b/torchvision/models/detection/retinanet.py
@@ -409,43 +409,55 @@ def compute_loss(self, targets, head_outputs, anchors):
 
     def postprocess_detections(self, head_outputs, anchors, image_shapes):
-        # type: (Dict[str, Tensor], List[Tensor], List[Tuple[int, int]]) -> List[Dict[str, Tensor]]
-        # TODO: confirm that RetinaNet can't have other outputs like masks
+        # type: (Dict[str, List[Tensor]], List[List[Tensor]], List[Tuple[int, int]]) -> List[Dict[str, Tensor]]
         class_logits = head_outputs['cls_logits']
         box_regression = head_outputs['bbox_regression']
 
-        num_classes = class_logits.shape[-1]
-
-        scores = torch.sigmoid(class_logits)
+        num_images = len(image_shapes)
 
         detections = torch.jit.annotate(List[Dict[str, Tensor]], [])
 
-        for index, (box_regression_per_image, scores_per_image, anchors_per_image, image_shape) in \
-                enumerate(zip(box_regression, scores, anchors, image_shapes)):
-            # remove low scoring boxes
-            scores_per_image = scores_per_image.flatten()
-            keep_idxs = scores_per_image > self.score_thresh
-            scores_per_image = scores_per_image[keep_idxs]
-            topk_idxs = torch.where(keep_idxs)[0]
+        for index in range(num_images):
+            box_regression_per_image = [br[index] for br in box_regression]
+            logits_per_image = [cl[index] for cl in class_logits]
+            anchors_per_image, image_shape = anchors[index], image_shapes[index]
+
+            image_boxes = []
+            image_scores = []
+            image_labels = []
+
+            for box_regression_per_level, logits_per_level, anchors_per_level in \
+                    zip(box_regression_per_image, logits_per_image, anchors_per_image):
+                num_classes = logits_per_level.shape[-1]
 
-            # keep only topk scoring predictions
-            num_topk = min(self.detections_per_img, topk_idxs.size(0))
-            scores_per_image, idxs = scores_per_image.topk(num_topk)
-            topk_idxs = topk_idxs[idxs]
+                # remove low scoring boxes
+                scores_per_level = torch.sigmoid(logits_per_level).flatten()
+                keep_idxs = scores_per_level > self.score_thresh
+                scores_per_level = scores_per_level[keep_idxs]
+                topk_idxs = torch.where(keep_idxs)[0]
 
-            anchor_idxs = topk_idxs // num_classes
-            labels_per_image = topk_idxs % num_classes
+                # keep only topk scoring predictions
+                num_topk = min(self.detections_per_img, topk_idxs.size(0))
+                scores_per_level, idxs = scores_per_level.topk(num_topk)
+                topk_idxs = topk_idxs[idxs]
 
-            boxes_per_image = self.box_coder.decode_single(box_regression_per_image[anchor_idxs],
-                                                           anchors_per_image[anchor_idxs])
-            boxes_per_image = box_ops.clip_boxes_to_image(boxes_per_image, image_shape)
+                anchor_idxs = topk_idxs // num_classes
+                labels_per_level = topk_idxs % num_classes
 
-            # non-maximum suppression
-            keep = box_ops.batched_nms(boxes_per_image, scores_per_image, labels_per_image, self.nms_thresh)
+                boxes_per_level = self.box_coder.decode_single(box_regression_per_level[anchor_idxs],
+                                                               anchors_per_level[anchor_idxs])
+                boxes_per_level = box_ops.clip_boxes_to_image(boxes_per_level, image_shape)
+
+                # non-maximum suppression
+                keep = box_ops.batched_nms(boxes_per_level, scores_per_level, labels_per_level, self.nms_thresh)
+
+                image_boxes.append(boxes_per_level[keep])
+                image_scores.append(scores_per_level[keep])
+                image_labels.append(labels_per_level[keep])
 
             detections.append({
-                'boxes': boxes_per_image[keep],
-                'scores': scores_per_image[keep],
-                'labels': labels_per_image[keep],
+                'boxes': torch.cat(image_boxes, dim=0),
+                'scores': torch.cat(image_scores, dim=0),
+                'labels': torch.cat(image_labels, dim=0),
             })
 
         return detections
@@ -526,8 +538,24 @@ def forward(self, images, targets=None):
             # compute the losses
             losses = self.compute_loss(targets, head_outputs, anchors)
         else:
+            # recover level sizes
+            feature_sizes_per_level = [x.size(2) * x.size(3) for x in features]
+            HW = 0
+            for v in feature_sizes_per_level:
+                HW += v
+            HWA = head_outputs['cls_logits'].size(1)
+            A = HWA // HW
+            feature_sizes_per_level = [hw * A for hw in feature_sizes_per_level]
+
+            # split outputs per level
+            split_head_outputs: Dict[str, List[Tensor]] = {}
+            for k in head_outputs:
+                split_head_outputs[k] = [x.permute(1, 0, 2) for x in
+                                         head_outputs[k].permute(1, 0, 2).split_with_sizes(feature_sizes_per_level)]
+            split_anchors = [list(a.split_with_sizes(feature_sizes_per_level)) for a in anchors]
+
             # compute the detections
-            detections = self.postprocess_detections(head_outputs, anchors, images.image_sizes)
+            detections = self.postprocess_detections(split_head_outputs, split_anchors, images.image_sizes)
             detections = self.transform.postprocess(detections, images.image_sizes, original_image_sizes)
 
         if torch.jit.is_scripting():
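The forward() change above has to undo the head's flattening: each head output is shaped (N, HWA, K), where HW is the total number of feature-map cells across levels and A the number of anchors per cell, so A is recoverable as HWA // HW. A small sketch of the splitting logic with made-up sizes:

    import torch

    N, A, K = 2, 9, 91
    hw_per_level = [4 * 4, 2 * 2]              # cells per feature level
    HW = sum(hw_per_level)
    cls_logits = torch.rand(N, HW * A, K)      # (N, HWA, K), as in the head

    A_recovered = cls_logits.size(1) // HW
    sizes = [hw * A_recovered for hw in hw_per_level]

    # same permute/split_with_sizes dance as the patch: split dim 1 by
    # moving it to the front and back again
    per_level = [x.permute(1, 0, 2) for x in
                 cls_logits.permute(1, 0, 2).split_with_sizes(sizes)]
    assert [x.size(1) for x in per_level] == sizes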
From 61e0bf6a4db266c241b22e874cad8757f7bc31bb Mon Sep 17 00:00:00 2001
From: Vasileios Vryniotis
Date: Sun, 18 Oct 2020 15:09:07 +0100
Subject: [PATCH 4/8] Perform batched_nms across feature levels.

---
 ...ter.test_retinanet_resnet50_fpn_expect.pkl | Bin 1357 -> 1357 bytes
 torchvision/models/detection/retinanet.py     | 20 +++++++++++-------
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/test/expect/ModelTester.test_retinanet_resnet50_fpn_expect.pkl b/test/expect/ModelTester.test_retinanet_resnet50_fpn_expect.pkl
index 843a1d27d8f7a9cd32cf59d1b7d88bcc53918d88..548b0a22e1c1c46f1b9fc4d3a4b99a2d5ad542c0 100644
GIT binary patch
[binary delta payload omitted]

diff --git a/torchvision/models/detection/retinanet.py b/torchvision/models/detection/retinanet.py
index b5b3ea95077..d0081dd7751 100644
--- a/torchvision/models/detection/retinanet.py
+++ b/torchvision/models/detection/retinanet.py
@@ -447,17 +447,21 @@ def postprocess_detections(self, head_outputs, anchors, image_shapes):
                                                                anchors_per_level[anchor_idxs])
                 boxes_per_level = box_ops.clip_boxes_to_image(boxes_per_level, image_shape)
 
-                # non-maximum suppression
-                keep = box_ops.batched_nms(boxes_per_level, scores_per_level, labels_per_level, self.nms_thresh)
+                image_boxes.append(boxes_per_level)
+                image_scores.append(scores_per_level)
+                image_labels.append(labels_per_level)
 
-                image_boxes.append(boxes_per_level[keep])
-                image_scores.append(scores_per_level[keep])
-                image_labels.append(labels_per_level[keep])
+            image_boxes = torch.cat(image_boxes, dim=0)
+            image_scores = torch.cat(image_scores, dim=0)
+            image_labels = torch.cat(image_labels, dim=0)
+
+            # non-maximum suppression
+            keep = box_ops.batched_nms(image_boxes, image_scores, image_labels, self.nms_thresh)
 
             detections.append({
-                'boxes': torch.cat(image_boxes, dim=0),
-                'scores': torch.cat(image_scores, dim=0),
-                'labels': torch.cat(image_labels, dim=0),
+                'boxes': image_boxes[keep],
+                'scores': image_scores[keep],
+                'labels': image_labels[keep],
             })
 
         return detections
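The motivation for moving NMS after the level loop: the same object is often detected at two adjacent pyramid levels, and per-level NMS can never suppress such cross-level near-duplicates. A toy illustration with hand-picked boxes:

    import torch
    from torchvision.ops import batched_nms

    # near-identical detections of one object from two different levels
    boxes = torch.tensor([[10., 10., 50., 50.],    # level 1, score 0.9
                          [11., 11., 51., 51.]])   # level 2, score 0.8
    scores = torch.tensor([0.9, 0.8])
    labels = torch.tensor([0, 0])

    keep = batched_nms(boxes, scores, labels, iou_threshold=0.5)
    print(keep)  # tensor([0]) -- the cross-level duplicate is dropped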
From 5d58d1af7a046e96a7d430d12efddf11f5f80367 Mon Sep 17 00:00:00 2001
From: Vasileios Vryniotis
Date: Mon, 19 Oct 2020 10:18:49 +0100
Subject: [PATCH 5/8] Add extra parameter for limiting detections before and
 after nms.

---
 torchvision/models/detection/retinanet.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/torchvision/models/detection/retinanet.py b/torchvision/models/detection/retinanet.py
index d0081dd7751..b0086d0e2ea 100644
--- a/torchvision/models/detection/retinanet.py
+++ b/torchvision/models/detection/retinanet.py
@@ -291,6 +291,7 @@ class RetinaNet(nn.Module):
             considered as positive during training.
         bg_iou_thresh (float): maximum IoU between the anchor and the GT box so that they can be
             considered as negative during training.
+        topk_candidates (int): Number of best detections to keep before NMS.
 
     Example:
 
@@ -338,8 +339,9 @@ def __init__(self, backbone, num_classes,
                  proposal_matcher=None,
                  score_thresh=0.05,
                  nms_thresh=0.5,
-                 detections_per_img=300,
-                 fg_iou_thresh=0.5, bg_iou_thresh=0.4):
+                 detections_per_img=100,
+                 fg_iou_thresh=0.5, bg_iou_thresh=0.4,
+                 topk_candidates=1000):
         super().__init__()
 
         if not hasattr(backbone, "out_channels"):
@@ -382,6 +384,7 @@ def __init__(self, backbone, num_classes,
         self.score_thresh = score_thresh
         self.nms_thresh = nms_thresh
         self.detections_per_img = detections_per_img
+        self.topk_candidates = topk_candidates
 
         # used only on torchscript mode
         self._has_warned = False
@@ -436,7 +439,7 @@ def postprocess_detections(self, head_outputs, anchors, image_shapes):
                 topk_idxs = torch.where(keep_idxs)[0]
 
                 # keep only topk scoring predictions
-                num_topk = min(self.detections_per_img, topk_idxs.size(0))
+                num_topk = min(self.topk_candidates, topk_idxs.size(0))
                 scores_per_level, idxs = scores_per_level.topk(num_topk)
                 topk_idxs = topk_idxs[idxs]
 
@@ -457,6 +460,7 @@ def postprocess_detections(self, head_outputs, anchors, image_shapes):
 
             # non-maximum suppression
             keep = box_ops.batched_nms(image_boxes, image_scores, image_labels, self.nms_thresh)
+            keep = keep[:self.detections_per_img]
 
             detections.append({
                 'boxes': image_boxes[keep],
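The two limits act at different stages: topk_candidates caps how many boxes per feature level are even decoded and fed to NMS, while detections_per_img caps what survives NMS per image. A toy sketch with tiny values (the patch's defaults are 1000 and, after PATCH 6/8, 300):

    import torch

    topk_candidates, detections_per_img = 4, 2

    scores_per_level = torch.tensor([0.9, 0.1, 0.8, 0.7, 0.6, 0.5])
    # pre-NMS cap, applied per level: bounds the decode/NMS workload
    num_topk = min(topk_candidates, scores_per_level.size(0))
    scores_per_level, idxs = scores_per_level.topk(num_topk)

    # ... decode boxes, concatenate levels, batched_nms -> keep ...
    keep = torch.tensor([0, 2, 3])           # stand-in for an NMS result
    keep = keep[:detections_per_img]         # post-NMS cap, applied per image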
From b639ec01ac9fcd46fb33a360c289f234b591b12c Mon Sep 17 00:00:00 2001
From: Vasileios Vryniotis
Date: Mon, 19 Oct 2020 15:23:10 +0100
Subject: [PATCH 6/8] Restoring default threshold.

---
 torchvision/models/detection/retinanet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchvision/models/detection/retinanet.py b/torchvision/models/detection/retinanet.py
index b0086d0e2ea..94482b26baf 100644
--- a/torchvision/models/detection/retinanet.py
+++ b/torchvision/models/detection/retinanet.py
@@ -339,7 +339,7 @@ def __init__(self, backbone, num_classes,
                  proposal_matcher=None,
                  score_thresh=0.05,
                  nms_thresh=0.5,
-                 detections_per_img=100,
+                 detections_per_img=300,
                  fg_iou_thresh=0.5, bg_iou_thresh=0.4,
                  topk_candidates=1000):
         super().__init__()
From 14e39d2eab1bcbf09042f1d2d7fe2ae035fb9273 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Mon, 19 Oct 2020 21:40:56 +0100
Subject: [PATCH 7/8] Apply suggestions from code review

Co-authored-by: Francisco Massa
---
 torchvision/models/detection/retinanet.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/torchvision/models/detection/retinanet.py b/torchvision/models/detection/retinanet.py
index 94482b26baf..72de91d423b 100644
--- a/torchvision/models/detection/retinanet.py
+++ b/torchvision/models/detection/retinanet.py
@@ -558,9 +558,8 @@ def forward(self, images, targets=None):
             # split outputs per level
             split_head_outputs: Dict[str, List[Tensor]] = {}
             for k in head_outputs:
-                split_head_outputs[k] = [x.permute(1, 0, 2) for x in
-                                         head_outputs[k].permute(1, 0, 2).split_with_sizes(feature_sizes_per_level)]
-            split_anchors = [list(a.split_with_sizes(feature_sizes_per_level)) for a in anchors]
+                split_head_outputs[k] = list(head_outputs[k].split(feature_sizes_per_level, dim=1))
+            split_anchors = [list(a.split(feature_sizes_per_level)) for a in anchors]
 
             # compute the detections
             detections = self.postprocess_detections(split_head_outputs, split_anchors, images.image_sizes)

From 6266e1cf6fe8a24c0ed4c49af7cbe9f0f8ef8e55 Mon Sep 17 00:00:00 2001
From: Vasileios Vryniotis
Date: Mon, 19 Oct 2020 21:48:10 +0100
Subject: [PATCH 8/8] Renaming variable.

---
 torchvision/models/detection/retinanet.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/torchvision/models/detection/retinanet.py b/torchvision/models/detection/retinanet.py
index 72de91d423b..d128ecb5699 100644
--- a/torchvision/models/detection/retinanet.py
+++ b/torchvision/models/detection/retinanet.py
@@ -547,19 +547,19 @@ def forward(self, images, targets=None):
             losses = self.compute_loss(targets, head_outputs, anchors)
         else:
             # recover level sizes
-            feature_sizes_per_level = [x.size(2) * x.size(3) for x in features]
+            num_anchors_per_level = [x.size(2) * x.size(3) for x in features]
             HW = 0
-            for v in feature_sizes_per_level:
+            for v in num_anchors_per_level:
                 HW += v
             HWA = head_outputs['cls_logits'].size(1)
             A = HWA // HW
-            feature_sizes_per_level = [hw * A for hw in feature_sizes_per_level]
+            num_anchors_per_level = [hw * A for hw in num_anchors_per_level]
 
             # split outputs per level
             split_head_outputs: Dict[str, List[Tensor]] = {}
             for k in head_outputs:
-                split_head_outputs[k] = list(head_outputs[k].split(feature_sizes_per_level, dim=1))
-            split_anchors = [list(a.split(feature_sizes_per_level)) for a in anchors]
+                split_head_outputs[k] = list(head_outputs[k].split(num_anchors_per_level, dim=1))
+            split_anchors = [list(a.split(num_anchors_per_level)) for a in anchors]
 
             # compute the detections
             detections = self.postprocess_detections(split_head_outputs, split_anchors, images.image_sizes)
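The review simplification adopted in PATCH 7/8 rests on Tensor.split accepting a list of chunk sizes plus a dim argument, which makes the permute round-trip unnecessary. A quick equivalence check (sizes are made up):

    import torch

    x = torch.rand(2, 20, 4)                 # (N, HWA, K)
    sizes = [16, 4]                          # anchors per level

    # original formulation: move dim 1 to the front, split, move it back
    a = [t.permute(1, 0, 2) for t in x.permute(1, 0, 2).split_with_sizes(sizes)]
    # reviewed formulation: split directly along dim=1
    b = list(x.split(sizes, dim=1))

    assert all(torch.equal(u, v) for u, v in zip(a, b))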