Commit 0de780d

Output reshaped outputs from retinanet heads.
1 parent 003a9f8 commit 0de780d

File tree

1 file changed: +49 -42 lines

torchvision/models/detection/retinanet.py

@@ -42,9 +42,10 @@ def compute_loss(self, targets, head_outputs, anchors, matched_idxs):
         }

     def forward(self, x):
-        cls_logits = [self.classification_head(feature) for feature in x]
-        bbox_reg = [self.regression_head(feature) for feature in x]
-        return dict(cls_logits=cls_logits, bbox_reg=bbox_reg)
+        return {
+            'cls_logits': self.classification_head(x),
+            'bbox_regression': self.regression_head(x)
+        }


 def sigmoid_focal_loss(
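
For orientation, a hypothetical usage sketch (shapes assumed, not code from this commit): with this change each head consumes the whole list of feature maps, and the returned dict holds one tensor per output, concatenated over levels along the anchor dimension, instead of a per-level list.

import torch

# Hypothetical sketch; A (anchors per location) and K (classes) are assumed.
A, K = 9, 91
features = [torch.rand(2, 256, s, s) for s in (32, 16, 8)]  # dummy FPN levels
HWA = sum(f.shape[2] * f.shape[3] * A for f in features)    # anchors per image
print(HWA)  # 9 * (32*32 + 16*16 + 8*8) = 12096
# With a constructed RetinaNetHead `head`, one would then expect:
#   head(features)['cls_logits'].shape      == (2, HWA, K)
#   head(features)['bbox_regression'].shape == (2, HWA, 4)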
@@ -127,45 +128,48 @@ def __init__(self, in_channels, num_anchors, num_classes, prior_probability=0.01
     def compute_loss(self, targets, head_outputs, anchors, matched_idxs):
         loss = []

-        def permute_classification(tensor):
-            """ Permute classification output from (N, A * K, H, W) to (N, HWA, K). """
-            N, _, H, W = tensor.shape
-            tensor = tensor.view(N, -1, self.num_classes, H, W)
-            tensor = tensor.permute(0, 3, 4, 1, 2)
-            tensor = tensor.reshape(N, -1, self.num_classes)  # Size=(N, HWA, 4)
-            return tensor
-
-        predicted_classification = head_outputs['cls_logits']
-        predicted_classification = [permute_classification(cls) for cls in predicted_classification]
-        predicted_classification = torch.cat(predicted_classification, dim=1)
+        cls_logits = head_outputs['cls_logits']

-        for targets_per_image, predicted_classification_per_image, anchors_per_image, matched_idxs_per_image in zip(targets, predicted_classification, anchors, matched_idxs):
+        for targets_per_image, cls_logits_per_image, anchors_per_image, matched_idxs_per_image in zip(targets, cls_logits, anchors, matched_idxs):
             # determine only the foreground
             foreground_idxs_per_image = matched_idxs_per_image >= 0
             num_foreground = foreground_idxs_per_image.sum()

             # create the target classification
-            gt_classes_target = torch.zeros_like(predicted_classification_per_image)
+            gt_classes_target = torch.zeros_like(cls_logits_per_image)
             gt_classes_target[foreground_idxs_per_image, targets_per_image['labels'][matched_idxs_per_image[foreground_idxs_per_image]]] = 1

             # find indices for which anchors should be ignored
             valid_idxs_per_image = matched_idxs_per_image != det_utils.Matcher.BETWEEN_THRESHOLDS

             # compute the classification loss
-            loss.append(sigmoid_focal_loss_jit(
-                predicted_classification_per_image[valid_idxs_per_image],
+            loss.append(sigmoid_focal_loss(
+                cls_logits_per_image[valid_idxs_per_image],
                 gt_classes_target[valid_idxs_per_image],
                 reduction='sum',
             ) / max(1, num_foreground))

         return sum(loss) / len(loss)

     def forward(self, x):
-        x = F.relu(self.conv1(x))
-        x = F.relu(self.conv2(x))
-        x = F.relu(self.conv3(x))
-        x = F.relu(self.conv4(x))
-        return self.cls_logits(x)
+        all_cls_logits = []
+
+        for features in x:
+            cls_logits = F.relu(self.conv1(features))
+            cls_logits = F.relu(self.conv2(cls_logits))
+            cls_logits = F.relu(self.conv3(cls_logits))
+            cls_logits = F.relu(self.conv4(cls_logits))
+            cls_logits = self.cls_logits(cls_logits)
+
+            # Permute classification output from (N, A * K, H, W) to (N, HWA, K).
+            N, _, H, W = cls_logits.shape
+            cls_logits = cls_logits.view(N, -1, self.num_classes, H, W)
+            cls_logits = cls_logits.permute(0, 3, 4, 1, 2)
+            cls_logits = cls_logits.reshape(N, -1, self.num_classes)  # Size=(N, HWA, K)
+
+            all_cls_logits.append(cls_logits)
+
+        return torch.cat(all_cls_logits, dim=1)


 class RetinaNetRegressionHead(nn.Module):
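
As a standalone check (a sketch with assumed sizes, not part of the diff), the view/permute/reshape sequence inlined in forward() above can be exercised on a dummy tensor to confirm the (N, A * K, H, W) to (N, HWA, K) layout change:

import torch

# Sketch with made-up sizes: the conv output packs A anchors x K classes into
# the channel dimension; unpack to one row of K logits per location and anchor.
N, A, K, H, W = 2, 9, 91, 4, 4
t = torch.rand(N, A * K, H, W)      # shape of self.cls_logits(...) output
t = t.view(N, A, K, H, W)           # the diff writes view(N, -1, K, H, W)
t = t.permute(0, 3, 4, 1, 2)        # (N, H, W, A, K)
t = t.reshape(N, -1, K)             # (N, HWA, K)
assert t.shape == (N, H * W * A, K)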
@@ -194,19 +198,9 @@ def __init__(self, in_channels, num_anchors):
     def compute_loss(self, targets, head_outputs, anchors, matched_idxs):
         loss = []

-        def permute_bbox_reg(tensor):
-            """ Permute bbox regression output from (N, 4 * A, H, W) to (N, HWA, 4). """
-            N, _, H, W = tensor.shape
-            tensor = tensor.view(N, -1, 4, H, W)
-            tensor = tensor.permute(0, 3, 4, 1, 2)
-            tensor = tensor.reshape(N, -1, 4)  # Size=(N, HWA, 4)
-            return tensor
-
-        predicted_regression = head_outputs['bbox_reg']
-        predicted_regression = [permute_bbox_reg(reg) for reg in predicted_regression]
-        predicted_regression = torch.cat(predicted_regression, dim=1)
+        bbox_regression = head_outputs['bbox_regression']

-        for targets_per_image, predicted_regression_per_image, anchors_per_image, matched_idxs_per_image in zip(targets, predicted_regression, anchors, matched_idxs):
+        for targets_per_image, bbox_regression_per_image, anchors_per_image, matched_idxs_per_image in zip(targets, bbox_regression, anchors, matched_idxs):
             # get the targets corresponding GT for each proposal
             # NB: need to clamp the indices because we can have a single
             # GT in the image, and matched_idxs can be -2, which goes
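
Worth noting (an illustration with assumed shapes, not from the commit): because head_outputs['bbox_regression'] is now one batched (N, HWA, 4) tensor rather than a list of per-level maps, zip(targets, bbox_regression, ...) hands the loop one (HWA, 4) slice per image:

import torch

# Illustrative shapes only: iterating a batched tensor along dim 0 yields one
# per-image slice, so compute_loss needs no per-level bookkeeping anymore.
bbox_regression = torch.rand(2, 100, 4)  # pretend N=2 images, HWA=100 anchors
for bbox_regression_per_image in bbox_regression:
    assert bbox_regression_per_image.shape == (100, 4)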
@@ -219,23 +213,36 @@ def permute_bbox_reg(tensor):

             # select only the foreground boxes
             matched_gt_boxes_per_image = matched_gt_boxes_per_image[foreground_idxs_per_image, :]
-            predicted_regression_per_image = predicted_regression_per_image[foreground_idxs_per_image, :]
+            bbox_regression_per_image = bbox_regression_per_image[foreground_idxs_per_image, :]
             anchors_per_image = anchors_per_image[foreground_idxs_per_image, :]

             # compute the regression targets
             target_regression = self.box_coder.encode_single(matched_gt_boxes_per_image, anchors_per_image)

             # compute the loss
-            loss.append(torch.nn.SmoothL1Loss(reduction='sum')(predicted_regression_per_image, target_regression) / max(1, num_foreground))
+            loss.append(torch.nn.SmoothL1Loss(reduction='sum')(bbox_regression_per_image, target_regression) / max(1, num_foreground))

         return sum(loss) / max(1, len(loss))

     def forward(self, x):
-        x = F.relu(self.conv1(x))
-        x = F.relu(self.conv2(x))
-        x = F.relu(self.conv3(x))
-        x = F.relu(self.conv4(x))
-        return self.bbox_reg(x)
+        all_bbox_regression = []
+
+        for features in x:
+            bbox_regression = F.relu(self.conv1(features))
+            bbox_regression = F.relu(self.conv2(bbox_regression))
+            bbox_regression = F.relu(self.conv3(bbox_regression))
+            bbox_regression = F.relu(self.conv4(bbox_regression))
+            bbox_regression = self.bbox_reg(bbox_regression)
+
+            # Permute bbox regression output from (N, 4 * A, H, W) to (N, HWA, 4).
+            N, _, H, W = bbox_regression.shape
+            bbox_regression = bbox_regression.view(N, -1, 4, H, W)
+            bbox_regression = bbox_regression.permute(0, 3, 4, 1, 2)
+            bbox_regression = bbox_regression.reshape(N, -1, 4)  # Size=(N, HWA, 4)
+
+            all_bbox_regression.append(bbox_regression)
+
+        return torch.cat(all_bbox_regression, dim=1)


 class RetinaNet(nn.Module):
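
The regression head's reshape mirrors the classification one, with the K class logits replaced by 4 box deltas; a companion sketch with assumed sizes (not part of the diff):

import torch

# Sketch with made-up sizes: unpack (N, 4 * A, H, W) into one 4-vector of box
# deltas per spatial location and anchor.
N, A, H, W = 2, 9, 4, 4
t = torch.rand(N, 4 * A, H, W)      # shape of self.bbox_reg(...) output
t = t.view(N, A, 4, H, W)           # the diff writes view(N, -1, 4, H, W)
t = t.permute(0, 3, 4, 1, 2)        # (N, H, W, A, 4)
t = t.reshape(N, -1, 4)             # (N, HWA, 4)
assert t.shape == (N, H * W * A, 4)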
