Commit 003a9f8

Implement loss for retinanet heads.
1 parent afb1d58 commit 003a9f8

File tree

4 files changed: +129 -26 lines changed


torchvision/models/detection/_utils.py

Lines changed: 10 additions & 9 deletions
@@ -3,7 +3,7 @@
 import math
 
 import torch
-from torch.jit.annotations import List, Tuple
+from torch.jit.annotations import List, Tuple, Optional
 from torch import Tensor
 import torchvision
 
@@ -257,8 +257,7 @@ class Matcher(object):
     def __init__(self,
                  high_threshold,
                  low_threshold,
-                 allow_low_quality_matches=False,
-                 box_similarity=None):
+                 allow_low_quality_matches=False):
         # type: (float, float, bool)
         """
         Args:
@@ -280,22 +279,24 @@ def __init__(self,
         self.low_threshold = low_threshold
         self.allow_low_quality_matches = allow_low_quality_matches
 
-        if box_similarity is None:
-            box_similarity = box_ops.box_iou
-        self.box_similarity = box_similarity
+        # if box_similarity is None:
+        #     box_similarity = box_ops.box_iou
+        # self.box_similarity = box_similarity
 
     def __call__(self, gt_boxes, anchors_per_image):
         """
         Args:
-            match_quality_matrix (Tensor[float]): an MxN tensor, containing the
-            pairwise quality between M ground-truth elements and N predicted elements.
+            gt_boxes (Tensor[float]): an Mx4 tensor, containing M detections.
+
+            anchors_per_image (Tensor[float]): an Nx4 tensor, containing
+            the anchors for a specific image.
 
         Returns:
             matches (Tensor[int64]): an N tensor where N[i] is a matched gt in
             [0, M - 1] or a negative value indicating that prediction i could not
             be matched.
         """
-        match_quality_matrix = self.box_similarity(gt_boxes, anchors_per_image)
+        match_quality_matrix = box_ops.box_iou(gt_boxes, anchors_per_image)  # self.box_similarity(gt_boxes, anchors_per_image)
 
         if match_quality_matrix.numel() == 0:
            # empty targets or proposals not supported during training
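
Note: with the box_similarity hook commented out, Matcher.__call__ now always builds its match-quality matrix with box_ops.box_iou. A minimal sketch (not part of the commit) of what that MxN matrix looks like for toy boxes:

import torch
from torchvision.ops import box_iou

# Illustration only: 2 ground-truth boxes (M=2) and 3 anchors (N=3),
# both in (x1, y1, x2, y2) format as used throughout torchvision.
gt_boxes = torch.tensor([[0., 0., 10., 10.],
                         [20., 20., 30., 30.]])
anchors_per_image = torch.tensor([[0., 0., 10., 10.],
                                  [5., 5., 15., 15.],
                                  [100., 100., 110., 110.]])

match_quality_matrix = box_iou(gt_boxes, anchors_per_image)
print(match_quality_matrix.shape)  # torch.Size([2, 3]), i.e. MxN
# Each anchor (column) is then assigned its best ground truth (row), or a
# negative value, by the Matcher according to high_threshold / low_threshold.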

torchvision/models/detection/anchor_utils.py

Lines changed: 3 additions & 0 deletions
@@ -1,7 +1,10 @@
 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
 import torch
+import torchvision
 from torch import nn
 
+from torch.jit.annotations import List, Optional, Dict
+
 
 class AnchorGenerator(nn.Module):
     __annotations__ = {

torchvision/models/detection/backbone_utils.py

Lines changed: 0 additions & 1 deletion
@@ -25,7 +25,6 @@ class BackboneWithFPN(nn.Module):
     Attributes:
         out_channels (int): the number of channels in the FPN
     """
-    def __init__(self, backbone, return_layers, in_channels_list, out_channels):
     def __init__(self, backbone, return_layers, in_channels_list, out_channels, extra_blocks=LastLevelMaxPool()):
         super(BackboneWithFPN, self).__init__()
         self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)

torchvision/models/detection/retinanet.py

Lines changed: 116 additions & 16 deletions
@@ -1,8 +1,10 @@
+import math
 from collections import OrderedDict
 
 import torch
 from torch import nn
 import torch.nn.functional as F
+from torch.jit.annotations import Dict, List, Tuple
 
 from ..utils import load_state_dict_from_url
 
@@ -36,13 +38,62 @@ def __init__(self, in_channels, num_anchors, num_classes):
     def compute_loss(self, targets, head_outputs, anchors, matched_idxs):
         return {
             'classification': self.classification_head.compute_loss(targets, head_outputs, anchors, matched_idxs),
-            'bbox_reg': self.regression_head.compute_loss(targets, head_outputs, anchors, matched_idxs),
+            'bbox_regression': self.regression_head.compute_loss(targets, head_outputs, anchors, matched_idxs),
         }
 
     def forward(self, x):
-        logits = [self.classification_head(feature) for feature in x]
+        cls_logits = [self.classification_head(feature) for feature in x]
         bbox_reg = [self.regression_head(feature) for feature in x]
-        return dict(logits=logits, bbox_reg=bbox_reg)
+        return dict(cls_logits=cls_logits, bbox_reg=bbox_reg)
+
+
+def sigmoid_focal_loss(
+    inputs,
+    targets,
+    alpha: float = 0.25,
+    gamma: float = 2,
+    reduction: str = "none",
+):
+    """
+    Original implementation from https://github.com/facebookresearch/fvcore/blob/master/fvcore/nn/focal_loss.py .
+    Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
+    Args:
+        inputs: A float tensor of arbitrary shape.
+                The predictions for each example.
+        targets: A float tensor with the same shape as inputs. Stores the binary
+                 classification label for each element in inputs
+                 (0 for the negative class and 1 for the positive class).
+        alpha: (optional) Weighting factor in range (0,1) to balance
+               positive vs negative examples or -1 for ignore. Default = 0.25
+        gamma: Exponent of the modulating factor (1 - p_t) to
+               balance easy vs hard examples.
+        reduction: 'none' | 'mean' | 'sum'
+                 'none': No reduction will be applied to the output.
+                 'mean': The output will be averaged.
+                 'sum': The output will be summed.
+    Returns:
+        Loss tensor with the reduction option applied.
+    """
+    p = torch.sigmoid(inputs)
+    ce_loss = F.binary_cross_entropy_with_logits(
+        inputs, targets, reduction="none"
+    )
+    p_t = p * targets + (1 - p) * (1 - targets)
+    loss = ce_loss * ((1 - p_t) ** gamma)
+
+    if alpha >= 0:
+        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
+        loss = alpha_t * loss
+
+    if reduction == "mean":
+        loss = loss.mean()
+    elif reduction == "sum":
+        loss = loss.sum()
+
+    return loss
+
+
+sigmoid_focal_loss_jit = torch.jit.script(sigmoid_focal_loss)
 
 
 class RetinaNetClassificationHead(nn.Module):
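
Note: a quick numeric illustration (not part of the commit) of how the focal term down-weights easy examples relative to plain BCE, assuming sigmoid_focal_loss as defined in the hunk above is in scope:

import torch
import torch.nn.functional as F

# Three logits against all-positive targets: confidently right, unsure,
# and confidently wrong. Values in the comments are approximate.
logits = torch.tensor([4.0, 0.0, -4.0])
targets = torch.ones(3)

bce = F.binary_cross_entropy_with_logits(logits, targets, reduction="none")
focal = sigmoid_focal_loss(logits, targets, reduction="none")

print(bce)    # ~[0.018, 0.693, 4.019]
print(focal)  # ~[1.5e-06, 0.043, 0.969]; the easy example is suppressed hardest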
@@ -55,21 +106,59 @@ class RetinaNetClassificationHead(nn.Module):
         num_classes (int): number of classes to be predicted
     """
 
-    def __init__(self, in_channels, num_anchors, num_classes):
+    def __init__(self, in_channels, num_anchors, num_classes, prior_probability=0.01):
         super(RetinaNetClassificationHead, self).__init__()
         self.conv1 = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
         self.conv2 = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
         self.conv3 = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
         self.conv4 = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
-        self.cls_logits = nn.Conv2d(in_channels, num_anchors * num_classes, kernel_size=3, stride=1)
 
         for l in self.children():
             torch.nn.init.normal_(l.weight, std=0.01)
             torch.nn.init.constant_(l.bias, 0)
 
+        self.cls_logits = nn.Conv2d(in_channels, num_anchors * num_classes, kernel_size=3, stride=1, padding=1)
+        torch.nn.init.normal_(self.cls_logits.weight, std=0.01)
+        torch.nn.init.constant_(self.cls_logits.bias, -math.log((1 - prior_probability) / prior_probability))
+
+        self.num_classes = num_classes
+        self.num_anchors = num_anchors
+
     def compute_loss(self, targets, head_outputs, anchors, matched_idxs):
-        # TODO Implement focal loss, is there an existing function for this?
-        return 0
+        loss = []
+
+        def permute_classification(tensor):
+            """ Permute classification output from (N, A * K, H, W) to (N, HWA, K). """
+            N, _, H, W = tensor.shape
+            tensor = tensor.view(N, -1, self.num_classes, H, W)
+            tensor = tensor.permute(0, 3, 4, 1, 2)
+            tensor = tensor.reshape(N, -1, self.num_classes)  # Size=(N, HWA, K)
+            return tensor
+
+        predicted_classification = head_outputs['cls_logits']
+        predicted_classification = [permute_classification(cls) for cls in predicted_classification]
+        predicted_classification = torch.cat(predicted_classification, dim=1)
+
+        for targets_per_image, predicted_classification_per_image, anchors_per_image, matched_idxs_per_image in zip(targets, predicted_classification, anchors, matched_idxs):
+            # determine only the foreground
+            foreground_idxs_per_image = matched_idxs_per_image >= 0
+            num_foreground = foreground_idxs_per_image.sum()
+
+            # create the target classification
+            gt_classes_target = torch.zeros_like(predicted_classification_per_image)
+            gt_classes_target[foreground_idxs_per_image, targets_per_image['labels'][matched_idxs_per_image[foreground_idxs_per_image]]] = 1
+
+            # find indices for which anchors should be ignored
+            valid_idxs_per_image = matched_idxs_per_image != det_utils.Matcher.BETWEEN_THRESHOLDS
+
+            # compute the classification loss
+            loss.append(sigmoid_focal_loss_jit(
+                predicted_classification_per_image[valid_idxs_per_image],
+                gt_classes_target[valid_idxs_per_image],
+                reduction='sum',
+            ) / max(1, num_foreground))
+
+        return sum(loss) / len(loss)
 
     def forward(self, x):
         x = F.relu(self.conv1(x))
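
Note on the new prior_probability bias (sketch, not in the commit): initializing cls_logits.bias to -log((1 - pi) / pi) makes every anchor start out predicting foreground with probability roughly pi, the RetinaNet trick that stops the huge number of easy negatives from dominating the focal loss early in training:

import math
import torch

prior_probability = 0.01  # pi in the RetinaNet paper
bias = -math.log((1 - prior_probability) / prior_probability)
print(bias)  # ~ -4.595

# With weights ~ N(0, 0.01), pre-activations are roughly the bias alone,
# so the initial foreground probability is sigmoid(bias) ~ pi:
print(torch.sigmoid(torch.tensor(bias)))  # tensor(0.0100)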
@@ -94,18 +183,29 @@ def __init__(self, in_channels, num_anchors):
         self.conv2 = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
         self.conv3 = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
         self.conv4 = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
-        self.bbox_reg = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=3, stride=1)
+        self.bbox_reg = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=3, stride=1, padding=1)
 
         for l in self.children():
             torch.nn.init.normal_(l.weight, std=0.01)
-            torch.nn.init.constant_(l.bias, 0)
+            torch.nn.init.zeros_(l.bias)
 
         self.box_coder = det_utils.BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))
 
     def compute_loss(self, targets, head_outputs, anchors, matched_idxs):
         loss = []
 
-        predicted_regression = head_outputs['bbox_reg'][0]
+        def permute_bbox_reg(tensor):
+            """ Permute bbox regression output from (N, 4 * A, H, W) to (N, HWA, 4). """
+            N, _, H, W = tensor.shape
+            tensor = tensor.view(N, -1, 4, H, W)
+            tensor = tensor.permute(0, 3, 4, 1, 2)
+            tensor = tensor.reshape(N, -1, 4)  # Size=(N, HWA, 4)
+            return tensor
+
+        predicted_regression = head_outputs['bbox_reg']
+        predicted_regression = [permute_bbox_reg(reg) for reg in predicted_regression]
+        predicted_regression = torch.cat(predicted_regression, dim=1)
+
         for targets_per_image, predicted_regression_per_image, anchors_per_image, matched_idxs_per_image in zip(targets, predicted_regression, anchors, matched_idxs):
             # get the targets corresponding GT for each proposal
             # NB: need to clamp the indices because we can have a single
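
Shape walk-through (illustrative sketch mirroring the permute helper above, with made-up sizes): a regression map with A anchors per location goes from (N, 4*A, H, W) to (N, H*W*A, 4) so per-anchor rows line up with the flattened anchors:

import torch

N, A, H, W = 2, 9, 5, 5            # batch, anchors per location, feature map size
raw = torch.randn(N, 4 * A, H, W)  # head output for one feature level

t = raw.view(N, A, 4, H, W)        # split the channel dim into (A, 4)
t = t.permute(0, 3, 4, 1, 2)       # -> (N, H, W, A, 4)
t = t.reshape(N, -1, 4)            # -> (N, H*W*A, 4)
print(t.shape)                     # torch.Size([2, 225, 4])
# torch.cat over levels then yields one (N, sum_HWA, 4) tensor per batch,
# which can be indexed per image alongside the flattened anchors.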
@@ -115,20 +215,20 @@ def compute_loss(self, targets, head_outputs, anchors, matched_idxs):
 
             # determine only the foreground indices, ignore the rest
             foreground_idxs_per_image = matched_idxs_per_image >= 0
+            num_foreground = foreground_idxs_per_image.sum()
 
             # select only the foreground boxes
             matched_gt_boxes_per_image = matched_gt_boxes_per_image[foreground_idxs_per_image, :]
-            print(predicted_regression_per_image.shape)
-            predicted_regression_per_image = predicted_regression_per_image['bbox_reg'][foreground_idxs_per_image, :]
+            predicted_regression_per_image = predicted_regression_per_image[foreground_idxs_per_image, :]
             anchors_per_image = anchors_per_image[foreground_idxs_per_image, :]
 
             # compute the regression targets
-            target_regression = self.box_coder.encode(matched_gt_boxes_per_image, anchors_per_image)
+            target_regression = self.box_coder.encode_single(matched_gt_boxes_per_image, anchors_per_image)
 
             # compute the loss
-            loss.append(torch.nn.SmoothL1Loss()(predicted_regression_per_image, target_regression))
+            loss.append(torch.nn.SmoothL1Loss(reduction='sum')(predicted_regression_per_image, target_regression) / max(1, num_foreground))
 
-        return sum(loss) / len(loss)
+        return sum(loss) / max(1, len(loss))
 
     def forward(self, x):
         x = F.relu(self.conv1(x))
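
Why the switch from the default mean reduction to reduction='sum' divided by num_foreground matters (toy numbers, illustration only): the mean would average over 4 * num_foreground coordinates, silently scaling the loss down by the number of box deltas per anchor:

import torch

pred = torch.randn(3, 4)    # 3 foreground anchors, 4 box deltas each
target = torch.randn(3, 4)
num_foreground = pred.shape[0]

mean_red = torch.nn.SmoothL1Loss()(pred, target)  # sum / 12 elements
per_fg = torch.nn.SmoothL1Loss(reduction='sum')(pred, target) / max(1, num_foreground)  # sum / 3 anchors

print(per_fg / mean_red)  # tensor(4.); the two differ exactly by the 4 coordinates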
@@ -251,7 +351,7 @@ def __init__(self, backbone, num_classes,
         self.anchor_generator = anchor_generator
 
         if head is None:
-            head = RetinaNetHead(backbone.out_channels, num_classes, anchor_generator.num_anchors_per_location()[0])
+            head = RetinaNetHead(backbone.out_channels, anchor_generator.num_anchors_per_location()[0], num_classes)
         self.head = head
 
         if proposal_matcher is None:
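
The reordering above fixes a swapped-argument bug: the head's constructor is RetinaNetHead(in_channels, num_anchors, num_classes), per its __init__ earlier in this file, so the anchors-per-location count must come second. A usage sketch with hypothetical values:

from torchvision.models.detection.retinanet import RetinaNetHead  # module path as in this commit

# Hypothetical values, for illustration only:
out_channels = 256  # typical FPN output width
num_anchors = 9     # anchors per spatial location (e.g. 3 scales x 3 ratios)
num_classes = 91    # e.g. COCO

head = RetinaNetHead(out_channels, num_anchors, num_classes)  # order now matches __init__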
