Add Lite R-ASPP with MobileNetV3 backbone.

datumbox · datumbox · commit 231a525386b5 · 2021-01-25T20:33:25.000Z
diff --git a/test/expect/ModelTester.test_lraspp_mobilenet_v3_large_expect.pkl b/test/expect/ModelTester.test_lraspp_mobilenet_v3_large_expect.pkl
diff --git a/test/test_models.py b/test/test_models.py
@@ -65,6 +65,7 @@ def get_available_video_models():
     "fcn_resnet50",
     "fcn_resnet101",
     "fcn_mobilenet_v3_large",
+    "lraspp_mobilenet_v3_large",
 )
 
 
diff --git a/torchvision/models/segmentation/lraspp.py b/torchvision/models/segmentation/lraspp.py
@@ -0,0 +1,48 @@
+from collections import OrderedDict
+
+from torch import nn
+from torch.nn import functional as F
+
+
+__all__ = ["LRASPP"]
+
+
+class LRASPP(nn.Module):  # backbone plus a two-branch head; forward() returns an OrderedDict with key "out"
+
+    def __init__(self, backbone, s8_channels, s16_channels, num_classes, inter_channels=128):
+        super().__init__()
+        self.backbone = backbone  # forward() indexes its output with "s8" and "s16"
+
+        self.cbr = nn.Sequential(  # 1x1 conv -> batch norm -> ReLU, applied to the s16 features
+            nn.Conv2d(s16_channels, inter_channels, 1, bias=False),
+            nn.BatchNorm2d(inter_channels),
+            nn.ReLU(inplace=True)
+        )
+        self.scale = nn.Sequential(  # gate: global average pool -> 1x1 conv -> sigmoid, also fed s16
+            nn.AdaptiveAvgPool2d(1),
+            nn.Conv2d(s16_channels, inter_channels, 1, bias=False),
+            nn.Sigmoid(),
+        )
+
+        self.s8_classifier = nn.Conv2d(s8_channels, num_classes, 1)  # 1x1 conv: s8 features -> num_classes channels
+        self.s16_classifier = nn.Conv2d(inter_channels, num_classes, 1)  # same, for the gated s16 branch
+
+    def forward(self, input):
+        input_shape = input.shape[-2:]  # last two dims of the input; target size of the final resize
+        features = self.backbone(input)
+
+        s8 = features["s8"]
+        s16 = features["s16"]
+
+        x = self.cbr(s16)
+        s = self.scale(s16)
+        x = x * s  # s is 1x1 spatially (AdaptiveAvgPool2d(1)), so it scales x per channel
+        x = F.interpolate(x, size=s8.shape[-2:], mode='bilinear', align_corners=False)  # match s8's spatial size
+
+        out = self.s8_classifier(s8) + self.s16_classifier(x)  # sum the two branches' outputs
+        out = F.interpolate(out, size=input_shape, mode='bilinear', align_corners=False)  # resize to input size
+
+        result = OrderedDict()
+        result["out"] = out
+
+        return result
diff --git a/torchvision/models/segmentation/segmentation.py b/torchvision/models/segmentation/segmentation.py
@@ -4,10 +4,11 @@
 from .. import resnet
 from .deeplabv3 import DeepLabHead, DeepLabV3
 from .fcn import FCN, FCNHead
+from .lraspp import LRASPP
 
 
 __all__ = ['fcn_resnet50', 'fcn_resnet101', 'fcn_mobilenet_v3_large', 'deeplabv3_resnet50', 'deeplabv3_resnet101',
-           'deeplabv3_mobilenet_v3_large']
+           'deeplabv3_mobilenet_v3_large', 'lraspp_mobilenet_v3_large']
 
 
 model_urls = {
@@ -17,6 +18,7 @@
     'deeplabv3_resnet50_coco': 'https://download.pytorch.org/models/deeplabv3_resnet50_coco-cd0a2569.pth',
     'deeplabv3_resnet101_coco': 'https://download.pytorch.org/models/deeplabv3_resnet101_coco-586e9e4e.pth',
     'deeplabv3_mobilenet_v3_large_coco': None,
+    'lraspp_mobilenet_v3_large_coco': None,
 }
 
 
@@ -64,18 +66,39 @@ def _segm_model(name, backbone_name, num_classes, aux, pretrained_backbone=True)
     return model
 
 
+def _segm_mobilenetv3(backbone_name, num_classes, pretrained_backbone=True):  # LRASPP over a mobilenetv3 backbone
+    backbone = mobilenetv3.__dict__[backbone_name](pretrained=pretrained_backbone, _dilated=True).features
+
+    # Gather the indices of blocks which are strided. These are the locations of C1, ..., Cn-1 blocks.
+    # The first and last blocks are always included because they are the C0 (conv1) and Cn.
+    stage_indices = [0] + [i for i, b in enumerate(backbone) if getattr(b, "_is_cn", False)] + [len(backbone) - 1]
+    s8_pos = stage_indices[-4]  # use C2 here which has output_stride = 8
+    s16_pos = stage_indices[-1]  # use C5 which has output_stride = 16
+    s8_channels = backbone[s8_pos].out_channels
+    s16_channels = backbone[s16_pos].out_channels
+
+    backbone = IntermediateLayerGetter(backbone, return_layers={str(s8_pos): 's8', str(s16_pos): 's16'})
+
+    model = LRASPP(backbone, s8_channels, s16_channels, num_classes)  # backbone now returns 's8'/'s16' features
+    return model
+
+
+def _load_weights(model, arch_type, backbone, progress):  # loads a model_urls checkpoint into model; returns None
+    arch = arch_type + '_' + backbone + '_coco'  # model_urls key, e.g. 'lraspp_mobilenet_v3_large_coco'
+    model_url = model_urls[arch]
+    if model_url is None:  # the arch is registered in model_urls but has no checkpoint URL
+        raise NotImplementedError('pretrained {} is not supported as of now'.format(arch))
+    else:
+        state_dict = load_state_dict_from_url(model_url, progress=progress)
+        model.load_state_dict(state_dict)
+
+
 def _load_model(arch_type, backbone, pretrained, progress, num_classes, aux_loss, **kwargs):
     if pretrained:
         aux_loss = True
     model = _segm_model(arch_type, backbone, num_classes, aux_loss, **kwargs)
     if pretrained:
-        arch = arch_type + '_' + backbone + '_coco'
-        model_url = model_urls[arch]
-        if model_url is None:
-            raise NotImplementedError('pretrained {} is not supported as of now'.format(arch))
-        else:
-            state_dict = load_state_dict_from_url(model_url, progress=progress)
-            model.load_state_dict(state_dict)
+        _load_weights(model, arch_type, backbone, progress)
     return model
 
 
@@ -161,3 +184,24 @@ def deeplabv3_mobilenet_v3_large(pretrained=False, progress=True,
         aux_loss (bool): If True, it uses an auxiliary loss
     """
     return _load_model('deeplabv3', 'mobilenet_v3_large', pretrained, progress, num_classes, aux_loss, **kwargs)
+
+
+def lraspp_mobilenet_v3_large(pretrained=False, progress=True, num_classes=21, **kwargs):
+    """Constructs a Lite R-ASPP Network model with a MobileNetV3-Large backbone.
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on COCO train2017 which
+            contains the same classes as Pascal VOC
+        progress (bool): If True, displays a progress bar of the download to stderr
+        num_classes (int): number of output classes of the model (including the background)
+    """
+    if kwargs.pop("aux_loss", False):  # aux_loss is always removed from kwargs; only a truthy value is rejected
+        raise NotImplementedError('This model does not use auxiliary loss')
+
+    backbone_name = 'mobilenet_v3_large'
+    model = _segm_mobilenetv3(backbone_name, num_classes, **kwargs)  # remaining kwargs, e.g. pretrained_backbone
+
+    if pretrained:  # NOTE: raises NotImplementedError while model_urls['lraspp_mobilenet_v3_large_coco'] is None
+        _load_weights(model, 'lraspp', backbone_name, progress)
+
+    return model

Original file line number	Diff line number	Diff line change
`@@ -65,6 +65,7 @@ def get_available_video_models():`
`65`	`65`	`"fcn_resnet50",`
`66`	`66`	`"fcn_resnet101",`
`67`	`67`	`"fcn_mobilenet_v3_large",`
	`68`	`+ "lraspp_mobilenet_v3_large",`
`68`	`69`	`)`
`69`	`70`
`70`	`71`