Commit 10a51cf (parent f64bfed)

Add dilation support on MobileNetV3 for Segmentation.

5 files changed: +50 -50 lines (2 binary files changed, not shown)

torchvision/models/mobilenetv2.py
Lines changed: 4 additions & 2 deletions

@@ -38,14 +38,16 @@ def __init__(
         groups: int = 1,
         norm_layer: Optional[Callable[..., nn.Module]] = None,
         activation_layer: Optional[Callable[..., nn.Module]] = None,
+        dilation: int = 1,
     ) -> None:
-        padding = (kernel_size - 1) // 2
+        padding = (kernel_size - 1) // 2 * dilation
         if norm_layer is None:
             norm_layer = nn.BatchNorm2d
         if activation_layer is None:
             activation_layer = nn.ReLU6
         super(ConvBNReLU, self).__init__(
-            nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False),
+            nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, dilation=dilation, groups=groups,
+                      bias=False),
             norm_layer(out_planes),
             activation_layer(inplace=True)
         )
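
Why the padding changes with dilation: a dilated k x k kernel spans an effective extent of (k - 1) * dilation + 1 pixels, so "same" padding for odd kernels becomes (k - 1) // 2 * dilation instead of (k - 1) // 2. A minimal standalone sketch (not part of the commit; the channel and input sizes are illustrative):

    import torch
    import torch.nn as nn

    kernel_size, dilation = 3, 2
    padding = (kernel_size - 1) // 2 * dilation          # 2, as computed in the diff above
    conv = nn.Conv2d(16, 16, kernel_size, stride=1, padding=padding,
                     dilation=dilation, bias=False)
    x = torch.randn(1, 16, 32, 32)
    print(conv(x).shape)                                 # torch.Size([1, 16, 32, 32]): spatial size preserved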

torchvision/models/mobilenetv3.py
Lines changed: 41 additions & 43 deletions

@@ -38,14 +38,15 @@ def forward(self, input: Tensor) -> Tensor:
 class InvertedResidualConfig:

     def __init__(self, input_channels: int, kernel: int, expanded_channels: int, out_channels: int, use_se: bool,
-                 activation: str, stride: int, width_mult: float):
+                 activation: str, stride: int, dilation: int, width_mult: float):
         self.input_channels = self.adjust_channels(input_channels, width_mult)
         self.kernel = kernel
         self.expanded_channels = self.adjust_channels(expanded_channels, width_mult)
         self.out_channels = self.adjust_channels(out_channels, width_mult)
         self.use_se = use_se
         self.use_hs = activation == "HS"
         self.stride = stride
+        self.dilation = dilation

     @staticmethod
     def adjust_channels(channels: int, width_mult: float):

@@ -70,9 +71,10 @@ def __init__(self, cnf: InvertedResidualConfig, norm_layer: Callable[..., nn.Mod
                                            norm_layer=norm_layer, activation_layer=activation_layer))

         # depthwise
+        stride = 1 if cnf.dilation > 1 else cnf.stride
         layers.append(ConvBNActivation(cnf.expanded_channels, cnf.expanded_channels, kernel_size=cnf.kernel,
-                                       stride=cnf.stride, groups=cnf.expanded_channels, norm_layer=norm_layer,
-                                       activation_layer=activation_layer))
+                                       stride=stride, dilation=cnf.dilation, groups=cnf.expanded_channels,
+                                       norm_layer=norm_layer, activation_layer=activation_layer))
         if cnf.use_se:
             layers.append(SqueezeExcitation(cnf.expanded_channels))
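
The `stride = 1 if cnf.dilation > 1 else cnf.stride` line is what turns a strided stage into a dilated one: a block configured with dilation > 1 keeps stride 1, so the feature map stops shrinking while the depthwise kernel still covers a larger receptive field. A rough, illustrative sketch of the effect on the overall output stride (not code from the commit):

    # (stride, dilation) of the C4/C5 tail blocks when the model is built with _dilated=True
    tail_configs = [(2, 2), (1, 2), (1, 2)]
    output_stride = 16                      # stride already accumulated before C4 in MobileNetV3-Large
    for stride, dilation in tail_configs:
        output_stride *= 1 if dilation > 1 else stride
    print(output_stride)                    # 16: the dilated tail no longer downsamples (32 without dilation)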

@@ -194,78 +196,74 @@ def _mobilenet_v3(
     return model


-def mobilenet_v3_large(pretrained: bool = False, progress: bool = True, reduced_tail: bool = False,
-                       **kwargs: Any) -> MobileNetV3:
+def mobilenet_v3_large(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> MobileNetV3:
     """
     Constructs a large MobileNetV3 architecture from
     `"Searching for MobileNetV3" <https://arxiv.org/abs/1905.02244>`_.

     Args:
         pretrained (bool): If True, returns a model pre-trained on ImageNet
         progress (bool): If True, displays a progress bar of the download to stderr
-        reduced_tail (bool): If True, reduces the channel counts of all feature layers
-            between C4 and C5 by 2. It is used to reduce the channel redundancy in the
-            backbone for Detection and Segmentation.
     """
+    # non-public config parameters
+    reduce_divider = 2 if kwargs.pop('_reduced_tail', False) else 1
+    dilation = 2 if kwargs.pop('_dilated', False) else 1
     width_mult = 1.0
+
     bneck_conf = partial(InvertedResidualConfig, width_mult=width_mult)
     adjust_channels = partial(InvertedResidualConfig.adjust_channels, width_mult=width_mult)

-    reduce_divider = 2 if reduced_tail else 1
-
     inverted_residual_setting = [
-        bneck_conf(16, 3, 16, 16, False, "RE", 1),
-        bneck_conf(16, 3, 64, 24, False, "RE", 2),  # C1
-        bneck_conf(24, 3, 72, 24, False, "RE", 1),
-        bneck_conf(24, 5, 72, 40, True, "RE", 2),  # C2
-        bneck_conf(40, 5, 120, 40, True, "RE", 1),
-        bneck_conf(40, 5, 120, 40, True, "RE", 1),
-        bneck_conf(40, 3, 240, 80, False, "HS", 2),  # C3
-        bneck_conf(80, 3, 200, 80, False, "HS", 1),
-        bneck_conf(80, 3, 184, 80, False, "HS", 1),
-        bneck_conf(80, 3, 184, 80, False, "HS", 1),
-        bneck_conf(80, 3, 480, 112, True, "HS", 1),
-        bneck_conf(112, 3, 672, 112, True, "HS", 1),
-        bneck_conf(112, 5, 672, 160 // reduce_divider, True, "HS", 2),  # C4
-        bneck_conf(160 // reduce_divider, 5, 960 // reduce_divider, 160 // reduce_divider, True, "HS", 1),
-        bneck_conf(160 // reduce_divider, 5, 960 // reduce_divider, 160 // reduce_divider, True, "HS", 1),
+        bneck_conf(16, 3, 16, 16, False, "RE", 1, 1),
+        bneck_conf(16, 3, 64, 24, False, "RE", 2, 1),  # C1
+        bneck_conf(24, 3, 72, 24, False, "RE", 1, 1),
+        bneck_conf(24, 5, 72, 40, True, "RE", 2, 1),  # C2
+        bneck_conf(40, 5, 120, 40, True, "RE", 1, 1),
+        bneck_conf(40, 5, 120, 40, True, "RE", 1, 1),
+        bneck_conf(40, 3, 240, 80, False, "HS", 2, 1),  # C3
+        bneck_conf(80, 3, 200, 80, False, "HS", 1, 1),
+        bneck_conf(80, 3, 184, 80, False, "HS", 1, 1),
+        bneck_conf(80, 3, 184, 80, False, "HS", 1, 1),
+        bneck_conf(80, 3, 480, 112, True, "HS", 1, 1),
+        bneck_conf(112, 3, 672, 112, True, "HS", 1, 1),
+        bneck_conf(112, 5, 672, 160 // reduce_divider, True, "HS", 2, dilation),  # C4
+        bneck_conf(160 // reduce_divider, 5, 960 // reduce_divider, 160 // reduce_divider, True, "HS", 1, dilation),
+        bneck_conf(160 // reduce_divider, 5, 960 // reduce_divider, 160 // reduce_divider, True, "HS", 1, dilation),
     ]
     last_channel = adjust_channels(1280 // reduce_divider)  # C5

     return _mobilenet_v3("mobilenet_v3_large", inverted_residual_setting, last_channel, pretrained, progress, **kwargs)


-def mobilenet_v3_small(pretrained: bool = False, progress: bool = True, reduced_tail: bool = False,
-                       **kwargs: Any) -> MobileNetV3:
+def mobilenet_v3_small(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> MobileNetV3:
     """
     Constructs a small MobileNetV3 architecture from
     `"Searching for MobileNetV3" <https://arxiv.org/abs/1905.02244>`_.

     Args:
         pretrained (bool): If True, returns a model pre-trained on ImageNet
         progress (bool): If True, displays a progress bar of the download to stderr
-        reduced_tail (bool): If True, reduces the channel counts of all feature layers
-            between C4 and C5 by 2. It is used to reduce the channel redundancy in the
-            backbone for Detection and Segmentation.
     """
+    # non-public config parameters
+    reduce_divider = 2 if kwargs.pop('_reduced_tail', False) else 1
+    dilation = 2 if kwargs.pop('_dilated', False) else 1
     width_mult = 1.0
+
     bneck_conf = partial(InvertedResidualConfig, width_mult=width_mult)
     adjust_channels = partial(InvertedResidualConfig.adjust_channels, width_mult=width_mult)

-    reduce_divider = 2 if reduced_tail else 1
-
     inverted_residual_setting = [
-        bneck_conf(16, 3, 16, 16, True, "RE", 2),  # C1
-        bneck_conf(16, 3, 72, 24, False, "RE", 2),  # C2
-        bneck_conf(24, 3, 88, 24, False, "RE", 1),
-        bneck_conf(24, 5, 96, 40, True, "HS", 2),  # C3
-        bneck_conf(40, 5, 240, 40, True, "HS", 1),
-        bneck_conf(40, 5, 240, 40, True, "HS", 1),
-        bneck_conf(40, 5, 120, 48, True, "HS", 1),
-        bneck_conf(48, 5, 144, 48, True, "HS", 1),
-        bneck_conf(48, 5, 288, 96 // reduce_divider, True, "HS", 2),  # C4
-        bneck_conf(96 // reduce_divider, 5, 576 // reduce_divider, 96 // reduce_divider, True, "HS", 1),
-        bneck_conf(96 // reduce_divider, 5, 576 // reduce_divider, 96 // reduce_divider, True, "HS", 1),
+        bneck_conf(16, 3, 16, 16, True, "RE", 2, 1),  # C1
+        bneck_conf(16, 3, 72, 24, False, "RE", 2, 1),  # C2
+        bneck_conf(24, 3, 88, 24, False, "RE", 1, 1),
+        bneck_conf(24, 5, 96, 40, True, "HS", 2, 1),  # C3
+        bneck_conf(40, 5, 240, 40, True, "HS", 1, 1),
+        bneck_conf(40, 5, 240, 40, True, "HS", 1, 1),
+        bneck_conf(40, 5, 120, 48, True, "HS", 1, 1),
+        bneck_conf(48, 5, 144, 48, True, "HS", 1, 1),
+        bneck_conf(48, 5, 288, 96 // reduce_divider, True, "HS", 2, dilation),  # C4
+        bneck_conf(96 // reduce_divider, 5, 576 // reduce_divider, 96 // reduce_divider, True, "HS", 1, dilation),
+        bneck_conf(96 // reduce_divider, 5, 576 // reduce_divider, 96 // reduce_divider, True, "HS", 1, dilation),
     ]
     last_channel = adjust_channels(1024 // reduce_divider)  # C5
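
The `_reduced_tail` and `_dilated` flags are popped out of `**kwargs` rather than exposed as named parameters, so the public signatures stay unchanged while the detection and segmentation builders can still reach them. A hypothetical direct call (not from the commit; ordinary users are expected to go through the task-specific builders):

    from torchvision.models import mobilenetv3

    # '_dilated' keeps the output stride at 16; '_reduced_tail' halves the C4/C5 channel counts.
    backbone = mobilenetv3.mobilenet_v3_large(pretrained=False, _dilated=True, _reduced_tail=True).features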

torchvision/models/segmentation/segmentation.py
Lines changed: 5 additions & 5 deletions

@@ -1,6 +1,6 @@
 from .._utils import IntermediateLayerGetter
 from ..utils import load_state_dict_from_url
-from .. import mobilenet
+from .. import mobilenetv3
 from .. import resnet
 from .deeplabv3 import DeepLabHead, DeepLabV3
 from .fcn import FCN, FCNHead

@@ -29,16 +29,16 @@ def _segm_model(name, backbone_name, num_classes, aux, pretrained_backbone=True)
         out_inplanes = 2048
         aux_layer = 'layer3'
         aux_inplanes = 1024
-    elif 'mobilenet' in backbone_name:
-        backbone = mobilenet.__dict__[backbone_name](pretrained=pretrained_backbone).features
+    elif 'mobilenet_v3' in backbone_name:
+        backbone = mobilenetv3.__dict__[backbone_name](pretrained=pretrained_backbone, _dilated=True).features

         # Gather the indeces of blocks which are strided. These are the locations of C1, ..., Cn-1 blocks.
         # The first and last blocks are always included because they are the C0 (conv1) and Cn.
         stage_indices = [0] + [i for i, b in enumerate(backbone) if getattr(b, "_is_cn", False)] + [len(backbone) - 1]
-        out_pos = stage_indices[-1]
+        out_pos = stage_indices[-1]  # use C5 which has output_stride = 16
         out_layer = str(out_pos)
         out_inplanes = backbone[out_pos].out_channels
-        aux_pos = stage_indices[-2]
+        aux_pos = stage_indices[-4]  # use C2 here which has output_stride = 8
         aux_layer = str(aux_pos)
         aux_inplanes = backbone[aux_pos].out_channels
     else:
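
For intuition on the index bookkeeping above, here is a small self-contained sketch (hypothetical stand-in blocks, not the real backbone) of how stage_indices ends up selecting C5 as the output layer and C2 as the auxiliary layer for MobileNetV3-Large:

    # Stand-ins for the feature blocks; the real InvertedResidual blocks are expected to
    # expose '_is_cn' on the first block of each downsampling stage.
    class Block:
        def __init__(self, is_cn):
            self._is_cn = is_cn

    # Assumed layout of features: stem conv [0], the 15 bottlenecks above [1..15], last 1x1 conv [16];
    # the stride-2 bottlenecks (marked C1..C4 in the config) land at indices 2, 4, 7 and 13.
    backbone = [Block(i in (2, 4, 7, 13)) for i in range(17)]

    stage_indices = [0] + [i for i, b in enumerate(backbone) if getattr(b, "_is_cn", False)] + [len(backbone) - 1]
    print(stage_indices)         # [0, 2, 4, 7, 13, 16]
    out_pos = stage_indices[-1]  # 16 -> C5 (output_stride 16 with the dilated tail)
    aux_pos = stage_indices[-4]  # 4  -> C2 (output_stride 8)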
