
Commit c672230

Revert vit_h_14 as it breaks our CI (#5259)
1 parent 4bf6c6e commit c672230

5 files changed: +0 −47 lines


docs/source/models.rst

Lines changed: 0 additions & 2 deletions
@@ -89,7 +89,6 @@ You can construct a model with random weights by calling its constructor:
     vit_b_32 = models.vit_b_32()
     vit_l_16 = models.vit_l_16()
     vit_l_32 = models.vit_l_32()
-    vit_h_14 = models.vit_h_14()
 
 We provide pre-trained models, using the PyTorch :mod:`torch.utils.model_zoo`.
 These can be constructed by passing ``pretrained=True``:

@@ -464,7 +463,6 @@ VisionTransformer
     vit_b_32
     vit_l_16
     vit_l_32
-    vit_h_14
 
 Quantized Models
 ----------------
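
With ``vit_h_14`` dropped from the documented constructor list, the remaining ViT entries still build exactly as the surrounding docs describe. A minimal sketch, assuming torchvision at this commit, where the ViT constructors take no required arguments and default to 224×224 inputs with a 1000-class head:

import torch
from torchvision import models

# Build one of the ViT variants that remain documented after this revert,
# with random weights (no download needed).
model = models.vit_l_16()
model.eval()

# ViT-L/16 defaults to 224x224 inputs; a dummy batch exercises the forward pass.
with torch.no_grad():
    out = model(torch.randn(1, 3, 224, 224))

print(out.shape)  # torch.Size([1, 1000]) for the default 1000-class head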

hubconf.py

Lines changed: 0 additions & 1 deletion
@@ -63,5 +63,4 @@
     vit_b_32,
     vit_l_16,
     vit_l_32,
-    vit_h_14,
 )
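
hubconf.py is the entry-point list that ``torch.hub`` reads, so removing ``vit_h_14`` here also removes it from hub loading, while the remaining names stay loadable. A small sketch, assuming network access to fetch the ``pytorch/vision`` repo:

import torch

# Load one of the ViT builders still exported through hubconf.py.
# Note: this fetches the pytorch/vision repo on first use and builds the
# model with random weights unless pretrained weights are requested.
model = torch.hub.load("pytorch/vision", "vit_b_16")
model.eval()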
Binary file changed (−939 Bytes): not shown.

torchvision/models/vision_transformer.py

Lines changed: 0 additions & 21 deletions
@@ -15,7 +15,6 @@
     "vit_b_32",
     "vit_l_16",
     "vit_l_32",
-    "vit_h_14",
 ]
 
 model_urls = {

@@ -357,26 +356,6 @@ def vit_l_32(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer:
     )
 
 
-def vit_h_14(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer:
-    """
-    Constructs a vit_h_14 architecture from
-    `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" <https://arxiv.org/abs/2010.11929>`_.
-
-    NOTE: Pretrained weights are not available for this model.
-    """
-    return _vision_transformer(
-        arch="vit_h_14",
-        patch_size=14,
-        num_layers=32,
-        num_heads=16,
-        hidden_dim=1280,
-        mlp_dim=5120,
-        pretrained=pretrained,
-        progress=progress,
-        **kwargs,
-    )
-
-
 def interpolate_embeddings(
     image_size: int,
     patch_size: int,
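
For anyone who still needs the ViT-H/14 configuration after this revert, the removed builder only forwarded fixed hyperparameters to the ``VisionTransformer`` class, so an equivalent model can be built by hand. A hedged sketch: the keyword names below are assumed from how ``_vision_transformer`` forwards them, and ``image_size=224`` mirrors the default the other builders use; note this is a very large model:

import torch
from torchvision.models.vision_transformer import VisionTransformer

# Equivalent of the removed vit_h_14 builder, constructed directly.
# image_size=224 is an assumption (the default used by the other builders);
# the remaining hyperparameters are the ones the removed code passed through.
vit_h_14 = VisionTransformer(
    image_size=224,
    patch_size=14,
    num_layers=32,
    num_heads=16,
    hidden_dim=1280,
    mlp_dim=5120,
)

with torch.no_grad():
    out = vit_h_14(torch.randn(1, 3, 224, 224))

print(out.shape)  # torch.Size([1, 1000])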

torchvision/prototype/models/vision_transformer.py

Lines changed: 0 additions & 23 deletions
@@ -19,12 +19,10 @@
     "ViT_B_32_Weights",
     "ViT_L_16_Weights",
     "ViT_L_32_Weights",
-    "ViT_H_14_Weights",
     "vit_b_16",
     "vit_b_32",
     "vit_l_16",
     "vit_l_32",
-    "vit_h_14",
 ]
 
 

@@ -105,11 +103,6 @@ class ViT_L_32_Weights(WeightsEnum):
     default = ImageNet1K_V1
 
 
-class ViT_H_14_Weights(WeightsEnum):
-    # Weights are not available yet.
-    pass
-
-
 def _vision_transformer(
     patch_size: int,
     num_layers: int,

@@ -203,19 +196,3 @@ def vit_l_32(*, weights: Optional[ViT_L_32_Weights] = None, progress: bool = True, **kwargs: Any) -> VisionTransformer:
         progress=progress,
         **kwargs,
     )
-
-
-@handle_legacy_interface(weights=("pretrained", None))
-def vit_h_14(*, weights: Optional[ViT_H_14_Weights] = None, progress: bool = True, **kwargs: Any) -> VisionTransformer:
-    weights = ViT_H_14_Weights.verify(weights)
-
-    return _vision_transformer(
-        patch_size=14,
-        num_layers=32,
-        num_heads=16,
-        hidden_dim=1280,
-        mlp_dim=5120,
-        weights=weights,
-        progress=progress,
-        **kwargs,
-    )
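
The prototype module keeps the multi-weight builder pattern for the variants that remain; the context lines above show the shape of that API. A usage sketch, assuming the names visible in this diff (``vit_l_32``, ``ViT_L_32_Weights`` with an ``ImageNet1K_V1`` member) and that the prototype package is importable at this commit:

from torchvision.prototype.models.vision_transformer import (
    ViT_L_32_Weights,
    vit_l_32,
)

# Random initialization (the path the legacy `pretrained=False` maps to).
model = vit_l_32(weights=None)

# Pretrained weights through the enum; the builder runs
# ViT_L_32_Weights.verify(weights) before constructing the model.
model = vit_l_32(weights=ViT_L_32_Weights.ImageNet1K_V1)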
