
Commit 3e7683f

Add support to MViT v1 (#6179)

* Switch implementation to the v1 variant.
* Fix docs.
* Add back a v2 pseudovariant.
* Change the way the networks are configured.
* Temporarily remove v2.
* Add weights.
* Expand _squeeze/_unsqueeze to support arbitrary dims.
* Update the references script.
* Fix tests.
* Fix frames and preprocessing.
* Fix std/mean values in transforms.
* Add permanent Dropout and update the weights.
* Update accuracies.

1 parent: c603159

9 files changed: +286 −240 lines


docs/source/models.rst
Lines changed: 1 addition & 1 deletion

@@ -465,7 +465,7 @@ pre-trained weights:
 .. toctree::
    :maxdepth: 1
 
-   models/video_mvitv2
+   models/video_mvit
    models/video_resnet
 
 |

docs/source/models/video_mvitv2.rst renamed to docs/source/models/video_mvit.rst
Lines changed: 4 additions & 6 deletions

@@ -12,17 +12,15 @@ The MViT V2 model is based on the
 Model builders
 --------------
 
-The following model builders can be used to instantiate a MViTV2 model, with or
+The following model builders can be used to instantiate a MViT model, with or
 without pre-trained weights. All the model builders internally rely on the
-``torchvision.models.video.MViTV2`` base class. Please refer to the `source
+``torchvision.models.video.MViT`` base class. Please refer to the `source
 code
-<https://github.com/pytorch/vision/blob/main/torchvision/models/video/mvitv2.py>`_ for
+<https://github.com/pytorch/vision/blob/main/torchvision/models/video/mvit.py>`_ for
 more details about this class.
 
 .. autosummary::
    :toctree: generated/
    :template: function.rst
 
-   mvit_v2_t
-   mvit_v2_s
-   mvit_v2_b
+   mvit_v1_b
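
With the rename, mvit_v1_b is now the only video MViT builder. A minimal usage sketch; the MViT_V1_B_Weights enum and its KINETICS400_V1 entry are assumed here from the "Add weights" bullet in the commit message and torchvision's usual <builder>_Weights naming convention:

    import torch
    from torchvision.models.video import mvit_v1_b, MViT_V1_B_Weights

    # Build MViT-B (v1) with the Kinetics-400 weights added in this commit.
    # MViT_V1_B_Weights.KINETICS400_V1 is an assumed enum name.
    weights = MViT_V1_B_Weights.KINETICS400_V1
    model = mvit_v1_b(weights=weights).eval()

    # weights.transforms() bundles the preprocessing preset, including the
    # std/mean values fixed by this commit.
    preprocess = weights.transforms()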

references/video_classification/train.py
Lines changed: 3 additions & 2 deletions

@@ -152,7 +152,7 @@ def main(args):
         split="train",
         step_between_clips=1,
         transform=transform_train,
-        frame_rate=15,
+        frame_rate=args.frame_rate,
         extensions=(
             "avi",
             "mp4",
@@ -189,7 +189,7 @@ def main(args):
         split="val",
         step_between_clips=1,
         transform=transform_test,
-        frame_rate=15,
+        frame_rate=args.frame_rate,
         extensions=(
             "avi",
             "mp4",
@@ -324,6 +324,7 @@ def parse_args():
     parser.add_argument("--model", default="r2plus1d_18", type=str, help="model name")
     parser.add_argument("--device", default="cuda", type=str, help="device (Use cuda or cpu Default: cuda)")
     parser.add_argument("--clip-len", default=16, type=int, metavar="N", help="number of frames per clip")
+    parser.add_argument("--frame-rate", default=15, type=int, metavar="N", help="the frame rate")
     parser.add_argument(
         "--clips-per-video", default=5, type=int, metavar="N", help="maximum number of clips per video to consider"
     )
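
The previously hard-coded frame_rate=15 is now exposed as --frame-rate. A minimal sketch of how the flag reaches the dataset (the Kinetics call is abbreviated to a comment):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--clip-len", default=16, type=int, metavar="N", help="number of frames per clip")
    parser.add_argument("--frame-rate", default=15, type=int, metavar="N", help="the frame rate")
    args = parser.parse_args(["--frame-rate", "7"])

    # train.py forwards the value to the dataset, roughly:
    #   torchvision.datasets.Kinetics(..., frame_rate=args.frame_rate)
    # so each args.clip_len-frame clip is resampled to 7 fps here instead of
    # the fixed 15 fps used before this commit.
    print(args.clip_len, args.frame_rate)  # 16 7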
(binary file changed: −939 bytes, not shown)

test/test_extended_models.py
Lines changed: 3 additions & 1 deletion

@@ -181,7 +181,7 @@ def test_transforms_jit(model_fn):
         "input_shape": (1, 3, 520, 520),
     },
     "video": {
-        "input_shape": (1, 4, 3, 112, 112),
+        "input_shape": (1, 3, 4, 112, 112),
     },
     "optical_flow": {
         "input_shape": (1, 3, 128, 128),
@@ -195,6 +195,8 @@ def test_transforms_jit(model_fn):
     if module_name == "optical_flow":
         args = (x, x)
     else:
+        if module_name == "video":
+            x = x.permute(0, 2, 1, 3, 4)
         args = (x,)
 
     problematic_weights = []
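
Both changes reflect the two clip layouts in play: the video models consume (B, C, T, H, W) tensors, while the video transform presets expect time before channels. A small sketch of the axis swap the test now performs:

    import torch

    # Model layout: batch, channels, time, height, width.
    model_layout = torch.rand(1, 3, 4, 112, 112)

    # Preset layout: batch, time, channels, height, width.
    # permute(0, 2, 1, 3, 4) swaps the C and T axes.
    preset_layout = model_layout.permute(0, 2, 1, 3, 4)
    print(preset_layout.shape)  # torch.Size([1, 4, 3, 112, 112])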

test/test_models.py
Lines changed: 1 addition & 8 deletions

@@ -309,15 +309,9 @@ def _check_input_backprop(model, inputs):
         "image_size": 56,
         "input_shape": (1, 3, 56, 56),
     },
-    "mvit_v2_t": {
+    "mvit_v1_b": {
         "input_shape": (1, 3, 16, 224, 224),
     },
-    "mvit_v2_s": {
-        "input_shape": (1, 3, 16, 224, 224),
-    },
-    "mvit_v2_b": {
-        "input_shape": (1, 3, 32, 224, 224),
-    },
 }
 # speeding up slow models:
 slow_models = [
@@ -347,7 +341,6 @@ def _check_input_backprop(model, inputs):
 skipped_big_models = {
     "vit_h_14",
     "regnet_y_128gf",
-    "mvit_v2_b",
 }
 
 # The following contains configuration and expected values to be used tests that are model specific
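
A sketch of what the updated test config exercises: a randomly initialised mvit_v1_b must run a forward pass on a 16-frame 224×224 clip. The head size of 400 assumed in the comment below is the usual Kinetics-400 default for torchvision video models:

    import torch
    from torchvision.models.video import mvit_v1_b

    model = mvit_v1_b()  # random init; no weights needed for the smoke test
    clip = torch.rand(1, 3, 16, 224, 224)  # the input_shape from the config above
    out = model(clip)
    print(out.shape)  # torch.Size([1, 400])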

torchvision/models/video/__init__.py
Lines changed: 1 addition & 1 deletion

@@ -1,2 +1,2 @@
-from .mvitv2 import *
+from .mvit import *
 from .resnet import *
