
Commit 6412fb0

Merge branch 'main' into fix-shear-autoaugment

2 parents 3d7b5d5 + 8a16e12

35 files changed: +724 -338 lines

README.rst

Lines changed: 2 additions & 0 deletions

@@ -23,6 +23,8 @@ supported Python versions.
 +==========================+==========================+=================================+
 | ``main`` / ``nightly``   | ``main`` / ``nightly``   | ``>=3.7``, ``<=3.9``            |
 +--------------------------+--------------------------+---------------------------------+
+| ``1.10.2``               | ``0.11.3``               | ``>=3.6``, ``<=3.9``            |
++--------------------------+--------------------------+---------------------------------+
 | ``1.10.1``               | ``0.11.2``               | ``>=3.6``, ``<=3.9``            |
 +--------------------------+--------------------------+---------------------------------+
 | ``1.10.0``               | ``0.11.1``               | ``>=3.6``, ``<=3.9``            |

docs/source/models.rst

Lines changed: 24 additions & 1 deletion

@@ -89,6 +89,10 @@ You can construct a model with random weights by calling its constructor:
     vit_b_32 = models.vit_b_32()
     vit_l_16 = models.vit_l_16()
     vit_l_32 = models.vit_l_32()
+    convnext_tiny = models.convnext_tiny()
+    convnext_small = models.convnext_small()
+    convnext_base = models.convnext_base()
+    convnext_large = models.convnext_large()

 We provide pre-trained models, using the PyTorch :mod:`torch.utils.model_zoo`.
 These can be constructed by passing ``pretrained=True``:

@@ -136,6 +140,10 @@ These can be constructed by passing ``pretrained=True``:
     vit_b_32 = models.vit_b_32(pretrained=True)
     vit_l_16 = models.vit_l_16(pretrained=True)
     vit_l_32 = models.vit_l_32(pretrained=True)
+    convnext_tiny = models.convnext_tiny(pretrained=True)
+    convnext_small = models.convnext_small(pretrained=True)
+    convnext_base = models.convnext_base(pretrained=True)
+    convnext_large = models.convnext_large(pretrained=True)

 Instancing a pre-trained model will download its weights to a cache directory.
 This directory can be set using the `TORCH_HOME` environment variable. See

@@ -248,7 +256,10 @@ vit_b_16 81.072 95.318
 vit_b_32                         75.912        92.466
 vit_l_16                         79.662        94.638
 vit_l_32                         76.972        93.070
-convnext_tiny (prototype)        82.520        96.146
+convnext_tiny                    82.520        96.146
+convnext_small                   83.616        96.650
+convnext_base                    84.062        96.870
+convnext_large                   84.414        96.976
 ================================ ============= =============

@@ -464,6 +475,18 @@ VisionTransformer
     vit_l_16
     vit_l_32

+ConvNeXt
+--------
+
+.. autosummary::
+    :toctree: generated/
+    :template: function.rst
+
+    convnext_tiny
+    convnext_small
+    convnext_base
+    convnext_large
+
 Quantized Models
 ----------------
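The new builders follow the same calling convention as the existing ones. A minimal sketch of what the added docs promise (the random input and printed shape are illustrative, not part of the diff):

```python
import torch
from torchvision import models

# Construct one of the newly documented variants with ImageNet weights.
model = models.convnext_tiny(pretrained=True)
model.eval()

# A random tensor stands in for a normalized 224x224 image batch.
with torch.no_grad():
    logits = model(torch.randn(1, 3, 224, 224))

print(logits.shape)  # torch.Size([1, 1000]) -- ImageNet-1k logits
```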

hubconf.py

Lines changed: 1 addition & 0 deletions

@@ -2,6 +2,7 @@
 dependencies = ["torch"]

 from torchvision.models.alexnet import alexnet
+from torchvision.models.convnext import convnext_tiny, convnext_small, convnext_base, convnext_large
 from torchvision.models.densenet import densenet121, densenet169, densenet201, densenet161
 from torchvision.models.efficientnet import (
     efficientnet_b0,
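With the hubconf entry in place, the ConvNeXt builders also become reachable through `torch.hub`. A sketch, assuming network access and a branch that carries this commit:

```python
import torch

# torch.hub resolves "convnext_tiny" via the entry added to hubconf.py.
model = torch.hub.load("pytorch/vision:main", "convnext_tiny", pretrained=True)
model.eval()
```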

references/classification/README.md

Lines changed: 3 additions & 2 deletions

@@ -201,11 +201,12 @@ and `--batch_size 64`.
 ### ConvNeXt
 ```
 torchrun --nproc_per_node=8 train.py\
---model convnext_tiny --batch-size 128 --opt adamw --lr 1e-3 --lr-scheduler cosineannealinglr \
+--model $MODEL --batch-size 128 --opt adamw --lr 1e-3 --lr-scheduler cosineannealinglr \
 --lr-warmup-epochs 5 --lr-warmup-method linear --auto-augment ta_wide --epochs 600 --random-erase 0.1 \
 --label-smoothing 0.1 --mixup-alpha 0.2 --cutmix-alpha 1.0 --weight-decay 0.05 --norm-weight-decay 0.0 \
---train-crop-size 176 --model-ema --val-resize-size 236 --ra-sampler --ra-reps 4
+--train-crop-size 176 --model-ema --val-resize-size 232 --ra-sampler --ra-reps 4
 ```
+Here `$MODEL` is one of `convnext_tiny`, `convnext_small`, `convnext_base` and `convnext_large`. Note that each variant had its `--val-resize-size` optimized in a post-training step, see their `Weights` entry for their exact value.

 Note that the above command corresponds to training on a single node with 8 GPUs.
 For generatring the pre-trained weights, we trained with 2 nodes, each with 8 GPUs (for a total of 16 GPUs),

references/classification/train.py

Lines changed: 1 addition & 1 deletion

@@ -178,7 +178,7 @@ def load_data(traindir, valdir, args):

     print("Creating data loaders")
     if args.distributed:
-        if args.ra_sampler:
+        if hasattr(args, "ra_sampler") and args.ra_sampler:
             train_sampler = RASampler(dataset, shuffle=True, repetitions=args.ra_reps)
         else:
             train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
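The `hasattr` guard matters because `load_data` is shared with other reference scripts whose argument parsers never define `--ra-sampler`. A small sketch of the failure mode; the `Namespace` here is illustrative:

```python
from argparse import Namespace

# An args object built by a parser that has no --ra-sampler flag.
args = Namespace(distributed=True)

# Before the fix, args.ra_sampler raised AttributeError; now the check
# falls through to the plain DistributedSampler branch.
use_ra = hasattr(args, "ra_sampler") and args.ra_sampler
print(use_ra)  # False
```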

references/classification/train_quantization.py

Lines changed: 15 additions & 7 deletions

@@ -13,14 +13,16 @@


 try:
-    from torchvision.prototype import models as PM
+    from torchvision import prototype
 except ImportError:
-    PM = None
+    prototype = None


 def main(args):
-    if args.weights and PM is None:
+    if args.prototype and prototype is None:
         raise ImportError("The prototype module couldn't be found. Please install the latest torchvision nightly.")
+    if not args.prototype and args.weights:
+        raise ValueError("The weights parameter works only in prototype mode. Please pass the --prototype argument.")
     if args.output_dir:
         utils.mkdir(args.output_dir)

@@ -54,14 +56,14 @@ def main(args):

     print("Creating model", args.model)
     # when training quantized models, we always start from a pre-trained fp32 reference model
-    if not args.weights:
+    if not args.prototype:
         model = torchvision.models.quantization.__dict__[args.model](pretrained=True, quantize=args.test_only)
     else:
-        model = PM.quantization.__dict__[args.model](weights=args.weights, quantize=args.test_only)
+        model = prototype.models.quantization.__dict__[args.model](weights=args.weights, quantize=args.test_only)
     model.to(device)

     if not (args.test_only or args.post_training_quantize):
-        model.fuse_model()
+        model.fuse_model(is_qat=True)
         model.qconfig = torch.ao.quantization.get_default_qat_qconfig(args.backend)
         torch.ao.quantization.prepare_qat(model, inplace=True)

@@ -95,7 +97,7 @@ def main(args):
         ds, batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=True
     )
     model.eval()
-    model.fuse_model()
+    model.fuse_model(is_qat=False)
     model.qconfig = torch.ao.quantization.get_default_qconfig(args.backend)
     torch.ao.quantization.prepare(model, inplace=True)
     # Calibrate first

@@ -264,6 +266,12 @@ def get_args_parser(add_help=True):
     parser.add_argument("--clip-grad-norm", default=None, type=float, help="the maximum gradient norm (default None)")

     # Prototype models only
+    parser.add_argument(
+        "--prototype",
+        dest="prototype",
+        help="Use prototype model builders instead those from main area",
+        action="store_true",
+    )
     parser.add_argument("--weights", default=None, type=str, help="the weights enum name to load")

     return parser
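The explicit `is_qat` argument separates the two fusion modes this script uses: QAT fuses a train-mode model before `prepare_qat`, while post-training quantization fuses an eval-mode model before `prepare`. A condensed sketch of both paths, using the quantizable MobileNetV3 as a stand-in (the "fbgemm" backend string is illustrative):

```python
import torch
import torchvision

# QAT path: train mode, fuse with is_qat=True, then prepare_qat.
qat = torchvision.models.quantization.mobilenet_v3_large(pretrained=True, quantize=False)
qat.train()
qat.fuse_model(is_qat=True)
qat.qconfig = torch.ao.quantization.get_default_qat_qconfig("fbgemm")
torch.ao.quantization.prepare_qat(qat, inplace=True)

# PTQ path: eval mode, fuse with is_qat=False, then prepare and calibrate.
ptq = torchvision.models.quantization.mobilenet_v3_large(pretrained=True, quantize=False)
ptq.eval()
ptq.fuse_model(is_qat=False)
ptq.qconfig = torch.ao.quantization.get_default_qconfig("fbgemm")
torch.ao.quantization.prepare(ptq, inplace=True)
```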

references/classification/utils.py

Lines changed: 1 addition & 1 deletion

@@ -344,7 +344,7 @@ def store_model_weights(model, checkpoint_path, checkpoint_key="model", strict=T

         # Quantized Classification
         model = M.quantization.mobilenet_v3_large(pretrained=False, quantize=False)
-        model.fuse_model()
+        model.fuse_model(is_qat=True)
         model.qconfig = torch.ao.quantization.get_default_qat_qconfig('qnnpack')
         _ = torch.ao.quantization.prepare_qat(model, inplace=True)
         print(store_model_weights(model, './qat.pth'))

setup.py

Lines changed: 1 addition & 0 deletions

@@ -58,6 +58,7 @@ def write_version_file():
     pytorch_dep += "==" + os.getenv("PYTORCH_VERSION")

 requirements = [
+    "typing_extensions",
     "numpy",
     "requests",
     pytorch_dep,
4 binary files changed (contents not shown; one is 939 bytes).

test/test_models.py

Lines changed: 31 additions & 1 deletion

@@ -8,6 +8,7 @@
 import warnings
 from collections import OrderedDict
 from tempfile import TemporaryDirectory
+from typing import Any

 import pytest
 import torch

@@ -514,6 +515,35 @@ def test_generalizedrcnn_transform_repr():
     assert t.__repr__() == expected_string


+test_vit_conv_stem_configs = [
+    models.vision_transformer.ConvStemConfig(kernel_size=3, stride=2, out_channels=64),
+    models.vision_transformer.ConvStemConfig(kernel_size=3, stride=2, out_channels=128),
+    models.vision_transformer.ConvStemConfig(kernel_size=3, stride=1, out_channels=128),
+    models.vision_transformer.ConvStemConfig(kernel_size=3, stride=2, out_channels=256),
+    models.vision_transformer.ConvStemConfig(kernel_size=3, stride=1, out_channels=256),
+    models.vision_transformer.ConvStemConfig(kernel_size=3, stride=2, out_channels=512),
+]
+
+
+def vitc_b_16(**kwargs: Any):
+    return models.VisionTransformer(
+        image_size=224,
+        patch_size=16,
+        num_layers=12,
+        num_heads=12,
+        hidden_dim=768,
+        mlp_dim=3072,
+        conv_stem_configs=test_vit_conv_stem_configs,
+        **kwargs,
+    )
+
+
+@pytest.mark.parametrize("model_fn", [vitc_b_16])
+@pytest.mark.parametrize("dev", cpu_and_gpu())
+def test_vitc_models(model_fn, dev):
+    test_classification_model(model_fn, dev)
+
+
 @pytest.mark.parametrize("model_fn", get_models_from_module(models))
 @pytest.mark.parametrize("dev", cpu_and_gpu())
 def test_classification_model(model_fn, dev):

@@ -803,7 +833,7 @@ def test_quantized_classification_model(model_fn):
         model.train()
         model.qconfig = torch.ao.quantization.default_qat_qconfig

-    model.fuse_model()
+    model.fuse_model(is_qat=not eval_mode)
     if eval_mode:
         torch.ao.quantization.prepare(model, inplace=True)
     else:
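The new test builds a ViT-C-style model, i.e. a ViT with a convolutional stem. A standalone sketch of the same construction outside the test harness; the forward pass and printed shape are illustrative:

```python
import torch
from torchvision.models.vision_transformer import ConvStemConfig, VisionTransformer

# Six 3x3 conv stages replace the usual 16x16 patchify stem; their
# combined stride (2*2*1*2*1*2 = 16) matches patch_size.
stem = [
    ConvStemConfig(kernel_size=3, stride=2, out_channels=64),
    ConvStemConfig(kernel_size=3, stride=2, out_channels=128),
    ConvStemConfig(kernel_size=3, stride=1, out_channels=128),
    ConvStemConfig(kernel_size=3, stride=2, out_channels=256),
    ConvStemConfig(kernel_size=3, stride=1, out_channels=256),
    ConvStemConfig(kernel_size=3, stride=2, out_channels=512),
]
model = VisionTransformer(
    image_size=224, patch_size=16, num_layers=12, num_heads=12,
    hidden_dim=768, mlp_dim=3072, conv_stem_configs=stem,
)
print(model(torch.randn(1, 3, 224, 224)).shape)  # torch.Size([1, 1000])
```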

test/test_utils.py

Lines changed: 25 additions & 12 deletions

@@ -317,29 +317,42 @@ def test_draw_keypoints_errors():
         utils.draw_keypoints(image=img, keypoints=invalid_keypoints)


-def test_flow_to_image():
+@pytest.mark.parametrize("batch", (True, False))
+def test_flow_to_image(batch):
     h, w = 100, 100
     flow = torch.meshgrid(torch.arange(h), torch.arange(w), indexing="ij")
     flow = torch.stack(flow[::-1], dim=0).float()
     flow[0] -= h / 2
     flow[1] -= w / 2
+
+    if batch:
+        flow = torch.stack([flow, flow])
+
     img = utils.flow_to_image(flow)
+    assert img.shape == (2, 3, h, w) if batch else (3, h, w)
+
     path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "expected_flow.pt")
     expected_img = torch.load(path, map_location="cpu")
-    assert_equal(expected_img, img)

+    if batch:
+        expected_img = torch.stack([expected_img, expected_img])
+
+    assert_equal(expected_img, img)


-def test_flow_to_image_errors():
-    wrong_flow1 = torch.full((3, 10, 10), 0, dtype=torch.float)
-    wrong_flow2 = torch.full((2, 10), 0, dtype=torch.float)
-    wrong_flow3 = torch.full((2, 10, 30), 0, dtype=torch.int)

-    with pytest.raises(ValueError, match="Input flow should have shape"):
-        utils.flow_to_image(flow=wrong_flow1)
-    with pytest.raises(ValueError, match="Input flow should have shape"):
-        utils.flow_to_image(flow=wrong_flow2)
-    with pytest.raises(ValueError, match="Flow should be of dtype torch.float"):
-        utils.flow_to_image(flow=wrong_flow3)
+@pytest.mark.parametrize(
+    "input_flow, match",
+    (
+        (torch.full((3, 10, 10), 0, dtype=torch.float), "Input flow should have shape"),
+        (torch.full((5, 3, 10, 10), 0, dtype=torch.float), "Input flow should have shape"),
+        (torch.full((2, 10), 0, dtype=torch.float), "Input flow should have shape"),
+        (torch.full((5, 2, 10), 0, dtype=torch.float), "Input flow should have shape"),
+        (torch.full((2, 10, 30), 0, dtype=torch.int), "Flow should be of dtype torch.float"),
+    ),
+)
+def test_flow_to_image_errors(input_flow, match):
+    with pytest.raises(ValueError, match=match):
+        utils.flow_to_image(flow=input_flow)


 if __name__ == "__main__":
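The parametrized test reflects that `flow_to_image` now accepts batched input alongside a single flow field. A usage sketch (the random flows are illustrative; the shapes are the point):

```python
import torch
from torchvision.utils import flow_to_image

single = torch.randn(2, 64, 64)       # (2, H, W) flow field
print(flow_to_image(single).shape)    # torch.Size([3, 64, 64]), uint8

batched = torch.randn(4, 2, 64, 64)   # (N, 2, H, W) batch of flows
print(flow_to_image(batched).shape)   # torch.Size([4, 3, 64, 64])
```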

test/test_video_gpu_decoder.py

Lines changed: 42 additions & 30 deletions

@@ -12,30 +12,31 @@

 VIDEO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "videos")

-test_videos = [
-    "RATRACE_wave_f_nm_np1_fr_goo_37.avi",
-    "TrumanShow_wave_f_nm_np1_fr_med_26.avi",
-    "v_SoccerJuggling_g23_c01.avi",
-    "v_SoccerJuggling_g24_c01.avi",
-    "R6llTwEh07w.mp4",
-    "SOX5yA1l24A.mp4",
-    "WUzgd7C1pWA.mp4",
-]
-

 @pytest.mark.skipif(_HAS_VIDEO_DECODER is False, reason="Didn't compile with support for gpu decoder")
 class TestVideoGPUDecoder:
     @pytest.mark.skipif(av is None, reason="PyAV unavailable")
-    def test_frame_reading(self):
-        for test_video in test_videos:
-            full_path = os.path.join(VIDEO_DIR, test_video)
-            decoder = VideoReader(full_path, device="cuda:0")
-            with av.open(full_path) as container:
-                for av_frame in container.decode(container.streams.video[0]):
-                    av_frames = torch.tensor(av_frame.to_rgb(src_colorspace="ITU709").to_ndarray())
-                    vision_frames = next(decoder)["data"]
-                    mean_delta = torch.mean(torch.abs(av_frames.float() - vision_frames.cpu().float()))
-                    assert mean_delta < 0.75
+    @pytest.mark.parametrize(
+        "video_file",
+        [
+            "RATRACE_wave_f_nm_np1_fr_goo_37.avi",
+            "TrumanShow_wave_f_nm_np1_fr_med_26.avi",
+            "v_SoccerJuggling_g23_c01.avi",
+            "v_SoccerJuggling_g24_c01.avi",
+            "R6llTwEh07w.mp4",
+            "SOX5yA1l24A.mp4",
+            "WUzgd7C1pWA.mp4",
+        ],
+    )
+    def test_frame_reading(self, video_file):
+        full_path = os.path.join(VIDEO_DIR, video_file)
+        decoder = VideoReader(full_path, device="cuda:0")
+        with av.open(full_path) as container:
+            for av_frame in container.decode(container.streams.video[0]):
+                av_frames = torch.tensor(av_frame.to_rgb(src_colorspace="ITU709").to_ndarray())
+                vision_frames = next(decoder)["data"]
+                mean_delta = torch.mean(torch.abs(av_frames.float() - vision_frames.cpu().float()))
+                assert mean_delta < 0.75

     @pytest.mark.skipif(av is None, reason="PyAV unavailable")
     @pytest.mark.parametrize("keyframes", [True, False])

@@ -65,16 +66,27 @@ def test_seek_reading(self, keyframes, full_path, duration):
             assert mean_delta < 0.75

     @pytest.mark.skipif(av is None, reason="PyAV unavailable")
-    def test_metadata(self):
-        for test_video in test_videos:
-            full_path = os.path.join(VIDEO_DIR, test_video)
-            decoder = VideoReader(full_path, device="cuda:0")
-            video_metadata = decoder.get_metadata()["video"]
-            with av.open(full_path) as container:
-                video = container.streams.video[0]
-                av_duration = float(video.duration * video.time_base)
-                assert math.isclose(video_metadata["duration"], av_duration, rel_tol=1e-2)
-                assert math.isclose(video_metadata["fps"], video.base_rate, rel_tol=1e-2)
+    @pytest.mark.parametrize(
+        "video_file",
+        [
+            "RATRACE_wave_f_nm_np1_fr_goo_37.avi",
+            "TrumanShow_wave_f_nm_np1_fr_med_26.avi",
+            "v_SoccerJuggling_g23_c01.avi",
+            "v_SoccerJuggling_g24_c01.avi",
+            "R6llTwEh07w.mp4",
+            "SOX5yA1l24A.mp4",
+            "WUzgd7C1pWA.mp4",
+        ],
+    )
+    def test_metadata(self, video_file):
+        full_path = os.path.join(VIDEO_DIR, video_file)
+        decoder = VideoReader(full_path, device="cuda:0")
+        video_metadata = decoder.get_metadata()["video"]
+        with av.open(full_path) as container:
+            video = container.streams.video[0]
+            av_duration = float(video.duration * video.time_base)
+            assert math.isclose(video_metadata["duration"], av_duration, rel_tol=1e-2)
+            assert math.isclose(video_metadata["fps"], video.base_rate, rel_tol=1e-2)


 if __name__ == "__main__":
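Outside the test harness, the decoding path these tests exercise looks roughly like this; it assumes a torchvision build compiled with the CUDA video decoder, and `video.mp4` is a placeholder path:

```python
from torchvision.io import VideoReader

reader = VideoReader("video.mp4", device="cuda:0")  # GPU-backed decoder
print(reader.get_metadata()["video"])               # duration, fps, ...

frame = next(reader)["data"]  # first decoded frame, resident on the GPU
print(frame.device, frame.shape)
```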

torchvision/csrc/io/decoder/gpu/decoder.h

Lines changed: 0 additions & 10 deletions

@@ -36,16 +36,6 @@ class Decoder {
   void release();
   void decode(const uint8_t*, unsigned long);
   torch::Tensor fetch_frame();
-  int get_frame_size() const {
-    return get_width() * (luma_height + (chroma_height * num_chroma_planes)) *
-        bytes_per_pixel;
-  }
-  int get_width() const {
-    return (video_output_format == cudaVideoSurfaceFormat_NV12 ||
-            video_output_format == cudaVideoSurfaceFormat_P016)
-        ? (width + 1) & ~1
-        : width;
-  }
   int get_height() const {
     return luma_height;
   }
