
Commit 6412fb0

Merge branch 'main' into fix-shear-autoaugment

2 parents 3d7b5d5 + 8a16e12

35 files changed: +724 -338 lines

README.rst

Lines changed: 2 additions & 0 deletions

@@ -23,6 +23,8 @@ supported Python versions.
 +==========================+==========================+=================================+
 | ``main`` / ``nightly``   | ``main`` / ``nightly``   | ``>=3.7``, ``<=3.9``            |
 +--------------------------+--------------------------+---------------------------------+
+| ``1.10.2``               | ``0.11.3``               | ``>=3.6``, ``<=3.9``            |
++--------------------------+--------------------------+---------------------------------+
 | ``1.10.1``               | ``0.11.2``               | ``>=3.6``, ``<=3.9``            |
 +--------------------------+--------------------------+---------------------------------+
 | ``1.10.0``               | ``0.11.1``               | ``>=3.6``, ``<=3.9``            |

docs/source/models.rst

Lines changed: 24 additions & 1 deletion

@@ -89,6 +89,10 @@ You can construct a model with random weights by calling its constructor:
     vit_b_32 = models.vit_b_32()
     vit_l_16 = models.vit_l_16()
     vit_l_32 = models.vit_l_32()
+    convnext_tiny = models.convnext_tiny()
+    convnext_small = models.convnext_small()
+    convnext_base = models.convnext_base()
+    convnext_large = models.convnext_large()

 We provide pre-trained models, using the PyTorch :mod:`torch.utils.model_zoo`.
 These can be constructed by passing ``pretrained=True``:

@@ -136,6 +140,10 @@ These can be constructed by passing ``pretrained=True``:
     vit_b_32 = models.vit_b_32(pretrained=True)
     vit_l_16 = models.vit_l_16(pretrained=True)
     vit_l_32 = models.vit_l_32(pretrained=True)
+    convnext_tiny = models.convnext_tiny(pretrained=True)
+    convnext_small = models.convnext_small(pretrained=True)
+    convnext_base = models.convnext_base(pretrained=True)
+    convnext_large = models.convnext_large(pretrained=True)

 Instancing a pre-trained model will download its weights to a cache directory.
 This directory can be set using the `TORCH_HOME` environment variable. See

@@ -248,7 +256,10 @@ vit_b_16 81.072 95.318
 vit_b_32                         75.912        92.466
 vit_l_16                         79.662        94.638
 vit_l_32                         76.972        93.070
-convnext_tiny (prototype)        82.520        96.146
+convnext_tiny                    82.520        96.146
+convnext_small                   83.616        96.650
+convnext_base                    84.062        96.870
+convnext_large                   84.414        96.976
 ================================ ============= =============

@@ -464,6 +475,18 @@ VisionTransformer
     vit_l_16
     vit_l_32

+ConvNeXt
+--------
+
+.. autosummary::
+    :toctree: generated/
+    :template: function.rst
+
+    convnext_tiny
+    convnext_small
+    convnext_base
+    convnext_large
+
 Quantized Models
 ----------------
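The new builders follow the same calling convention as the existing ones. A minimal sketch of what the added docs promise (the random input and printed shape are illustrative, not part of the diff):

```python
import torch
from torchvision import models

# Construct one of the newly documented variants with ImageNet weights.
model = models.convnext_tiny(pretrained=True)
model.eval()

# A random tensor stands in for a normalized 224x224 image batch.
with torch.no_grad():
    logits = model(torch.randn(1, 3, 224, 224))

print(logits.shape)  # torch.Size([1, 1000]) -- ImageNet-1k logits
```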

hubconf.py

Lines changed: 1 addition & 0 deletions

@@ -2,6 +2,7 @@
 dependencies = ["torch"]

 from torchvision.models.alexnet import alexnet
+from torchvision.models.convnext import convnext_tiny, convnext_small, convnext_base, convnext_large
 from torchvision.models.densenet import densenet121, densenet169, densenet201, densenet161
 from torchvision.models.efficientnet import (
     efficientnet_b0,
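With the hubconf entry in place, the ConvNeXt builders also become reachable through `torch.hub`. A sketch, assuming network access and a branch that carries this commit:

```python
import torch

# torch.hub resolves "convnext_tiny" via the entry added to hubconf.py.
model = torch.hub.load("pytorch/vision:main", "convnext_tiny", pretrained=True)
model.eval()
```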

references/classification/README.md

Lines changed: 3 additions & 2 deletions

@@ -201,11 +201,12 @@ and `--batch_size 64`.
 ### ConvNeXt
 ```
 torchrun --nproc_per_node=8 train.py\
---model convnext_tiny --batch-size 128 --opt adamw --lr 1e-3 --lr-scheduler cosineannealinglr \
+--model $MODEL --batch-size 128 --opt adamw --lr 1e-3 --lr-scheduler cosineannealinglr \
 --lr-warmup-epochs 5 --lr-warmup-method linear --auto-augment ta_wide --epochs 600 --random-erase 0.1 \
 --label-smoothing 0.1 --mixup-alpha 0.2 --cutmix-alpha 1.0 --weight-decay 0.05 --norm-weight-decay 0.0 \
---train-crop-size 176 --model-ema --val-resize-size 236 --ra-sampler --ra-reps 4
+--train-crop-size 176 --model-ema --val-resize-size 232 --ra-sampler --ra-reps 4
 ```
+Here `$MODEL` is one of `convnext_tiny`, `convnext_small`, `convnext_base` and `convnext_large`. Note that each variant had its `--val-resize-size` optimized in a post-training step, see their `Weights` entry for their exact value.

 Note that the above command corresponds to training on a single node with 8 GPUs.
 For generatring the pre-trained weights, we trained with 2 nodes, each with 8 GPUs (for a total of 16 GPUs),

references/classification/train.py

Lines changed: 1 addition & 1 deletion

@@ -178,7 +178,7 @@ def load_data(traindir, valdir, args):

     print("Creating data loaders")
     if args.distributed:
-        if args.ra_sampler:
+        if hasattr(args, "ra_sampler") and args.ra_sampler:
             train_sampler = RASampler(dataset, shuffle=True, repetitions=args.ra_reps)
         else:
             train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
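The `hasattr` guard matters because `load_data` is shared with other reference scripts whose argument parsers never define `--ra-sampler`. A small sketch of the failure mode; the `Namespace` here is illustrative:

```python
from argparse import Namespace

# An args object built by a parser that has no --ra-sampler flag.
args = Namespace(distributed=True)

# Before the fix, args.ra_sampler raised AttributeError; now the check
# falls through to the plain DistributedSampler branch.
use_ra = hasattr(args, "ra_sampler") and args.ra_sampler
print(use_ra)  # False
```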

references/classification/train_quantization.py

Lines changed: 15 additions & 7 deletions

@@ -13,14 +13,16 @@


 try:
-    from torchvision.prototype import models as PM
+    from torchvision import prototype
 except ImportError:
-    PM = None
+    prototype = None


 def main(args):
-    if args.weights and PM is None:
+    if args.prototype and prototype is None:
         raise ImportError("The prototype module couldn't be found. Please install the latest torchvision nightly.")
+    if not args.prototype and args.weights:
+        raise ValueError("The weights parameter works only in prototype mode. Please pass the --prototype argument.")
     if args.output_dir:
         utils.mkdir(args.output_dir)

@@ -54,14 +56,14 @@ def main(args):

     print("Creating model", args.model)
     # when training quantized models, we always start from a pre-trained fp32 reference model
-    if not args.weights:
+    if not args.prototype:
         model = torchvision.models.quantization.__dict__[args.model](pretrained=True, quantize=args.test_only)
     else:
-        model = PM.quantization.__dict__[args.model](weights=args.weights, quantize=args.test_only)
+        model = prototype.models.quantization.__dict__[args.model](weights=args.weights, quantize=args.test_only)
     model.to(device)

     if not (args.test_only or args.post_training_quantize):
-        model.fuse_model()
+        model.fuse_model(is_qat=True)
         model.qconfig = torch.ao.quantization.get_default_qat_qconfig(args.backend)
         torch.ao.quantization.prepare_qat(model, inplace=True)

@@ -95,7 +97,7 @@ def main(args):
         ds, batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=True
     )
     model.eval()
-    model.fuse_model()
+    model.fuse_model(is_qat=False)
     model.qconfig = torch.ao.quantization.get_default_qconfig(args.backend)
     torch.ao.quantization.prepare(model, inplace=True)
     # Calibrate first

@@ -264,6 +266,12 @@ def get_args_parser(add_help=True):
     parser.add_argument("--clip-grad-norm", default=None, type=float, help="the maximum gradient norm (default None)")

     # Prototype models only
+    parser.add_argument(
+        "--prototype",
+        dest="prototype",
+        help="Use prototype model builders instead those from main area",
+        action="store_true",
+    )
     parser.add_argument("--weights", default=None, type=str, help="the weights enum name to load")

     return parser
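The explicit `is_qat` argument separates the two fusion modes this script uses: QAT fuses a train-mode model before `prepare_qat`, while post-training quantization fuses an eval-mode model before `prepare`. A condensed sketch of both paths, using the quantizable MobileNetV3 as a stand-in (the "fbgemm" backend string is illustrative):

```python
import torch
import torchvision

# QAT path: train mode, fuse with is_qat=True, then prepare_qat.
qat = torchvision.models.quantization.mobilenet_v3_large(pretrained=True, quantize=False)
qat.train()
qat.fuse_model(is_qat=True)
qat.qconfig = torch.ao.quantization.get_default_qat_qconfig("fbgemm")
torch.ao.quantization.prepare_qat(qat, inplace=True)

# PTQ path: eval mode, fuse with is_qat=False, then prepare and calibrate.
ptq = torchvision.models.quantization.mobilenet_v3_large(pretrained=True, quantize=False)
ptq.eval()
ptq.fuse_model(is_qat=False)
ptq.qconfig = torch.ao.quantization.get_default_qconfig("fbgemm")
torch.ao.quantization.prepare(ptq, inplace=True)
```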

references/classification/utils.py

Lines changed: 1 addition & 1 deletion

@@ -344,7 +344,7 @@ def store_model_weights(model, checkpoint_path, checkpoint_key="model", strict=T

         # Quantized Classification
         model = M.quantization.mobilenet_v3_large(pretrained=False, quantize=False)
-        model.fuse_model()
+        model.fuse_model(is_qat=True)
         model.qconfig = torch.ao.quantization.get_default_qat_qconfig('qnnpack')
         _ = torch.ao.quantization.prepare_qat(model, inplace=True)
         print(store_model_weights(model, './qat.pth'))

setup.py

Lines changed: 1 addition & 0 deletions

@@ -58,6 +58,7 @@ def write_version_file():
     pytorch_dep += "==" + os.getenv("PYTORCH_VERSION")

 requirements = [
+    "typing_extensions",
     "numpy",
     "requests",
     pytorch_dep,
4 binary files changed (contents not shown; one is 939 bytes).

test/test_models.py

Lines changed: 31 additions & 1 deletion

@@ -8,6 +8,7 @@
 import warnings
 from collections import OrderedDict
 from tempfile import TemporaryDirectory
+from typing import Any

 import pytest
 import torch

@@ -514,6 +515,35 @@ def test_generalizedrcnn_transform_repr():
     assert t.__repr__() == expected_string


+test_vit_conv_stem_configs = [
+    models.vision_transformer.ConvStemConfig(kernel_size=3, stride=2, out_channels=64),
+    models.vision_transformer.ConvStemConfig(kernel_size=3, stride=2, out_channels=128),
+    models.vision_transformer.ConvStemConfig(kernel_size=3, stride=1, out_channels=128),
+    models.vision_transformer.ConvStemConfig(kernel_size=3, stride=2, out_channels=256),
+    models.vision_transformer.ConvStemConfig(kernel_size=3, stride=1, out_channels=256),
+    models.vision_transformer.ConvStemConfig(kernel_size=3, stride=2, out_channels=512),
+]
+
+
+def vitc_b_16(**kwargs: Any):
+    return models.VisionTransformer(
+        image_size=224,
+        patch_size=16,
+        num_layers=12,
+        num_heads=12,
+        hidden_dim=768,
+        mlp_dim=3072,
+        conv_stem_configs=test_vit_conv_stem_configs,
+        **kwargs,
+    )
+
+
+@pytest.mark.parametrize("model_fn", [vitc_b_16])
+@pytest.mark.parametrize("dev", cpu_and_gpu())
+def test_vitc_models(model_fn, dev):
+    test_classification_model(model_fn, dev)
+
+
 @pytest.mark.parametrize("model_fn", get_models_from_module(models))
 @pytest.mark.parametrize("dev", cpu_and_gpu())
 def test_classification_model(model_fn, dev):

@@ -803,7 +833,7 @@ def test_quantized_classification_model(model_fn):
         model.train()
         model.qconfig = torch.ao.quantization.default_qat_qconfig

-    model.fuse_model()
+    model.fuse_model(is_qat=not eval_mode)
     if eval_mode:
         torch.ao.quantization.prepare(model, inplace=True)
     else:
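The new test builds a ViT-C-style model, i.e. a ViT with a convolutional stem. A standalone sketch of the same construction outside the test harness; the forward pass and printed shape are illustrative:

```python
import torch
from torchvision.models.vision_transformer import ConvStemConfig, VisionTransformer

# Six 3x3 conv stages replace the usual 16x16 patchify stem; their
# combined stride (2*2*1*2*1*2 = 16) matches patch_size.
stem = [
    ConvStemConfig(kernel_size=3, stride=2, out_channels=64),
    ConvStemConfig(kernel_size=3, stride=2, out_channels=128),
    ConvStemConfig(kernel_size=3, stride=1, out_channels=128),
    ConvStemConfig(kernel_size=3, stride=2, out_channels=256),
    ConvStemConfig(kernel_size=3, stride=1, out_channels=256),
    ConvStemConfig(kernel_size=3, stride=2, out_channels=512),
]
model = VisionTransformer(
    image_size=224, patch_size=16, num_layers=12, num_heads=12,
    hidden_dim=768, mlp_dim=3072, conv_stem_configs=stem,
)
print(model(torch.randn(1, 3, 224, 224)).shape)  # torch.Size([1, 1000])
```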

test/test_utils.py

Lines changed: 25 additions & 12 deletions

@@ -317,29 +317,42 @@ def test_draw_keypoints_errors():
         utils.draw_keypoints(image=img, keypoints=invalid_keypoints)


-def test_flow_to_image():
+@pytest.mark.parametrize("batch", (True, False))
+def test_flow_to_image(batch):
     h, w = 100, 100
     flow = torch.meshgrid(torch.arange(h), torch.arange(w), indexing="ij")
     flow = torch.stack(flow[::-1], dim=0).float()
     flow[0] -= h / 2
     flow[1] -= w / 2
+
+    if batch:
+        flow = torch.stack([flow, flow])
+
     img = utils.flow_to_image(flow)
+    assert img.shape == (2, 3, h, w) if batch else (3, h, w)
+
     path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "expected_flow.pt")
     expected_img = torch.load(path, map_location="cpu")
-    assert_equal(expected_img, img)

+    if batch:
+        expected_img = torch.stack([expected_img, expected_img])
+
+    assert_equal(expected_img, img)


-def test_flow_to_image_errors():
-    wrong_flow1 = torch.full((3, 10, 10), 0, dtype=torch.float)
-    wrong_flow2 = torch.full((2, 10), 0, dtype=torch.float)
-    wrong_flow3 = torch.full((2, 10, 30), 0, dtype=torch.int)

-    with pytest.raises(ValueError, match="Input flow should have shape"):
-        utils.flow_to_image(flow=wrong_flow1)
-    with pytest.raises(ValueError, match="Input flow should have shape"):
-        utils.flow_to_image(flow=wrong_flow2)
-    with pytest.raises(ValueError, match="Flow should be of dtype torch.float"):
-        utils.flow_to_image(flow=wrong_flow3)
+@pytest.mark.parametrize(
+    "input_flow, match",
+    (
+        (torch.full((3, 10, 10), 0, dtype=torch.float), "Input flow should have shape"),
+        (torch.full((5, 3, 10, 10), 0, dtype=torch.float), "Input flow should have shape"),
+        (torch.full((2, 10), 0, dtype=torch.float), "Input flow should have shape"),
+        (torch.full((5, 2, 10), 0, dtype=torch.float), "Input flow should have shape"),
+        (torch.full((2, 10, 30), 0, dtype=torch.int), "Flow should be of dtype torch.float"),
+    ),
+)
+def test_flow_to_image_errors(input_flow, match):
+    with pytest.raises(ValueError, match=match):
+        utils.flow_to_image(flow=input_flow)


 if __name__ == "__main__":
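The parametrized test reflects that `flow_to_image` now accepts batched input alongside a single flow field. A usage sketch (the random flows are illustrative; the shapes are the point):

```python
import torch
from torchvision.utils import flow_to_image

single = torch.randn(2, 64, 64)       # (2, H, W) flow field
print(flow_to_image(single).shape)    # torch.Size([3, 64, 64]), uint8

batched = torch.randn(4, 2, 64, 64)   # (N, 2, H, W) batch of flows
print(flow_to_image(batched).shape)   # torch.Size([4, 3, 64, 64])
```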

test/test_video_gpu_decoder.py

Lines changed: 42 additions & 30 deletions

@@ -12,30 +12,31 @@

 VIDEO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "videos")

-test_videos = [
-    "RATRACE_wave_f_nm_np1_fr_goo_37.avi",
-    "TrumanShow_wave_f_nm_np1_fr_med_26.avi",
-    "v_SoccerJuggling_g23_c01.avi",
-    "v_SoccerJuggling_g24_c01.avi",
-    "R6llTwEh07w.mp4",
-    "SOX5yA1l24A.mp4",
-    "WUzgd7C1pWA.mp4",
-]
-

 @pytest.mark.skipif(_HAS_VIDEO_DECODER is False, reason="Didn't compile with support for gpu decoder")
 class TestVideoGPUDecoder:
     @pytest.mark.skipif(av is None, reason="PyAV unavailable")
-    def test_frame_reading(self):
-        for test_video in test_videos:
-            full_path = os.path.join(VIDEO_DIR, test_video)
-            decoder = VideoReader(full_path, device="cuda:0")
-            with av.open(full_path) as container:
-                for av_frame in container.decode(container.streams.video[0]):
-                    av_frames = torch.tensor(av_frame.to_rgb(src_colorspace="ITU709").to_ndarray())
-                    vision_frames = next(decoder)["data"]
-                    mean_delta = torch.mean(torch.abs(av_frames.float() - vision_frames.cpu().float()))
-                    assert mean_delta < 0.75
+    @pytest.mark.parametrize(
+        "video_file",
+        [
+            "RATRACE_wave_f_nm_np1_fr_goo_37.avi",
+            "TrumanShow_wave_f_nm_np1_fr_med_26.avi",
+            "v_SoccerJuggling_g23_c01.avi",
+            "v_SoccerJuggling_g24_c01.avi",
+            "R6llTwEh07w.mp4",
+            "SOX5yA1l24A.mp4",
+            "WUzgd7C1pWA.mp4",
+        ],
+    )
+    def test_frame_reading(self, video_file):
+        full_path = os.path.join(VIDEO_DIR, video_file)
+        decoder = VideoReader(full_path, device="cuda:0")
+        with av.open(full_path) as container:
+            for av_frame in container.decode(container.streams.video[0]):
+                av_frames = torch.tensor(av_frame.to_rgb(src_colorspace="ITU709").to_ndarray())
+                vision_frames = next(decoder)["data"]
+                mean_delta = torch.mean(torch.abs(av_frames.float() - vision_frames.cpu().float()))
+                assert mean_delta < 0.75

     @pytest.mark.skipif(av is None, reason="PyAV unavailable")
     @pytest.mark.parametrize("keyframes", [True, False])

@@ -65,16 +66,27 @@ def test_seek_reading(self, keyframes, full_path, duration):
             assert mean_delta < 0.75

     @pytest.mark.skipif(av is None, reason="PyAV unavailable")
-    def test_metadata(self):
-        for test_video in test_videos:
-            full_path = os.path.join(VIDEO_DIR, test_video)
-            decoder = VideoReader(full_path, device="cuda:0")
-            video_metadata = decoder.get_metadata()["video"]
-            with av.open(full_path) as container:
-                video = container.streams.video[0]
-                av_duration = float(video.duration * video.time_base)
-                assert math.isclose(video_metadata["duration"], av_duration, rel_tol=1e-2)
-                assert math.isclose(video_metadata["fps"], video.base_rate, rel_tol=1e-2)
+    @pytest.mark.parametrize(
+        "video_file",
+        [
+            "RATRACE_wave_f_nm_np1_fr_goo_37.avi",
+            "TrumanShow_wave_f_nm_np1_fr_med_26.avi",
+            "v_SoccerJuggling_g23_c01.avi",
+            "v_SoccerJuggling_g24_c01.avi",
+            "R6llTwEh07w.mp4",
+            "SOX5yA1l24A.mp4",
+            "WUzgd7C1pWA.mp4",
+        ],
+    )
+    def test_metadata(self, video_file):
+        full_path = os.path.join(VIDEO_DIR, video_file)
+        decoder = VideoReader(full_path, device="cuda:0")
+        video_metadata = decoder.get_metadata()["video"]
+        with av.open(full_path) as container:
+            video = container.streams.video[0]
+            av_duration = float(video.duration * video.time_base)
+            assert math.isclose(video_metadata["duration"], av_duration, rel_tol=1e-2)
+            assert math.isclose(video_metadata["fps"], video.base_rate, rel_tol=1e-2)


 if __name__ == "__main__":
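Outside the test harness, the decoding path these tests exercise looks roughly like this; it assumes a torchvision build compiled with the CUDA video decoder, and `video.mp4` is a placeholder path:

```python
from torchvision.io import VideoReader

reader = VideoReader("video.mp4", device="cuda:0")  # GPU-backed decoder
print(reader.get_metadata()["video"])               # duration, fps, ...

frame = next(reader)["data"]  # first decoded frame, resident on the GPU
print(frame.device, frame.shape)
```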

torchvision/csrc/io/decoder/gpu/decoder.h

Lines changed: 0 additions & 10 deletions

@@ -36,16 +36,6 @@ class Decoder {
   void release();
   void decode(const uint8_t*, unsigned long);
   torch::Tensor fetch_frame();
-  int get_frame_size() const {
-    return get_width() * (luma_height + (chroma_height * num_chroma_planes)) *
-        bytes_per_pixel;
-  }
-  int get_width() const {
-    return (video_output_format == cudaVideoSurfaceFormat_NV12 ||
-            video_output_format == cudaVideoSurfaceFormat_P016)
-        ? (width + 1) & ~1
-        : width;
-  }
   int get_height() const {
     return luma_height;
   }
