
Commit 0a15c08

Merge branch 'main' of github.com:pytorch/vision into proto-bbox-center-crop
2 parents: 28c380d + f079f5a

16 files changed (+437, -30 lines)

docs/source/models.rst

Lines changed: 8 additions & 0 deletions
@@ -61,6 +61,8 @@ You can construct a model with random weights by calling its constructor:
     mobilenet_v3_large = models.mobilenet_v3_large()
     mobilenet_v3_small = models.mobilenet_v3_small()
     resnext50_32x4d = models.resnext50_32x4d()
+    resnext101_32x8d = models.resnext101_32x8d()
+    resnext101_64x4d = models.resnext101_64x4d()
     wide_resnet50_2 = models.wide_resnet50_2()
     mnasnet = models.mnasnet1_0()
     efficientnet_b0 = models.efficientnet_b0()
@@ -185,6 +187,7 @@ MobileNet V3 Large 74.042 91.340
 MobileNet V3 Small 67.668 87.402
 ResNeXt-50-32x4d 77.618 93.698
 ResNeXt-101-32x8d 79.312 94.526
+ResNeXt-101-64x4d 83.246 96.454
 Wide ResNet-50-2 78.468 94.086
 Wide ResNet-101-2 78.848 94.284
 MNASNet 1.0 73.456 91.510
@@ -366,6 +369,7 @@ ResNext
 
     resnext50_32x4d
     resnext101_32x8d
+    resnext101_64x4d
 
 Wide ResNet
 -----------
@@ -481,8 +485,11 @@ a model with random weights by calling its constructor:
     resnet18 = models.quantization.resnet18()
     resnet50 = models.quantization.resnet50()
     resnext101_32x8d = models.quantization.resnext101_32x8d()
+    resnext101_64x4d = models.quantization.resnext101_64x4d()
     shufflenet_v2_x0_5 = models.quantization.shufflenet_v2_x0_5()
     shufflenet_v2_x1_0 = models.quantization.shufflenet_v2_x1_0()
+    shufflenet_v2_x1_5 = models.quantization.shufflenet_v2_x1_5()
+    shufflenet_v2_x2_0 = models.quantization.shufflenet_v2_x2_0()
 
 Obtaining a pre-trained quantized model can be done with a few lines of code:
 
@@ -508,6 +515,7 @@ ShuffleNet V2 x2.0 75.354 92.488
 ResNet 18 69.494 88.882
 ResNet 50 75.920 92.814
 ResNext 101 32x8d 78.986 94.480
+ResNext 101 64x4d 82.898 96.326
 Inception V3 77.176 93.354
 GoogleNet 69.826 89.404
 ================================ ============= =============
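The new documentation entries above are ordinary model builders. A minimal sketch of constructing the newly documented ResNeXt-101 64x4d variant and, following the pattern the quantization section describes ("Obtaining a pre-trained quantized model can be done with a few lines of code"), its pre-trained quantized counterpart. The pretrained and quantize keyword names are assumed from that section and may differ in other torchvision versions:

    import torchvision.models as models

    # New classification builder documented above (random weights).
    resnext101_64x4d = models.resnext101_64x4d()

    # Pre-trained quantized counterpart, matching the accuracy table entry above.
    # Keyword names assumed from the surrounding quantization docs.
    q_model = models.quantization.resnext101_64x4d(pretrained=True, quantize=True)
    q_model.eval()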
2 binary files changed (contents not shown).

test/test_models.py

Lines changed: 1 addition & 0 deletions
@@ -315,6 +315,7 @@ def _check_input_backprop(model, inputs):
     "convnext_base",
     "convnext_large",
     "resnext101_32x8d",
+    "resnext101_64x4d",
     "wide_resnet101_2",
     "efficientnet_b6",
     "efficientnet_b7",

test/test_onnx.py

Lines changed: 2 additions & 1 deletion
@@ -412,12 +412,13 @@ def forward(self_module, images, features):
     def get_image(self, rel_path: str, size: Tuple[int, int]) -> torch.Tensor:
         import os
 
+        import torchvision.transforms._pil_constants as _pil_constants
         from PIL import Image
         from torchvision.transforms import functional as F
 
         data_dir = os.path.join(os.path.dirname(__file__), "assets")
         path = os.path.join(data_dir, *rel_path.split("/"))
-        image = Image.open(path).convert("RGB").resize(size, Image.BILINEAR)
+        image = Image.open(path).convert("RGB").resize(size, _pil_constants.BILINEAR)
 
         return F.convert_image_dtype(F.pil_to_tensor(image))
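This change, like the ones in test_transforms.py and test_transforms_tensor.py below, swaps the module-level Pillow constants (Image.BILINEAR, Image.LINEAR, Image.AFFINE), which newer Pillow releases deprecate, for values re-exported from the private torchvision.transforms._pil_constants module. That module's contents are not shown in this diff; a sketch of what such a compatibility shim might look like, assuming Pillow >= 9.1 exposes the Image.Resampling and Image.Transform enums, is:

    # Hypothetical sketch of a Pillow-version compatibility shim; the real
    # torchvision.transforms._pil_constants module may differ in detail.
    from PIL import Image

    if hasattr(Image, "Resampling"):  # Pillow >= 9.1: enums replace module constants
        NEAREST = Image.Resampling.NEAREST
        BILINEAR = Image.Resampling.BILINEAR
        BICUBIC = Image.Resampling.BICUBIC
        LINEAR = Image.Resampling.BILINEAR  # old Image.LINEAR alias
        AFFINE = Image.Transform.AFFINE
    else:  # older Pillow
        NEAREST = Image.NEAREST
        BILINEAR = Image.BILINEAR
        BICUBIC = Image.BICUBIC
        LINEAR = Image.LINEAR
        AFFINE = Image.AFFINE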

test/test_prototype_transforms_functional.py

Lines changed: 192 additions & 5 deletions
@@ -11,8 +11,10 @@
 from torch.nn.functional import one_hot
 from torchvision.prototype import features
 from torchvision.prototype.transforms.functional._meta import convert_bounding_box_format
+from torchvision.transforms.functional import _get_perspective_coeffs
 from torchvision.transforms.functional_tensor import _max_value as get_max_value
 
+
 make_tensor = functools.partial(torch.testing.make_tensor, device="cpu")
 
 

@@ -380,6 +382,37 @@ def pad_segmentation_mask():
         yield SampleInput(mask, padding=padding, padding_mode=padding_mode)
 
 
+@register_kernel_info_from_sample_inputs_fn
+def perspective_bounding_box():
+    for bounding_box, perspective_coeffs in itertools.product(
+        make_bounding_boxes(),
+        [
+            [1.2405, 0.1772, -6.9113, 0.0463, 1.251, -5.235, 0.00013, 0.0018],
+            [0.7366, -0.11724, 1.45775, -0.15012, 0.73406, 2.6019, -0.0072, -0.0063],
+        ],
+    ):
+        yield SampleInput(
+            bounding_box,
+            format=bounding_box.format,
+            perspective_coeffs=perspective_coeffs,
+        )
+
+
+@register_kernel_info_from_sample_inputs_fn
+def perspective_segmentation_mask():
+    for mask, perspective_coeffs in itertools.product(
+        make_segmentation_masks(extra_dims=((), (4,))),
+        [
+            [1.2405, 0.1772, -6.9113, 0.0463, 1.251, -5.235, 0.00013, 0.0018],
+            [0.7366, -0.11724, 1.45775, -0.15012, 0.73406, 2.6019, -0.0072, -0.0063],
+        ],
+    ):
+        yield SampleInput(
+            mask,
+            perspective_coeffs=perspective_coeffs,
+        )
+
+
 @register_kernel_info_from_sample_inputs_fn
 def center_crop_bounding_box():
     for bounding_box, output_size in itertools.product(make_bounding_boxes(), [(24, 12), [16, 18], [46, 48], [12]]):
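The eight hard-coded values in the sample inputs above are perspective coefficients in the (a, b, c, d, e, f, g, h) convention used by torchvision.transforms.functional._get_perspective_coeffs: a point (x, y) is mapped to ((a*x + b*y + c) / (g*x + h*y + 1), (d*x + e*y + f) / (g*x + h*y + 1)). A small illustrative sketch, not part of the diff, of applying the first coefficient set to a point:

    import numpy as np

    # First coefficient list from the sample inputs above.
    a, b, c, d, e, f, g, h = [1.2405, 0.1772, -6.9113, 0.0463, 1.251, -5.235, 0.00013, 0.0018]

    x, y = 10.0, 20.0  # an arbitrary point
    denom = g * x + h * y + 1.0
    mapped = np.array([(a * x + b * y + c) / denom, (d * x + e * y + f) / denom])
    print(mapped)  # the point's image under the perspective transform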
@@ -993,7 +1026,7 @@ def test_correctness_vertical_flip_segmentation_mask_on_fixed_input(device):
     ],
 )
 def test_correctness_resized_crop_bounding_box(device, format, top, left, height, width, size):
-    def _compute_expected(bbox, top_, left_, height_, width_, size_):
+    def _compute_expected_bbox(bbox, top_, left_, height_, width_, size_):
         # bbox should be xyxy
         bbox[0] = (bbox[0] - left_) * size_[1] / width_
         bbox[1] = (bbox[1] - top_) * size_[0] / height_
@@ -1009,7 +1042,7 @@ def _compute_expected(bbox, top_, left_, height_, width_, size_):
     ]
     expected_bboxes = []
     for in_box in in_boxes:
-        expected_bboxes.append(_compute_expected(list(in_box), top, left, height, width, size))
+        expected_bboxes.append(_compute_expected_bbox(list(in_box), top, left, height, width, size))
     expected_bboxes = torch.tensor(expected_bboxes, device=device)
 
     in_boxes = features.BoundingBox(
@@ -1035,7 +1068,7 @@ def _compute_expected(bbox, top_, left_, height_, width_, size_):
     ],
 )
 def test_correctness_resized_crop_segmentation_mask(device, top, left, height, width, size):
-    def _compute_expected(mask, top_, left_, height_, width_, size_):
+    def _compute_expected_mask(mask, top_, left_, height_, width_, size_):
         output = mask.clone()
         output = output[:, top_ : top_ + height_, left_ : left_ + width_]
         output = torch.nn.functional.interpolate(output[None, :].float(), size=size_, mode="nearest")
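The renames in these hunks only disambiguate the two expected-value helpers; the semantics they encode is that resized_crop is a crop followed by a resize. An illustrative sketch, not part of the diff, using the stable torchvision API:

    import torch
    import torchvision.transforms.functional as TF

    img = torch.rand(3, 32, 32)
    top, left, height, width, size = 4, 6, 20, 16, [8, 8]

    combined = TF.resized_crop(img, top, left, height, width, size)
    two_step = TF.resize(TF.crop(img, top, left, height, width), size)
    print(torch.allclose(combined, two_step))  # True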
@@ -1046,7 +1079,7 @@ def _compute_expected(mask, top_, left_, height_, width_, size_):
     in_mask[0, 10:20, 10:20] = 1
     in_mask[0, 5:15, 12:23] = 2
 
-    expected_mask = _compute_expected(in_mask, top, left, height, width, size)
+    expected_mask = _compute_expected_mask(in_mask, top, left, height, width, size)
     output_mask = F.resized_crop_segmentation_mask(in_mask, top, left, height, width, size)
     torch.testing.assert_close(output_mask, expected_mask)
 
@@ -1095,6 +1128,161 @@ def parse_padding():
     torch.testing.assert_close(out_mask, expected_mask)
 
 
+@pytest.mark.parametrize("device", cpu_and_gpu())
+@pytest.mark.parametrize(
+    "startpoints, endpoints",
+    [
+        [[[0, 0], [33, 0], [33, 25], [0, 25]], [[3, 2], [32, 3], [30, 24], [2, 25]]],
+        [[[3, 2], [32, 3], [30, 24], [2, 25]], [[0, 0], [33, 0], [33, 25], [0, 25]]],
+        [[[3, 2], [32, 3], [30, 24], [2, 25]], [[5, 5], [30, 3], [33, 19], [4, 25]]],
+    ],
+)
+def test_correctness_perspective_bounding_box(device, startpoints, endpoints):
+    def _compute_expected_bbox(bbox, pcoeffs_):
+        m1 = np.array(
+            [
+                [pcoeffs_[0], pcoeffs_[1], pcoeffs_[2]],
+                [pcoeffs_[3], pcoeffs_[4], pcoeffs_[5]],
+            ]
+        )
+        m2 = np.array(
+            [
+                [pcoeffs_[6], pcoeffs_[7], 1.0],
+                [pcoeffs_[6], pcoeffs_[7], 1.0],
+            ]
+        )
+
+        bbox_xyxy = convert_bounding_box_format(
+            bbox, old_format=bbox.format, new_format=features.BoundingBoxFormat.XYXY
+        )
+        points = np.array(
+            [
+                [bbox_xyxy[0].item(), bbox_xyxy[1].item(), 1.0],
+                [bbox_xyxy[2].item(), bbox_xyxy[1].item(), 1.0],
+                [bbox_xyxy[0].item(), bbox_xyxy[3].item(), 1.0],
+                [bbox_xyxy[2].item(), bbox_xyxy[3].item(), 1.0],
+            ]
+        )
+        numer = np.matmul(points, m1.T)
+        denom = np.matmul(points, m2.T)
+        transformed_points = numer / denom
+        out_bbox = [
+            np.min(transformed_points[:, 0]),
+            np.min(transformed_points[:, 1]),
+            np.max(transformed_points[:, 0]),
+            np.max(transformed_points[:, 1]),
+        ]
+        out_bbox = features.BoundingBox(
+            out_bbox,
+            format=features.BoundingBoxFormat.XYXY,
+            image_size=bbox.image_size,
+            dtype=torch.float32,
+            device=bbox.device,
+        )
+        return convert_bounding_box_format(
+            out_bbox, old_format=features.BoundingBoxFormat.XYXY, new_format=bbox.format, copy=False
+        )
+
+    image_size = (32, 38)
+
+    pcoeffs = _get_perspective_coeffs(startpoints, endpoints)
+    inv_pcoeffs = _get_perspective_coeffs(endpoints, startpoints)
+
+    for bboxes in make_bounding_boxes(
+        image_sizes=[
+            image_size,
+        ],
+        extra_dims=((4,),),
+    ):
+        bboxes = bboxes.to(device)
+        bboxes_format = bboxes.format
+        bboxes_image_size = bboxes.image_size
+
+        output_bboxes = F.perspective_bounding_box(
+            bboxes,
+            bboxes_format,
+            perspective_coeffs=pcoeffs,
+        )
+
+        if bboxes.ndim < 2:
+            bboxes = [bboxes]
+
+        expected_bboxes = []
+        for bbox in bboxes:
+            bbox = features.BoundingBox(bbox, format=bboxes_format, image_size=bboxes_image_size)
+            expected_bboxes.append(_compute_expected_bbox(bbox, inv_pcoeffs))
+        if len(expected_bboxes) > 1:
+            expected_bboxes = torch.stack(expected_bboxes)
+        else:
+            expected_bboxes = expected_bboxes[0]
+        torch.testing.assert_close(output_bboxes, expected_bboxes, rtol=1e-5, atol=1e-5)
+
+
+@pytest.mark.parametrize("device", cpu_and_gpu())
+@pytest.mark.parametrize(
+    "startpoints, endpoints",
+    [
+        [[[0, 0], [33, 0], [33, 25], [0, 25]], [[3, 2], [32, 3], [30, 24], [2, 25]]],
+        [[[3, 2], [32, 3], [30, 24], [2, 25]], [[0, 0], [33, 0], [33, 25], [0, 25]]],
+        [[[3, 2], [32, 3], [30, 24], [2, 25]], [[5, 5], [30, 3], [33, 19], [4, 25]]],
+    ],
+)
+def test_correctness_perspective_segmentation_mask(device, startpoints, endpoints):
+    def _compute_expected_mask(mask, pcoeffs_):
+        assert mask.ndim == 3 and mask.shape[0] == 1
+        m1 = np.array(
+            [
+                [pcoeffs_[0], pcoeffs_[1], pcoeffs_[2]],
+                [pcoeffs_[3], pcoeffs_[4], pcoeffs_[5]],
+            ]
+        )
+        m2 = np.array(
+            [
+                [pcoeffs_[6], pcoeffs_[7], 1.0],
+                [pcoeffs_[6], pcoeffs_[7], 1.0],
+            ]
+        )
+
+        expected_mask = torch.zeros_like(mask.cpu())
+        for out_y in range(expected_mask.shape[1]):
+            for out_x in range(expected_mask.shape[2]):
+                output_pt = np.array([out_x + 0.5, out_y + 0.5, 1.0])
+
+                numer = np.matmul(output_pt, m1.T)
+                denom = np.matmul(output_pt, m2.T)
+                input_pt = np.floor(numer / denom).astype(np.int32)
+
+                in_x, in_y = input_pt[:2]
+                if 0 <= in_x < mask.shape[2] and 0 <= in_y < mask.shape[1]:
+                    expected_mask[0, out_y, out_x] = mask[0, in_y, in_x]
+        return expected_mask.to(mask.device)
+
+    pcoeffs = _get_perspective_coeffs(startpoints, endpoints)
+
+    for mask in make_segmentation_masks(extra_dims=((), (4,))):
+        mask = mask.to(device)
+
+        output_mask = F.perspective_segmentation_mask(
+            mask,
+            perspective_coeffs=pcoeffs,
+        )
+
+        if mask.ndim < 4:
+            masks = [mask]
+        else:
+            masks = [m for m in mask]
+
+        expected_masks = []
+        for mask in masks:
+            expected_mask = _compute_expected_mask(mask, pcoeffs)
+            expected_masks.append(expected_mask)
+        if len(expected_masks) > 1:
+            expected_masks = torch.stack(expected_masks)
+        else:
+            expected_masks = expected_masks[0]
+        torch.testing.assert_close(output_mask, expected_masks)
+
+
 @pytest.mark.parametrize("device", cpu_and_gpu())
 @pytest.mark.parametrize(
     "output_size",
@@ -1148,5 +1336,4 @@ def _compute_expected_bbox(bbox, output_size_):
             expected_bboxes = torch.stack(expected_bboxes)
         else:
             expected_bboxes = expected_bboxes[0]
-        expected_bboxes = expected_bboxes.to(device=device)
         torch.testing.assert_close(output_boxes, expected_bboxes)

test/test_transforms.py

Lines changed: 2 additions & 1 deletion
@@ -8,6 +8,7 @@
 import pytest
 import torch
 import torchvision.transforms as transforms
+import torchvision.transforms._pil_constants as _pil_constants
 import torchvision.transforms.functional as F
 import torchvision.transforms.functional_tensor as F_t
 from PIL import Image
@@ -173,7 +174,7 @@ def test_accimage_pil_to_tensor(self):
     def test_accimage_resize(self):
         trans = transforms.Compose(
             [
-                transforms.Resize(256, interpolation=Image.LINEAR),
+                transforms.Resize(256, interpolation=_pil_constants.LINEAR),
                 transforms.PILToTensor(),
                 transforms.ConvertImageDtype(dtype=torch.float),
             ]

test/test_transforms_tensor.py

Lines changed: 4 additions & 4 deletions
@@ -4,6 +4,7 @@
 import numpy as np
 import pytest
 import torch
+import torchvision.transforms._pil_constants as _pil_constants
 from common_utils import (
     get_tmp_dir,
     int_dtypes,
@@ -15,7 +16,6 @@
     cpu_and_gpu,
     assert_equal,
 )
-from PIL import Image
 from torchvision import transforms as T
 from torchvision.transforms import InterpolationMode
 from torchvision.transforms import functional as F
@@ -771,13 +771,13 @@ def shear(pil_img, level, mode, resample):
             matrix = (1, level, 0, 0, 1, 0)
         elif mode == "Y":
            matrix = (1, 0, 0, level, 1, 0)
-        return pil_img.transform((image_size, image_size), Image.AFFINE, matrix, resample=resample)
+        return pil_img.transform((image_size, image_size), _pil_constants.AFFINE, matrix, resample=resample)
 
     t_img, pil_img = _create_data(image_size, image_size)
 
     resample_pil = {
-        F.InterpolationMode.NEAREST: Image.NEAREST,
-        F.InterpolationMode.BILINEAR: Image.BILINEAR,
+        F.InterpolationMode.NEAREST: _pil_constants.NEAREST,
+        F.InterpolationMode.BILINEAR: _pil_constants.BILINEAR,
     }[interpolation]
 
     level = 0.3
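For readers unfamiliar with PIL's affine convention used in the shear helper above: the 6-tuple (a, b, c, d, e, f) passed to Image.transform is the inverse mapping, i.e. each output pixel (x, y) is sampled from input position (a*x + b*y + c, d*x + e*y + f), so (1, level, 0, 0, 1, 0) is a shear along the X axis by level. A minimal illustrative snippet, not part of the diff:

    from PIL import Image

    level = 0.3
    # Image.Transform.AFFINE on Pillow >= 9.1, Image.AFFINE on older releases.
    AFFINE = getattr(Image, "Transform", Image).AFFINE
    img = Image.new("L", (32, 32))
    sheared = img.transform((32, 32), AFFINE, (1, level, 0, 0, 1, 0))
    print(sheared.size)  # (32, 32)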
