diff --git a/test/prototype_common_utils.py b/test/prototype_common_utils.py index 1d5766b1fcf..220a793ac9d 100644 --- a/test/prototype_common_utils.py +++ b/test/prototype_common_utils.py @@ -184,13 +184,18 @@ def load(self, device="cpu"): return args, kwargs -DEFAULT_SQUARE_IMAGE_SIZE = 15 -DEFAULT_LANDSCAPE_IMAGE_SIZE = (7, 33) -DEFAULT_PORTRAIT_IMAGE_SIZE = (31, 9) -DEFAULT_IMAGE_SIZES = (DEFAULT_LANDSCAPE_IMAGE_SIZE, DEFAULT_PORTRAIT_IMAGE_SIZE, DEFAULT_SQUARE_IMAGE_SIZE, "random") +DEFAULT_SQUARE_SPATIAL_SIZE = 15 +DEFAULT_LANDSCAPE_SPATIAL_SIZE = (7, 33) +DEFAULT_PORTRAIT_SPATIAL_SIZE = (31, 9) +DEFAULT_SPATIAL_SIZES = ( + DEFAULT_LANDSCAPE_SPATIAL_SIZE, + DEFAULT_PORTRAIT_SPATIAL_SIZE, + DEFAULT_SQUARE_SPATIAL_SIZE, + "random", +) -def _parse_image_size(size, *, name="size"): +def _parse_spatial_size(size, *, name="size"): if size == "random": return tuple(torch.randint(15, 33, (2,)).tolist()) elif isinstance(size, int) and size > 0: @@ -246,11 +251,11 @@ def load(self, device): @dataclasses.dataclass class ImageLoader(TensorLoader): color_space: features.ColorSpace - image_size: Tuple[int, int] = dataclasses.field(init=False) + spatial_size: Tuple[int, int] = dataclasses.field(init=False) num_channels: int = dataclasses.field(init=False) def __post_init__(self): - self.image_size = self.shape[-2:] + self.spatial_size = self.shape[-2:] self.num_channels = self.shape[-3] @@ -277,7 +282,7 @@ def make_image_loader( dtype=torch.float32, constant_alpha=True, ): - size = _parse_image_size(size) + size = _parse_spatial_size(size) num_channels = get_num_channels(color_space) def fn(shape, dtype, device): @@ -295,7 +300,7 @@ def fn(shape, dtype, device): def make_image_loaders( *, - sizes=DEFAULT_IMAGE_SIZES, + sizes=DEFAULT_SPATIAL_SIZES, color_spaces=( features.ColorSpace.GRAY, features.ColorSpace.GRAY_ALPHA, @@ -316,7 +321,7 @@ def make_image_loaders( @dataclasses.dataclass class BoundingBoxLoader(TensorLoader): format: features.BoundingBoxFormat - image_size: Tuple[int, int] + spatial_size: Tuple[int, int] def randint_with_tensor_bounds(arg1, arg2=None, **kwargs): @@ -331,7 +336,7 @@ def randint_with_tensor_bounds(arg1, arg2=None, **kwargs): ).reshape(low.shape) -def make_bounding_box_loader(*, extra_dims=(), format, image_size="random", dtype=torch.float32): +def make_bounding_box_loader(*, extra_dims=(), format, spatial_size="random", dtype=torch.float32): if isinstance(format, str): format = features.BoundingBoxFormat[format] if format not in { @@ -341,7 +346,7 @@ def make_bounding_box_loader(*, extra_dims=(), format, image_size="random", dtyp }: raise pytest.UsageError(f"Can't make bounding box in format {format}") - image_size = _parse_image_size(image_size, name="image_size") + spatial_size = _parse_spatial_size(spatial_size, name="spatial_size") def fn(shape, dtype, device): *extra_dims, num_coordinates = shape @@ -350,10 +355,10 @@ def fn(shape, dtype, device): if any(dim == 0 for dim in extra_dims): return features.BoundingBox( - torch.empty(*extra_dims, 4, dtype=dtype, device=device), format=format, image_size=image_size + torch.empty(*extra_dims, 4, dtype=dtype, device=device), format=format, spatial_size=spatial_size ) - height, width = image_size + height, width = spatial_size if format == features.BoundingBoxFormat.XYXY: x1 = torch.randint(0, width // 2, extra_dims) @@ -375,10 +380,10 @@ def fn(shape, dtype, device): parts = (cx, cy, w, h) return features.BoundingBox( - torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, image_size=image_size + 
torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, spatial_size=spatial_size ) - return BoundingBoxLoader(fn, shape=(*extra_dims, 4), dtype=dtype, format=format, image_size=image_size) + return BoundingBoxLoader(fn, shape=(*extra_dims, 4), dtype=dtype, format=format, spatial_size=spatial_size) make_bounding_box = from_loader(make_bounding_box_loader) @@ -388,11 +393,11 @@ def make_bounding_box_loaders( *, extra_dims=DEFAULT_EXTRA_DIMS, formats=tuple(features.BoundingBoxFormat), - image_size="random", + spatial_size="random", dtypes=(torch.float32, torch.int64), ): for params in combinations_grid(extra_dims=extra_dims, format=formats, dtype=dtypes): - yield make_bounding_box_loader(**params, image_size=image_size) + yield make_bounding_box_loader(**params, spatial_size=spatial_size) make_bounding_boxes = from_loaders(make_bounding_box_loaders) @@ -475,7 +480,7 @@ class MaskLoader(TensorLoader): def make_detection_mask_loader(size="random", *, num_objects="random", extra_dims=(), dtype=torch.uint8): # This produces "detection" masks, i.e. `(*, N, H, W)`, where `N` denotes the number of objects - size = _parse_image_size(size) + size = _parse_spatial_size(size) num_objects = int(torch.randint(1, 11, ())) if num_objects == "random" else num_objects def fn(shape, dtype, device): @@ -489,7 +494,7 @@ def fn(shape, dtype, device): def make_detection_mask_loaders( - sizes=DEFAULT_IMAGE_SIZES, + sizes=DEFAULT_SPATIAL_SIZES, num_objects=(1, 0, "random"), extra_dims=DEFAULT_EXTRA_DIMS, dtypes=(torch.uint8,), @@ -503,7 +508,7 @@ def make_detection_mask_loaders( def make_segmentation_mask_loader(size="random", *, num_categories="random", extra_dims=(), dtype=torch.uint8): # This produces "segmentation" masks, i.e. `(*, H, W)`, where the category is encoded in the values - size = _parse_image_size(size) + size = _parse_spatial_size(size) num_categories = int(torch.randint(1, 11, ())) if num_categories == "random" else num_categories def fn(shape, dtype, device): @@ -518,7 +523,7 @@ def fn(shape, dtype, device): def make_segmentation_mask_loaders( *, - sizes=DEFAULT_IMAGE_SIZES, + sizes=DEFAULT_SPATIAL_SIZES, num_categories=(1, 2, "random"), extra_dims=DEFAULT_EXTRA_DIMS, dtypes=(torch.uint8,), @@ -532,7 +537,7 @@ def make_segmentation_mask_loaders( def make_mask_loaders( *, - sizes=DEFAULT_IMAGE_SIZES, + sizes=DEFAULT_SPATIAL_SIZES, num_objects=(1, 0, "random"), num_categories=(1, 2, "random"), extra_dims=DEFAULT_EXTRA_DIMS, @@ -559,7 +564,7 @@ def make_video_loader( extra_dims=(), dtype=torch.uint8, ): - size = _parse_image_size(size) + size = _parse_spatial_size(size) num_frames = int(torch.randint(1, 5, ())) if num_frames == "random" else num_frames def fn(shape, dtype, device): @@ -576,7 +581,7 @@ def fn(shape, dtype, device): def make_video_loaders( *, - sizes=DEFAULT_IMAGE_SIZES, + sizes=DEFAULT_SPATIAL_SIZES, color_spaces=( features.ColorSpace.GRAY, features.ColorSpace.RGB, diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index c8cca77e0db..239425d177d 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -145,7 +145,7 @@ def sample_inputs_horizontal_flip_bounding_box(): formats=[features.BoundingBoxFormat.XYXY], dtypes=[torch.float32] ): yield ArgsKwargs( - bounding_box_loader, format=bounding_box_loader.format, image_size=bounding_box_loader.image_size + bounding_box_loader, format=bounding_box_loader.format, spatial_size=bounding_box_loader.spatial_size ) @@ -185,9 +185,9 
@@ def sample_inputs_horizontal_flip_video(): ) -def _get_resize_sizes(image_size): - height, width = image_size - length = max(image_size) +def _get_resize_sizes(spatial_size): + height, width = spatial_size + length = max(spatial_size) yield length yield [length] yield (length,) @@ -201,7 +201,7 @@ def sample_inputs_resize_image_tensor(): for image_loader in make_image_loaders( sizes=["random"], color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32] ): - for size in _get_resize_sizes(image_loader.image_size): + for size in _get_resize_sizes(image_loader.spatial_size): yield ArgsKwargs(image_loader, size=size) for image_loader, interpolation in itertools.product( @@ -212,7 +212,7 @@ def sample_inputs_resize_image_tensor(): F.InterpolationMode.BICUBIC, ], ): - yield ArgsKwargs(image_loader, size=[min(image_loader.image_size) + 1], interpolation=interpolation) + yield ArgsKwargs(image_loader, size=[min(image_loader.spatial_size) + 1], interpolation=interpolation) yield ArgsKwargs(make_image_loader(size=(11, 17)), size=20, max_size=25) @@ -236,7 +236,7 @@ def reference_inputs_resize_image_tensor(): F.InterpolationMode.BICUBIC, ], ): - for size in _get_resize_sizes(image_loader.image_size): + for size in _get_resize_sizes(image_loader.spatial_size): yield ArgsKwargs( image_loader, size=size, @@ -251,8 +251,8 @@ def reference_inputs_resize_image_tensor(): def sample_inputs_resize_bounding_box(): for bounding_box_loader in make_bounding_box_loaders(): - for size in _get_resize_sizes(bounding_box_loader.image_size): - yield ArgsKwargs(bounding_box_loader, size=size, image_size=bounding_box_loader.image_size) + for size in _get_resize_sizes(bounding_box_loader.spatial_size): + yield ArgsKwargs(bounding_box_loader, size=size, spatial_size=bounding_box_loader.spatial_size) def sample_inputs_resize_mask(): @@ -394,7 +394,7 @@ def sample_inputs_affine_bounding_box(): yield ArgsKwargs( bounding_box_loader, format=bounding_box_loader.format, - image_size=bounding_box_loader.image_size, + spatial_size=bounding_box_loader.spatial_size, **affine_params, ) @@ -422,9 +422,9 @@ def _compute_affine_matrix(angle, translate, scale, shear, center): return true_matrix -def reference_affine_bounding_box(bounding_box, *, format, image_size, angle, translate, scale, shear, center=None): +def reference_affine_bounding_box(bounding_box, *, format, spatial_size, angle, translate, scale, shear, center=None): if center is None: - center = [s * 0.5 for s in image_size[::-1]] + center = [s * 0.5 for s in spatial_size[::-1]] def transform(bbox): affine_matrix = _compute_affine_matrix(angle, translate, scale, shear, center) @@ -473,7 +473,7 @@ def reference_inputs_affine_bounding_box(): yield ArgsKwargs( bounding_box_loader, format=bounding_box_loader.format, - image_size=bounding_box_loader.image_size, + spatial_size=bounding_box_loader.spatial_size, **affine_kwargs, ) @@ -650,7 +650,7 @@ def sample_inputs_vertical_flip_bounding_box(): formats=[features.BoundingBoxFormat.XYXY], dtypes=[torch.float32] ): yield ArgsKwargs( - bounding_box_loader, format=bounding_box_loader.format, image_size=bounding_box_loader.image_size + bounding_box_loader, format=bounding_box_loader.format, spatial_size=bounding_box_loader.spatial_size ) @@ -729,7 +729,7 @@ def sample_inputs_rotate_bounding_box(): yield ArgsKwargs( bounding_box_loader, format=bounding_box_loader.format, - image_size=bounding_box_loader.image_size, + spatial_size=bounding_box_loader.spatial_size, angle=_ROTATE_ANGLES[0], ) @@ -1001,7 +1001,7 @@ def 
sample_inputs_pad_bounding_box(): yield ArgsKwargs( bounding_box_loader, format=bounding_box_loader.format, - image_size=bounding_box_loader.image_size, + spatial_size=bounding_box_loader.spatial_size, padding=padding, padding_mode="constant", ) @@ -1131,13 +1131,13 @@ def sample_inputs_perspective_video(): ) -def _get_elastic_displacement(image_size): - return torch.rand(1, *image_size, 2) +def _get_elastic_displacement(spatial_size): + return torch.rand(1, *spatial_size, 2) def sample_inputs_elastic_image_tensor(): for image_loader in make_image_loaders(sizes=["random"]): - displacement = _get_elastic_displacement(image_loader.image_size) + displacement = _get_elastic_displacement(image_loader.spatial_size) for fill in [None, 128.0, 128, [12.0], [12.0 + c for c in range(image_loader.num_channels)]]: yield ArgsKwargs(image_loader, displacement=displacement, fill=fill) @@ -1151,14 +1151,14 @@ def reference_inputs_elastic_image_tensor(): F.InterpolationMode.BICUBIC, ], ): - displacement = _get_elastic_displacement(image_loader.image_size) + displacement = _get_elastic_displacement(image_loader.spatial_size) for fill in [None, 128.0, 128, [12.0], [12.0 + c for c in range(image_loader.num_channels)]]: yield ArgsKwargs(image_loader, interpolation=interpolation, displacement=displacement, fill=fill) def sample_inputs_elastic_bounding_box(): for bounding_box_loader in make_bounding_box_loaders(): - displacement = _get_elastic_displacement(bounding_box_loader.image_size) + displacement = _get_elastic_displacement(bounding_box_loader.spatial_size) yield ArgsKwargs( bounding_box_loader, format=bounding_box_loader.format, @@ -1212,7 +1212,7 @@ def sample_inputs_elastic_video(): ) -_CENTER_CROP_IMAGE_SIZES = [(16, 16), (7, 33), (31, 9)] +_CENTER_CROP_SPATIAL_SIZES = [(16, 16), (7, 33), (31, 9)] _CENTER_CROP_OUTPUT_SIZES = [[4, 3], [42, 70], [4], 3, (5, 2), (6,)] @@ -1231,7 +1231,7 @@ def sample_inputs_center_crop_image_tensor(): def reference_inputs_center_crop_image_tensor(): for image_loader, output_size in itertools.product( - make_image_loaders(sizes=_CENTER_CROP_IMAGE_SIZES, extra_dims=[()]), _CENTER_CROP_OUTPUT_SIZES + make_image_loaders(sizes=_CENTER_CROP_SPATIAL_SIZES, extra_dims=[()]), _CENTER_CROP_OUTPUT_SIZES ): yield ArgsKwargs(image_loader, output_size=output_size) @@ -1241,7 +1241,7 @@ def sample_inputs_center_crop_bounding_box(): yield ArgsKwargs( bounding_box_loader, format=bounding_box_loader.format, - image_size=bounding_box_loader.image_size, + spatial_size=bounding_box_loader.spatial_size, output_size=output_size, ) @@ -1254,7 +1254,7 @@ def sample_inputs_center_crop_mask(): def reference_inputs_center_crop_mask(): for mask_loader, output_size in itertools.product( - make_mask_loaders(sizes=_CENTER_CROP_IMAGE_SIZES, extra_dims=[()], num_objects=[1]), _CENTER_CROP_OUTPUT_SIZES + make_mask_loaders(sizes=_CENTER_CROP_SPATIAL_SIZES, extra_dims=[()], num_objects=[1]), _CENTER_CROP_OUTPUT_SIZES ): yield ArgsKwargs(mask_loader, output_size=output_size) @@ -1820,7 +1820,7 @@ def sample_inputs_adjust_saturation_video(): def sample_inputs_clamp_bounding_box(): for bounding_box_loader in make_bounding_box_loaders(): yield ArgsKwargs( - bounding_box_loader, format=bounding_box_loader.format, image_size=bounding_box_loader.image_size + bounding_box_loader, format=bounding_box_loader.format, spatial_size=bounding_box_loader.spatial_size ) @@ -1834,7 +1834,7 @@ def sample_inputs_clamp_bounding_box(): _FIVE_TEN_CROP_SIZES = [7, (6,), [5], (6, 5), [7, 6]] -def _get_five_ten_crop_image_size(size): 
+def _get_five_ten_crop_spatial_size(size): if isinstance(size, int): crop_height = crop_width = size elif len(size) == 1: @@ -1847,28 +1847,32 @@ def _get_five_ten_crop_image_size(size): def sample_inputs_five_crop_image_tensor(): for size in _FIVE_TEN_CROP_SIZES: for image_loader in make_image_loaders( - sizes=[_get_five_ten_crop_image_size(size)], color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32] + sizes=[_get_five_ten_crop_spatial_size(size)], + color_spaces=[features.ColorSpace.RGB], + dtypes=[torch.float32], ): yield ArgsKwargs(image_loader, size=size) def reference_inputs_five_crop_image_tensor(): for size in _FIVE_TEN_CROP_SIZES: - for image_loader in make_image_loaders(sizes=[_get_five_ten_crop_image_size(size)], extra_dims=[()]): + for image_loader in make_image_loaders(sizes=[_get_five_ten_crop_spatial_size(size)], extra_dims=[()]): yield ArgsKwargs(image_loader, size=size) def sample_inputs_ten_crop_image_tensor(): for size, vertical_flip in itertools.product(_FIVE_TEN_CROP_SIZES, [False, True]): for image_loader in make_image_loaders( - sizes=[_get_five_ten_crop_image_size(size)], color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32] + sizes=[_get_five_ten_crop_spatial_size(size)], + color_spaces=[features.ColorSpace.RGB], + dtypes=[torch.float32], ): yield ArgsKwargs(image_loader, size=size, vertical_flip=vertical_flip) def reference_inputs_ten_crop_image_tensor(): for size, vertical_flip in itertools.product(_FIVE_TEN_CROP_SIZES, [False, True]): - for image_loader in make_image_loaders(sizes=[_get_five_ten_crop_image_size(size)], extra_dims=[()]): + for image_loader in make_image_loaders(sizes=[_get_five_ten_crop_spatial_size(size)], extra_dims=[()]): yield ArgsKwargs(image_loader, size=size, vertical_flip=vertical_flip) diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index d7a41e7c12c..2c095fa6e81 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -298,7 +298,7 @@ def test_features_mask(self, p): assert_equal(features.Mask(expected), actual) def test_features_bounding_box(self, p): - input = features.BoundingBox([0, 0, 5, 5], format=features.BoundingBoxFormat.XYXY, image_size=(10, 10)) + input = features.BoundingBox([0, 0, 5, 5], format=features.BoundingBoxFormat.XYXY, spatial_size=(10, 10)) transform = transforms.RandomHorizontalFlip(p=p) actual = transform(input) @@ -307,7 +307,7 @@ def test_features_bounding_box(self, p): expected = features.BoundingBox.wrap_like(input, expected_image_tensor) assert_equal(expected, actual) assert actual.format == expected.format - assert actual.image_size == expected.image_size + assert actual.spatial_size == expected.spatial_size @pytest.mark.parametrize("p", [0.0, 1.0]) @@ -351,7 +351,7 @@ def test_features_mask(self, p): assert_equal(features.Mask(expected), actual) def test_features_bounding_box(self, p): - input = features.BoundingBox([0, 0, 5, 5], format=features.BoundingBoxFormat.XYXY, image_size=(10, 10)) + input = features.BoundingBox([0, 0, 5, 5], format=features.BoundingBoxFormat.XYXY, spatial_size=(10, 10)) transform = transforms.RandomVerticalFlip(p=p) actual = transform(input) @@ -360,7 +360,7 @@ def test_features_bounding_box(self, p): expected = features.BoundingBox.wrap_like(input, expected_image_tensor) assert_equal(expected, actual) assert actual.format == expected.format - assert actual.image_size == expected.image_size + assert actual.spatial_size == expected.spatial_size class TestPad: @@ -435,7 +435,7 @@ def 
test__get_params(self, fill, side_range, mocker): transform = transforms.RandomZoomOut(fill=fill, side_range=side_range) image = mocker.MagicMock(spec=features.Image) - h, w = image.image_size = (24, 32) + h, w = image.spatial_size = (24, 32) params = transform._get_params(image) @@ -450,7 +450,7 @@ def test__get_params(self, fill, side_range, mocker): def test__transform(self, fill, side_range, mocker): inpt = mocker.MagicMock(spec=features.Image) inpt.num_channels = 3 - inpt.image_size = (24, 32) + inpt.spatial_size = (24, 32) transform = transforms.RandomZoomOut(fill=fill, side_range=side_range, p=1) @@ -559,17 +559,17 @@ def test__transform(self, degrees, expand, fill, center, mocker): @pytest.mark.parametrize("angle", [34, -87]) @pytest.mark.parametrize("expand", [False, True]) - def test_boundingbox_image_size(self, angle, expand): + def test_boundingbox_spatial_size(self, angle, expand): # Specific test for BoundingBox.rotate bbox = features.BoundingBox( - torch.tensor([1, 2, 3, 4]), format=features.BoundingBoxFormat.XYXY, image_size=(32, 32) + torch.tensor([1, 2, 3, 4]), format=features.BoundingBoxFormat.XYXY, spatial_size=(32, 32) ) img = features.Image(torch.rand(1, 3, 32, 32)) out_img = img.rotate(angle, expand=expand) out_bbox = bbox.rotate(angle, expand=expand) - assert out_img.image_size == out_bbox.image_size + assert out_img.spatial_size == out_bbox.spatial_size class TestRandomAffine: @@ -619,8 +619,8 @@ def test_assertions(self): def test__get_params(self, degrees, translate, scale, shear, mocker): image = mocker.MagicMock(spec=features.Image) image.num_channels = 3 - image.image_size = (24, 32) - h, w = image.image_size + image.spatial_size = (24, 32) + h, w = image.spatial_size transform = transforms.RandomAffine(degrees, translate=translate, scale=scale, shear=shear) params = transform._get_params(image) @@ -682,7 +682,7 @@ def test__transform(self, degrees, translate, scale, shear, fill, center, mocker fn = mocker.patch("torchvision.prototype.transforms.functional.affine") inpt = mocker.MagicMock(spec=features.Image) inpt.num_channels = 3 - inpt.image_size = (24, 32) + inpt.spatial_size = (24, 32) # vfdev-5, Feature Request: let's store params as Transform attribute # This could be also helpful for users @@ -718,8 +718,8 @@ def test_assertions(self): def test__get_params(self, padding, pad_if_needed, size, mocker): image = mocker.MagicMock(spec=features.Image) image.num_channels = 3 - image.image_size = (24, 32) - h, w = image.image_size + image.spatial_size = (24, 32) + h, w = image.spatial_size transform = transforms.RandomCrop(size, padding=padding, pad_if_needed=pad_if_needed) params = transform._get_params(image) @@ -771,19 +771,19 @@ def test__transform(self, padding, pad_if_needed, fill, padding_mode, mocker): inpt = mocker.MagicMock(spec=features.Image) inpt.num_channels = 3 - inpt.image_size = (32, 32) + inpt.spatial_size = (32, 32) expected = mocker.MagicMock(spec=features.Image) expected.num_channels = 3 if isinstance(padding, int): - expected.image_size = (inpt.image_size[0] + padding, inpt.image_size[1] + padding) + expected.spatial_size = (inpt.spatial_size[0] + padding, inpt.spatial_size[1] + padding) elif isinstance(padding, list): - expected.image_size = ( - inpt.image_size[0] + sum(padding[0::2]), - inpt.image_size[1] + sum(padding[1::2]), + expected.spatial_size = ( + inpt.spatial_size[0] + sum(padding[0::2]), + inpt.spatial_size[1] + sum(padding[1::2]), ) else: - expected.image_size = inpt.image_size + expected.spatial_size = inpt.spatial_size _ = 
mocker.patch("torchvision.prototype.transforms.functional.pad", return_value=expected) fn_crop = mocker.patch("torchvision.prototype.transforms.functional.crop") @@ -859,7 +859,7 @@ def test__transform(self, kernel_size, sigma, mocker): fn = mocker.patch("torchvision.prototype.transforms.functional.gaussian_blur") inpt = mocker.MagicMock(spec=features.Image) inpt.num_channels = 3 - inpt.image_size = (24, 32) + inpt.spatial_size = (24, 32) # vfdev-5, Feature Request: let's store params as Transform attribute # This could be also helpful for users @@ -910,11 +910,11 @@ def test__get_params(self, mocker): transform = transforms.RandomPerspective(dscale) image = mocker.MagicMock(spec=features.Image) image.num_channels = 3 - image.image_size = (24, 32) + image.spatial_size = (24, 32) params = transform._get_params(image) - h, w = image.image_size + h, w = image.spatial_size assert "perspective_coeffs" in params assert len(params["perspective_coeffs"]) == 8 @@ -927,7 +927,7 @@ def test__transform(self, distortion_scale, mocker): fn = mocker.patch("torchvision.prototype.transforms.functional.perspective") inpt = mocker.MagicMock(spec=features.Image) inpt.num_channels = 3 - inpt.image_size = (24, 32) + inpt.spatial_size = (24, 32) # vfdev-5, Feature Request: let's store params as Transform attribute # This could be also helpful for users # Otherwise, we can mock transform._get_params @@ -971,11 +971,11 @@ def test__get_params(self, mocker): transform = transforms.ElasticTransform(alpha, sigma) image = mocker.MagicMock(spec=features.Image) image.num_channels = 3 - image.image_size = (24, 32) + image.spatial_size = (24, 32) params = transform._get_params(image) - h, w = image.image_size + h, w = image.spatial_size displacement = params["displacement"] assert displacement.shape == (1, h, w, 2) assert (-alpha / w <= displacement[0, ..., 0]).all() and (displacement[0, ..., 0] <= alpha / w).all() @@ -1001,7 +1001,7 @@ def test__transform(self, alpha, sigma, mocker): fn = mocker.patch("torchvision.prototype.transforms.functional.elastic") inpt = mocker.MagicMock(spec=features.Image) inpt.num_channels = 3 - inpt.image_size = (24, 32) + inpt.spatial_size = (24, 32) # Let's mock transform._get_params to control the output: transform._get_params = mocker.MagicMock() @@ -1030,7 +1030,7 @@ def test_assertions(self, mocker): image = mocker.MagicMock(spec=features.Image) image.num_channels = 3 - image.image_size = (24, 32) + image.spatial_size = (24, 32) transform = transforms.RandomErasing(value=[1, 2, 3, 4]) @@ -1041,7 +1041,7 @@ def test_assertions(self, mocker): def test__get_params(self, value, mocker): image = mocker.MagicMock(spec=features.Image) image.num_channels = 3 - image.image_size = (24, 32) + image.spatial_size = (24, 32) transform = transforms.RandomErasing(value=value) params = transform._get_params(image) @@ -1057,8 +1057,8 @@ def test__get_params(self, value, mocker): elif isinstance(value, (list, tuple)): assert v.shape == (image.num_channels, 1, 1) - assert 0 <= i <= image.image_size[0] - h - assert 0 <= j <= image.image_size[1] - w + assert 0 <= i <= image.spatial_size[0] - h + assert 0 <= j <= image.spatial_size[1] - w @pytest.mark.parametrize("p", [0, 1]) def test__transform(self, mocker, p): @@ -1222,11 +1222,11 @@ class TestRandomIoUCrop: def test__get_params(self, device, options, mocker): image = mocker.MagicMock(spec=features.Image) image.num_channels = 3 - image.image_size = (24, 32) + image.spatial_size = (24, 32) bboxes = features.BoundingBox( torch.tensor([[1, 1, 10, 10], [20, 
20, 23, 23], [1, 20, 10, 23], [20, 1, 23, 10]]), format="XYXY", - image_size=image.image_size, + spatial_size=image.spatial_size, device=device, ) sample = [image, bboxes] @@ -1245,8 +1245,8 @@ def test__get_params(self, device, options, mocker): assert len(params["is_within_crop_area"]) > 0 assert params["is_within_crop_area"].dtype == torch.bool - orig_h = image.image_size[0] - orig_w = image.image_size[1] + orig_h = image.spatial_size[0] + orig_w = image.spatial_size[1] assert int(transform.min_scale * orig_h) <= params["height"] <= int(transform.max_scale * orig_h) assert int(transform.min_scale * orig_w) <= params["width"] <= int(transform.max_scale * orig_w) @@ -1261,7 +1261,7 @@ def test__get_params(self, device, options, mocker): def test__transform_empty_params(self, mocker): transform = transforms.RandomIoUCrop(sampler_options=[2.0]) image = features.Image(torch.rand(1, 3, 4, 4)) - bboxes = features.BoundingBox(torch.tensor([[1, 1, 2, 2]]), format="XYXY", image_size=(4, 4)) + bboxes = features.BoundingBox(torch.tensor([[1, 1, 2, 2]]), format="XYXY", spatial_size=(4, 4)) label = features.Label(torch.tensor([1])) sample = [image, bboxes, label] # Let's mock transform._get_params to control the output: @@ -1281,7 +1281,7 @@ def test__transform(self, mocker): transform = transforms.RandomIoUCrop() image = features.Image(torch.rand(3, 32, 24)) - bboxes = make_bounding_box(format="XYXY", image_size=(32, 24), extra_dims=(6,)) + bboxes = make_bounding_box(format="XYXY", spatial_size=(32, 24), extra_dims=(6,)) label = features.Label(torch.randint(0, 10, size=(6,))) ohe_label = features.OneHotLabel(torch.zeros(6, 10).scatter_(1, label.unsqueeze(1), 1)) masks = make_detection_mask((32, 24), num_objects=6) @@ -1329,12 +1329,12 @@ def test__transform(self, mocker): class TestScaleJitter: def test__get_params(self, mocker): - image_size = (24, 32) + spatial_size = (24, 32) target_size = (16, 12) scale_range = (0.5, 1.5) transform = transforms.ScaleJitter(target_size=target_size, scale_range=scale_range) - sample = mocker.MagicMock(spec=features.Image, num_channels=3, image_size=image_size) + sample = mocker.MagicMock(spec=features.Image, num_channels=3, spatial_size=spatial_size) n_samples = 5 for _ in range(n_samples): @@ -1347,11 +1347,11 @@ def test__get_params(self, mocker): assert isinstance(size, tuple) and len(size) == 2 height, width = size - r_min = min(target_size[1] / image_size[0], target_size[0] / image_size[1]) * scale_range[0] - r_max = min(target_size[1] / image_size[0], target_size[0] / image_size[1]) * scale_range[1] + r_min = min(target_size[1] / spatial_size[0], target_size[0] / spatial_size[1]) * scale_range[0] + r_max = min(target_size[1] / spatial_size[0], target_size[0] / spatial_size[1]) * scale_range[1] - assert int(image_size[0] * r_min) <= height <= int(image_size[0] * r_max) - assert int(image_size[1] * r_min) <= width <= int(image_size[1] * r_max) + assert int(spatial_size[0] * r_min) <= height <= int(spatial_size[0] * r_max) + assert int(spatial_size[1] * r_min) <= width <= int(spatial_size[1] * r_max) def test__transform(self, mocker): interpolation_sentinel = mocker.MagicMock() @@ -1379,13 +1379,13 @@ def test__transform(self, mocker): class TestRandomShortestSize: def test__get_params(self, mocker): - image_size = (3, 10) + spatial_size = (3, 10) min_size = [5, 9] max_size = 20 transform = transforms.RandomShortestSize(min_size=min_size, max_size=max_size) - sample = mocker.MagicMock(spec=features.Image, num_channels=3, image_size=image_size) + sample = 
mocker.MagicMock(spec=features.Image, num_channels=3, spatial_size=spatial_size) params = transform._get_params(sample) assert "size" in params @@ -1504,7 +1504,7 @@ def test__copy_paste(self, label_type): labels = torch.nn.functional.one_hot(labels, num_classes=5) target = { "boxes": features.BoundingBox( - torch.tensor([[2.0, 3.0, 8.0, 9.0], [20.0, 20.0, 30.0, 30.0]]), format="XYXY", image_size=(32, 32) + torch.tensor([[2.0, 3.0, 8.0, 9.0], [20.0, 20.0, 30.0, 30.0]]), format="XYXY", spatial_size=(32, 32) ), "masks": features.Mask(masks), "labels": label_type(labels), @@ -1519,7 +1519,7 @@ def test__copy_paste(self, label_type): paste_labels = torch.nn.functional.one_hot(paste_labels, num_classes=5) paste_target = { "boxes": features.BoundingBox( - torch.tensor([[12.0, 13.0, 19.0, 18.0], [1.0, 15.0, 8.0, 19.0]]), format="XYXY", image_size=(32, 32) + torch.tensor([[12.0, 13.0, 19.0, 18.0], [1.0, 15.0, 8.0, 19.0]]), format="XYXY", spatial_size=(32, 32) ), "masks": features.Mask(paste_masks), "labels": label_type(paste_labels), @@ -1550,14 +1550,14 @@ class TestFixedSizeCrop: def test__get_params(self, mocker): crop_size = (7, 7) batch_shape = (10,) - image_size = (11, 5) + spatial_size = (11, 5) transform = transforms.FixedSizeCrop(size=crop_size) sample = dict( - image=make_image(size=image_size, color_space=features.ColorSpace.RGB), + image=make_image(size=spatial_size, color_space=features.ColorSpace.RGB), bounding_boxes=make_bounding_box( - format=features.BoundingBoxFormat.XYXY, image_size=image_size, extra_dims=batch_shape + format=features.BoundingBoxFormat.XYXY, spatial_size=spatial_size, extra_dims=batch_shape ), ) params = transform._get_params(sample) @@ -1638,7 +1638,7 @@ def test__transform(self, mocker, needs): def test__transform_culling(self, mocker): batch_size = 10 - image_size = (10, 10) + spatial_size = (10, 10) is_valid = torch.randint(0, 2, (batch_size,), dtype=torch.bool) mocker.patch( @@ -1647,17 +1647,17 @@ def test__transform_culling(self, mocker): needs_crop=True, top=0, left=0, - height=image_size[0], - width=image_size[1], + height=spatial_size[0], + width=spatial_size[1], is_valid=is_valid, needs_pad=False, ), ) bounding_boxes = make_bounding_box( - format=features.BoundingBoxFormat.XYXY, image_size=image_size, extra_dims=(batch_size,) + format=features.BoundingBoxFormat.XYXY, spatial_size=spatial_size, extra_dims=(batch_size,) ) - masks = make_detection_mask(size=image_size, extra_dims=(batch_size,)) + masks = make_detection_mask(size=spatial_size, extra_dims=(batch_size,)) labels = make_label(extra_dims=(batch_size,)) transform = transforms.FixedSizeCrop((-1, -1)) @@ -1678,7 +1678,7 @@ def test__transform_culling(self, mocker): def test__transform_bounding_box_clamping(self, mocker): batch_size = 3 - image_size = (10, 10) + spatial_size = (10, 10) mocker.patch( "torchvision.prototype.transforms._geometry.FixedSizeCrop._get_params", @@ -1686,15 +1686,15 @@ def test__transform_bounding_box_clamping(self, mocker): needs_crop=True, top=0, left=0, - height=image_size[0], - width=image_size[1], + height=spatial_size[0], + width=spatial_size[1], is_valid=torch.full((batch_size,), fill_value=True), needs_pad=False, ), ) bounding_box = make_bounding_box( - format=features.BoundingBoxFormat.XYXY, image_size=image_size, extra_dims=(batch_size,) + format=features.BoundingBoxFormat.XYXY, spatial_size=spatial_size, extra_dims=(batch_size,) ) mock = mocker.patch("torchvision.prototype.transforms._geometry.F.clamp_bounding_box") diff --git 
a/test/test_prototype_transforms_consistency.py b/test/test_prototype_transforms_consistency.py index c8debe1e293..f335220fb29 100644 --- a/test/test_prototype_transforms_consistency.py +++ b/test/test_prototype_transforms_consistency.py @@ -24,7 +24,7 @@ from torchvision._utils import sequence_to_str from torchvision.prototype import features, transforms as prototype_transforms from torchvision.prototype.transforms import functional as F -from torchvision.prototype.transforms._utils import query_chw +from torchvision.prototype.transforms._utils import query_spatial_size from torchvision.prototype.transforms.functional import to_image_pil DEFAULT_MAKE_IMAGES_KWARGS = dict(color_spaces=[features.ColorSpace.RGB], extra_dims=[(4,)]) @@ -871,7 +871,7 @@ def make_datapoints(self, with_mask=True): pil_image = to_image_pil(make_image(size=size, color_space=features.ColorSpace.RGB)) target = { - "boxes": make_bounding_box(image_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_box(spatial_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: @@ -881,7 +881,7 @@ def make_datapoints(self, with_mask=True): tensor_image = torch.Tensor(make_image(size=size, color_space=features.ColorSpace.RGB)) target = { - "boxes": make_bounding_box(image_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_box(spatial_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: @@ -891,7 +891,7 @@ def make_datapoints(self, with_mask=True): feature_image = make_image(size=size, color_space=features.ColorSpace.RGB) target = { - "boxes": make_bounding_box(image_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_box(spatial_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: @@ -949,7 +949,7 @@ def __init__(self, size, fill=0): self.fill = prototype_transforms._geometry._setup_fill_arg(fill) def _get_params(self, sample): - _, height, width = query_chw(sample) + height, width = query_spatial_size(sample) padding = [0, 0, max(self.size - width, 0), max(self.size - height, 0)] needs_padding = any(padding) return dict(padding=padding, needs_padding=needs_padding) diff --git a/test/test_prototype_transforms_functional.py b/test/test_prototype_transforms_functional.py index 8329de69782..56c473a23e4 100644 --- a/test/test_prototype_transforms_functional.py +++ b/test/test_prototype_transforms_functional.py @@ -224,11 +224,14 @@ def test_scripted_smoke(self, info, args_kwargs, device): @pytest.mark.parametrize( "dispatcher", [ + F.clamp_bounding_box, F.convert_color_space, F.convert_image_dtype, F.get_dimensions, F.get_image_num_channels, F.get_image_size, + F.get_num_channels, + F.get_num_frames, F.get_spatial_size, F.rgb_to_grayscale, ], @@ -333,16 +336,16 @@ def _compute_affine_matrix(angle_, translate_, scale_, shear_, center_): @pytest.mark.parametrize("device", cpu_and_gpu()) def test_correctness_affine_bounding_box_on_fixed_input(device): # Check transformation against known expected output - image_size = (64, 64) + spatial_size = (64, 64) # xyxy format in_boxes = [ [20, 25, 35, 45], [50, 5, 70, 22], - [image_size[1] // 2 - 10, image_size[0] // 2 - 10, image_size[1] // 2 + 10, image_size[0] // 2 + 
10], + [spatial_size[1] // 2 - 10, spatial_size[0] // 2 - 10, spatial_size[1] // 2 + 10, spatial_size[0] // 2 + 10], [1, 1, 5, 5], ] in_boxes = features.BoundingBox( - in_boxes, format=features.BoundingBoxFormat.XYXY, image_size=image_size, dtype=torch.float64, device=device + in_boxes, format=features.BoundingBoxFormat.XYXY, spatial_size=spatial_size, dtype=torch.float64, device=device ) # Tested parameters angle = 63 @@ -355,9 +358,9 @@ def test_correctness_affine_bounding_box_on_fixed_input(device): # from albumentations.augmentations.geometric.functional import normalize_bbox, denormalize_bbox # expected_bboxes = [] # for in_box in in_boxes: - # n_in_box = normalize_bbox(in_box, *image_size) - # n_out_box = bbox_shift_scale_rotate(n_in_box, -angle, scale, dx, dy, *image_size) - # out_box = denormalize_bbox(n_out_box, *image_size) + # n_in_box = normalize_bbox(in_box, *spatial_size) + # n_out_box = bbox_shift_scale_rotate(n_in_box, -angle, scale, dx, dy, *spatial_size) + # out_box = denormalize_bbox(n_out_box, *spatial_size) # expected_bboxes.append(out_box) expected_bboxes = [ (24.522435977922218, 34.375689508290854, 46.443125279998114, 54.3516575015695), @@ -369,9 +372,9 @@ def test_correctness_affine_bounding_box_on_fixed_input(device): output_boxes = F.affine_bounding_box( in_boxes, in_boxes.format, - in_boxes.image_size, + in_boxes.spatial_size, angle, - (dx * image_size[1], dy * image_size[0]), + (dx * spatial_size[1], dy * spatial_size[0]), scale, shear=(0, 0), ) @@ -406,7 +409,7 @@ def _compute_expected_bbox(bbox, angle_, expand_, center_): affine_matrix = _compute_affine_matrix(angle_, [0.0, 0.0], 1.0, [0.0, 0.0], center_) affine_matrix = affine_matrix[:2, :] - height, width = bbox.image_size + height, width = bbox.spatial_size bbox_xyxy = convert_format_bounding_box( bbox, old_format=bbox.format, new_format=features.BoundingBoxFormat.XYXY ) @@ -444,7 +447,7 @@ def _compute_expected_bbox(bbox, angle_, expand_, center_): out_bbox = features.BoundingBox( out_bbox, format=features.BoundingBoxFormat.XYXY, - image_size=(height, width), + spatial_size=(height, width), dtype=bbox.dtype, device=bbox.device, ) @@ -455,16 +458,16 @@ def _compute_expected_bbox(bbox, angle_, expand_, center_): (height, width), ) - image_size = (32, 38) + spatial_size = (32, 38) - for bboxes in make_bounding_boxes(image_size=image_size, extra_dims=((4,),)): + for bboxes in make_bounding_boxes(spatial_size=spatial_size, extra_dims=((4,),)): bboxes_format = bboxes.format - bboxes_image_size = bboxes.image_size + bboxes_spatial_size = bboxes.spatial_size - output_bboxes, output_image_size = F.rotate_bounding_box( + output_bboxes, output_spatial_size = F.rotate_bounding_box( bboxes, bboxes_format, - image_size=bboxes_image_size, + spatial_size=bboxes_spatial_size, angle=angle, expand=expand, center=center, @@ -472,38 +475,38 @@ def _compute_expected_bbox(bbox, angle_, expand_, center_): center_ = center if center_ is None: - center_ = [s * 0.5 for s in bboxes_image_size[::-1]] + center_ = [s * 0.5 for s in bboxes_spatial_size[::-1]] if bboxes.ndim < 2: bboxes = [bboxes] expected_bboxes = [] for bbox in bboxes: - bbox = features.BoundingBox(bbox, format=bboxes_format, image_size=bboxes_image_size) - expected_bbox, expected_image_size = _compute_expected_bbox(bbox, -angle, expand, center_) + bbox = features.BoundingBox(bbox, format=bboxes_format, spatial_size=bboxes_spatial_size) + expected_bbox, expected_spatial_size = _compute_expected_bbox(bbox, -angle, expand, center_) expected_bboxes.append(expected_bbox) if 
len(expected_bboxes) > 1: expected_bboxes = torch.stack(expected_bboxes) else: expected_bboxes = expected_bboxes[0] torch.testing.assert_close(output_bboxes, expected_bboxes, atol=1, rtol=0) - torch.testing.assert_close(output_image_size, expected_image_size, atol=1, rtol=0) + torch.testing.assert_close(output_spatial_size, expected_spatial_size, atol=1, rtol=0) @pytest.mark.parametrize("device", cpu_and_gpu()) @pytest.mark.parametrize("expand", [False]) # expand=True does not match D2 def test_correctness_rotate_bounding_box_on_fixed_input(device, expand): # Check transformation against known expected output - image_size = (64, 64) + spatial_size = (64, 64) # xyxy format in_boxes = [ [1, 1, 5, 5], - [1, image_size[0] - 6, 5, image_size[0] - 2], - [image_size[1] - 6, image_size[0] - 6, image_size[1] - 2, image_size[0] - 2], - [image_size[1] // 2 - 10, image_size[0] // 2 - 10, image_size[1] // 2 + 10, image_size[0] // 2 + 10], + [1, spatial_size[0] - 6, 5, spatial_size[0] - 2], + [spatial_size[1] - 6, spatial_size[0] - 6, spatial_size[1] - 2, spatial_size[0] - 2], + [spatial_size[1] // 2 - 10, spatial_size[0] // 2 - 10, spatial_size[1] // 2 + 10, spatial_size[0] // 2 + 10], ] in_boxes = features.BoundingBox( - in_boxes, format=features.BoundingBoxFormat.XYXY, image_size=image_size, dtype=torch.float64, device=device + in_boxes, format=features.BoundingBoxFormat.XYXY, spatial_size=spatial_size, dtype=torch.float64, device=device ) # Tested parameters angle = 45 @@ -535,7 +538,7 @@ def test_correctness_rotate_bounding_box_on_fixed_input(device, expand): output_boxes, _ = F.rotate_bounding_box( in_boxes, in_boxes.format, - in_boxes.image_size, + in_boxes.spatial_size, angle, expand=expand, center=center, @@ -593,11 +596,11 @@ def test_correctness_crop_bounding_box(device, format, top, left, height, width, [50.0, 5.0, 70.0, 22.0], [45.0, 46.0, 56.0, 62.0], ] - in_boxes = features.BoundingBox(in_boxes, format=features.BoundingBoxFormat.XYXY, image_size=size, device=device) + in_boxes = features.BoundingBox(in_boxes, format=features.BoundingBoxFormat.XYXY, spatial_size=size, device=device) if format != features.BoundingBoxFormat.XYXY: in_boxes = convert_format_bounding_box(in_boxes, features.BoundingBoxFormat.XYXY, format) - output_boxes, output_image_size = F.crop_bounding_box( + output_boxes, output_spatial_size = F.crop_bounding_box( in_boxes, format, top, @@ -610,7 +613,7 @@ def test_correctness_crop_bounding_box(device, format, top, left, height, width, output_boxes = convert_format_bounding_box(output_boxes, format, features.BoundingBoxFormat.XYXY) torch.testing.assert_close(output_boxes.tolist(), expected_bboxes) - torch.testing.assert_close(output_image_size, size) + torch.testing.assert_close(output_spatial_size, size) @pytest.mark.parametrize("device", cpu_and_gpu()) @@ -658,7 +661,7 @@ def _compute_expected_bbox(bbox, top_, left_, height_, width_, size_): bbox[3] = (bbox[3] - top_) * size_[0] / height_ return bbox - image_size = (100, 100) + spatial_size = (100, 100) # xyxy format in_boxes = [ [10.0, 10.0, 20.0, 20.0], @@ -670,18 +673,18 @@ def _compute_expected_bbox(bbox, top_, left_, height_, width_, size_): expected_bboxes = torch.tensor(expected_bboxes, device=device) in_boxes = features.BoundingBox( - in_boxes, format=features.BoundingBoxFormat.XYXY, image_size=image_size, device=device + in_boxes, format=features.BoundingBoxFormat.XYXY, spatial_size=spatial_size, device=device ) if format != features.BoundingBoxFormat.XYXY: in_boxes = convert_format_bounding_box(in_boxes, 
features.BoundingBoxFormat.XYXY, format) - output_boxes, output_image_size = F.resized_crop_bounding_box(in_boxes, format, top, left, height, width, size) + output_boxes, output_spatial_size = F.resized_crop_bounding_box(in_boxes, format, top, left, height, width, size) if format != features.BoundingBoxFormat.XYXY: output_boxes = convert_format_bounding_box(output_boxes, format, features.BoundingBoxFormat.XYXY) torch.testing.assert_close(output_boxes, expected_bboxes) - torch.testing.assert_close(output_image_size, size) + torch.testing.assert_close(output_spatial_size, size) def _parse_padding(padding): @@ -718,28 +721,28 @@ def _compute_expected_bbox(bbox, padding_): bbox = bbox.to(bbox_dtype) return bbox - def _compute_expected_image_size(bbox, padding_): + def _compute_expected_spatial_size(bbox, padding_): pad_left, pad_up, pad_right, pad_down = _parse_padding(padding_) - height, width = bbox.image_size + height, width = bbox.spatial_size return height + pad_up + pad_down, width + pad_left + pad_right for bboxes in make_bounding_boxes(): bboxes = bboxes.to(device) bboxes_format = bboxes.format - bboxes_image_size = bboxes.image_size + bboxes_spatial_size = bboxes.spatial_size - output_boxes, output_image_size = F.pad_bounding_box( - bboxes, format=bboxes_format, image_size=bboxes_image_size, padding=padding + output_boxes, output_spatial_size = F.pad_bounding_box( + bboxes, format=bboxes_format, spatial_size=bboxes_spatial_size, padding=padding ) - torch.testing.assert_close(output_image_size, _compute_expected_image_size(bboxes, padding)) + torch.testing.assert_close(output_spatial_size, _compute_expected_spatial_size(bboxes, padding)) if bboxes.ndim < 2 or bboxes.shape[0] == 0: bboxes = [bboxes] expected_bboxes = [] for bbox in bboxes: - bbox = features.BoundingBox(bbox, format=bboxes_format, image_size=bboxes_image_size) + bbox = features.BoundingBox(bbox, format=bboxes_format, spatial_size=bboxes_spatial_size) expected_bboxes.append(_compute_expected_bbox(bbox, padding)) if len(expected_bboxes) > 1: @@ -807,7 +810,7 @@ def _compute_expected_bbox(bbox, pcoeffs_): out_bbox = features.BoundingBox( np.array(out_bbox), format=features.BoundingBoxFormat.XYXY, - image_size=bbox.image_size, + spatial_size=bbox.spatial_size, dtype=bbox.dtype, device=bbox.device, ) @@ -815,15 +818,15 @@ def _compute_expected_bbox(bbox, pcoeffs_): out_bbox, old_format=features.BoundingBoxFormat.XYXY, new_format=bbox.format, copy=False ) - image_size = (32, 38) + spatial_size = (32, 38) pcoeffs = _get_perspective_coeffs(startpoints, endpoints) inv_pcoeffs = _get_perspective_coeffs(endpoints, startpoints) - for bboxes in make_bounding_boxes(image_size=image_size, extra_dims=((4,),)): + for bboxes in make_bounding_boxes(spatial_size=spatial_size, extra_dims=((4,),)): bboxes = bboxes.to(device) bboxes_format = bboxes.format - bboxes_image_size = bboxes.image_size + bboxes_spatial_size = bboxes.spatial_size output_bboxes = F.perspective_bounding_box( bboxes, @@ -836,7 +839,7 @@ def _compute_expected_bbox(bbox, pcoeffs_): expected_bboxes = [] for bbox in bboxes: - bbox = features.BoundingBox(bbox, format=bboxes_format, image_size=bboxes_image_size) + bbox = features.BoundingBox(bbox, format=bboxes_format, spatial_size=bboxes_spatial_size) expected_bboxes.append(_compute_expected_bbox(bbox, inv_pcoeffs)) if len(expected_bboxes) > 1: expected_bboxes = torch.stack(expected_bboxes) @@ -853,14 +856,14 @@ def _compute_expected_bbox(bbox, pcoeffs_): def test_correctness_center_crop_bounding_box(device, output_size): def 
_compute_expected_bbox(bbox, output_size_): format_ = bbox.format - image_size_ = bbox.image_size + spatial_size_ = bbox.spatial_size bbox = convert_format_bounding_box(bbox, format_, features.BoundingBoxFormat.XYWH) if len(output_size_) == 1: output_size_.append(output_size_[-1]) - cy = int(round((image_size_[0] - output_size_[0]) * 0.5)) - cx = int(round((image_size_[1] - output_size_[1]) * 0.5)) + cy = int(round((spatial_size_[0] - output_size_[0]) * 0.5)) + cx = int(round((spatial_size_[1] - output_size_[1]) * 0.5)) out_bbox = [ bbox[0].item() - cx, bbox[1].item() - cy, @@ -870,7 +873,7 @@ def _compute_expected_bbox(bbox, output_size_): out_bbox = features.BoundingBox( out_bbox, format=features.BoundingBoxFormat.XYWH, - image_size=output_size_, + spatial_size=output_size_, dtype=bbox.dtype, device=bbox.device, ) @@ -879,10 +882,10 @@ def _compute_expected_bbox(bbox, output_size_): for bboxes in make_bounding_boxes(extra_dims=((4,),)): bboxes = bboxes.to(device) bboxes_format = bboxes.format - bboxes_image_size = bboxes.image_size + bboxes_spatial_size = bboxes.spatial_size - output_boxes, output_image_size = F.center_crop_bounding_box( - bboxes, bboxes_format, bboxes_image_size, output_size + output_boxes, output_spatial_size = F.center_crop_bounding_box( + bboxes, bboxes_format, bboxes_spatial_size, output_size ) if bboxes.ndim < 2: @@ -890,7 +893,7 @@ def _compute_expected_bbox(bbox, output_size_): expected_bboxes = [] for bbox in bboxes: - bbox = features.BoundingBox(bbox, format=bboxes_format, image_size=bboxes_image_size) + bbox = features.BoundingBox(bbox, format=bboxes_format, spatial_size=bboxes_spatial_size) expected_bboxes.append(_compute_expected_bbox(bbox, output_size)) if len(expected_bboxes) > 1: @@ -898,7 +901,7 @@ def _compute_expected_bbox(bbox, output_size_): else: expected_bboxes = expected_bboxes[0] torch.testing.assert_close(output_boxes, expected_bboxes) - torch.testing.assert_close(output_image_size, output_size) + torch.testing.assert_close(output_spatial_size, output_size) @pytest.mark.parametrize("device", cpu_and_gpu()) @@ -926,11 +929,11 @@ def _compute_expected_mask(mask, output_size): # Copied from test/test_functional_tensor.py @pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("image_size", ("small", "large")) +@pytest.mark.parametrize("spatial_size", ("small", "large")) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) @pytest.mark.parametrize("ksize", [(3, 3), [3, 5], (23, 23)]) @pytest.mark.parametrize("sigma", [[0.5, 0.5], (0.5, 0.5), (0.8, 0.8), (1.7, 1.7)]) -def test_correctness_gaussian_blur_image_tensor(device, image_size, dt, ksize, sigma): +def test_correctness_gaussian_blur_image_tensor(device, spatial_size, dt, ksize, sigma): fn = F.gaussian_blur_image_tensor # true_cv2_results = { @@ -950,7 +953,7 @@ def test_correctness_gaussian_blur_image_tensor(device, image_size, dt, ksize, s p = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "gaussian_blur_opencv_results.pt") true_cv2_results = torch.load(p) - if image_size == "small": + if spatial_size == "small": tensor = ( torch.from_numpy(np.arange(3 * 10 * 12, dtype="uint8").reshape((10, 12, 3))).permute(2, 0, 1).to(device) ) diff --git a/test/test_prototype_transforms_utils.py b/test/test_prototype_transforms_utils.py index 9a8ed67dde2..3d5960c9625 100644 --- a/test/test_prototype_transforms_utils.py +++ b/test/test_prototype_transforms_utils.py @@ -11,8 +11,8 @@ IMAGE = make_image(color_space=features.ColorSpace.RGB) 
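# --- illustration, not part of the diff -------------------------------------
# Hedged sketch of the kernel convention exercised by the functional tests
# above: the bounding-box kernels now take `spatial_size` (formerly
# `image_size`) and return the transformed boxes together with the resulting
# spatial size. The concrete values below are made up; only names that appear
# in this diff are used.
import torch
from torchvision.prototype import features
from torchvision.prototype.transforms import functional as F

boxes = features.BoundingBox(
    torch.tensor([[1.0, 1.0, 5.0, 5.0]]),
    format=features.BoundingBoxFormat.XYXY,
    spatial_size=(10, 10),
)
out_boxes, out_spatial_size = F.resize_bounding_box(
    boxes, spatial_size=boxes.spatial_size, size=[20, 20]
)
# out_spatial_size reflects the new size, here (20, 20); out_boxes are scaled
# by the same ratio.
# -----------------------------------------------------------------------------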
-BOUNDING_BOX = make_bounding_box(format=features.BoundingBoxFormat.XYXY, image_size=IMAGE.image_size) -MASK = make_detection_mask(size=IMAGE.image_size) +BOUNDING_BOX = make_bounding_box(format=features.BoundingBoxFormat.XYXY, spatial_size=IMAGE.spatial_size) +MASK = make_detection_mask(size=IMAGE.spatial_size) @pytest.mark.parametrize( diff --git a/torchvision/prototype/datasets/_builtin/caltech.py b/torchvision/prototype/datasets/_builtin/caltech.py index a00bf2e2cc9..29ed162ccea 100644 --- a/torchvision/prototype/datasets/_builtin/caltech.py +++ b/torchvision/prototype/datasets/_builtin/caltech.py @@ -110,7 +110,9 @@ def _prepare_sample( image=image, ann_path=ann_path, bounding_box=BoundingBox( - ann["box_coord"].astype(np.int64).squeeze()[[2, 0, 3, 1]], format="xyxy", image_size=image.image_size + ann["box_coord"].astype(np.int64).squeeze()[[2, 0, 3, 1]], + format="xyxy", + spatial_size=image.spatial_size, ), contour=_Feature(ann["obj_contour"].T), ) diff --git a/torchvision/prototype/datasets/_builtin/celeba.py b/torchvision/prototype/datasets/_builtin/celeba.py index a0a0218458b..3382b62b6ce 100644 --- a/torchvision/prototype/datasets/_builtin/celeba.py +++ b/torchvision/prototype/datasets/_builtin/celeba.py @@ -144,7 +144,7 @@ def _prepare_sample( bounding_box=BoundingBox( [int(bounding_box[key]) for key in ("x_1", "y_1", "width", "height")], format="xywh", - image_size=image.image_size, + spatial_size=image.spatial_size, ), landmarks={ landmark: _Feature((int(landmarks[f"{landmark}_x"]), int(landmarks[f"{landmark}_y"]))) diff --git a/torchvision/prototype/datasets/_builtin/coco.py b/torchvision/prototype/datasets/_builtin/coco.py index 16a16998bf7..72d76f48783 100644 --- a/torchvision/prototype/datasets/_builtin/coco.py +++ b/torchvision/prototype/datasets/_builtin/coco.py @@ -97,25 +97,29 @@ def _resources(self) -> List[OnlineResource]: ) return [images, meta] - def _segmentation_to_mask(self, segmentation: Any, *, is_crowd: bool, image_size: Tuple[int, int]) -> torch.Tensor: + def _segmentation_to_mask( + self, segmentation: Any, *, is_crowd: bool, spatial_size: Tuple[int, int] + ) -> torch.Tensor: from pycocotools import mask if is_crowd: - segmentation = mask.frPyObjects(segmentation, *image_size) + segmentation = mask.frPyObjects(segmentation, *spatial_size) else: - segmentation = mask.merge(mask.frPyObjects(segmentation, *image_size)) + segmentation = mask.merge(mask.frPyObjects(segmentation, *spatial_size)) return torch.from_numpy(mask.decode(segmentation)).to(torch.bool) def _decode_instances_anns(self, anns: List[Dict[str, Any]], image_meta: Dict[str, Any]) -> Dict[str, Any]: - image_size = (image_meta["height"], image_meta["width"]) + spatial_size = (image_meta["height"], image_meta["width"]) labels = [ann["category_id"] for ann in anns] return dict( # TODO: create a segmentation feature segmentations=_Feature( torch.stack( [ - self._segmentation_to_mask(ann["segmentation"], is_crowd=ann["iscrowd"], image_size=image_size) + self._segmentation_to_mask( + ann["segmentation"], is_crowd=ann["iscrowd"], spatial_size=spatial_size + ) for ann in anns ] ) @@ -125,7 +129,7 @@ def _decode_instances_anns(self, anns: List[Dict[str, Any]], image_meta: Dict[st bounding_boxes=BoundingBox( [ann["bbox"] for ann in anns], format="xywh", - image_size=image_size, + spatial_size=spatial_size, ), labels=Label(labels, categories=self._categories), super_categories=[self._category_to_super_category[self._categories[label]] for label in labels], diff --git 
a/torchvision/prototype/datasets/_builtin/cub200.py b/torchvision/prototype/datasets/_builtin/cub200.py index f1531615c23..9c32d96f960 100644 --- a/torchvision/prototype/datasets/_builtin/cub200.py +++ b/torchvision/prototype/datasets/_builtin/cub200.py @@ -130,13 +130,13 @@ def _2011_segmentation_key(self, data: Tuple[str, Any]) -> str: return path.with_suffix(".jpg").name def _2011_prepare_ann( - self, data: Tuple[str, Tuple[List[str], Tuple[str, BinaryIO]]], image_size: Tuple[int, int] + self, data: Tuple[str, Tuple[List[str], Tuple[str, BinaryIO]]], spatial_size: Tuple[int, int] ) -> Dict[str, Any]: _, (bounding_box_data, segmentation_data) = data segmentation_path, segmentation_buffer = segmentation_data return dict( bounding_box=BoundingBox( - [float(part) for part in bounding_box_data[1:]], format="xywh", image_size=image_size + [float(part) for part in bounding_box_data[1:]], format="xywh", spatial_size=spatial_size ), segmentation_path=segmentation_path, segmentation=EncodedImage.from_file(segmentation_buffer), @@ -149,7 +149,9 @@ def _2010_anns_key(self, data: Tuple[str, BinaryIO]) -> Tuple[str, Tuple[str, Bi path = pathlib.Path(data[0]) return path.with_suffix(".jpg").name, data - def _2010_prepare_ann(self, data: Tuple[str, Tuple[str, BinaryIO]], image_size: Tuple[int, int]) -> Dict[str, Any]: + def _2010_prepare_ann( + self, data: Tuple[str, Tuple[str, BinaryIO]], spatial_size: Tuple[int, int] + ) -> Dict[str, Any]: _, (path, buffer) = data content = read_mat(buffer) return dict( @@ -157,7 +159,7 @@ def _2010_prepare_ann(self, data: Tuple[str, Tuple[str, BinaryIO]], image_size: bounding_box=BoundingBox( [int(content["bbox"][coord]) for coord in ("left", "bottom", "right", "top")], format="xyxy", - image_size=image_size, + spatial_size=spatial_size, ), segmentation=_Feature(content["seg"]), ) @@ -175,7 +177,7 @@ def _prepare_sample( image = EncodedImage.from_file(buffer) return dict( - prepare_ann_fn(anns_data, image.image_size), + prepare_ann_fn(anns_data, image.spatial_size), image=image, label=Label( int(pathlib.Path(path).parent.name.rsplit(".", 1)[0]) - 1, diff --git a/torchvision/prototype/datasets/_builtin/gtsrb.py b/torchvision/prototype/datasets/_builtin/gtsrb.py index 8dc0a8240c8..e11dc2bb4ca 100644 --- a/torchvision/prototype/datasets/_builtin/gtsrb.py +++ b/torchvision/prototype/datasets/_builtin/gtsrb.py @@ -78,7 +78,7 @@ def _prepare_sample(self, data: Tuple[Tuple[str, Any], Dict[str, Any]]) -> Dict[ bounding_box = BoundingBox( [int(csv_info[k]) for k in ("Roi.X1", "Roi.Y1", "Roi.X2", "Roi.Y2")], format="xyxy", - image_size=(int(csv_info["Height"]), int(csv_info["Width"])), + spatial_size=(int(csv_info["Height"]), int(csv_info["Width"])), ) return { diff --git a/torchvision/prototype/datasets/_builtin/stanford_cars.py b/torchvision/prototype/datasets/_builtin/stanford_cars.py index 011204f2bfb..a0e7a377e48 100644 --- a/torchvision/prototype/datasets/_builtin/stanford_cars.py +++ b/torchvision/prototype/datasets/_builtin/stanford_cars.py @@ -89,7 +89,7 @@ def _prepare_sample(self, data: Tuple[Tuple[str, BinaryIO], Tuple[int, int, int, path=path, image=image, label=Label(target[4] - 1, categories=self._categories), - bounding_box=BoundingBox(target[:4], format="xyxy", image_size=image.image_size), + bounding_box=BoundingBox(target[:4], format="xyxy", spatial_size=image.spatial_size), ) def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]: diff --git a/torchvision/prototype/datasets/_builtin/voc.py 
b/torchvision/prototype/datasets/_builtin/voc.py index 84a9b3a7f51..8db82b4aac3 100644 --- a/torchvision/prototype/datasets/_builtin/voc.py +++ b/torchvision/prototype/datasets/_builtin/voc.py @@ -108,7 +108,7 @@ def _prepare_detection_ann(self, buffer: BinaryIO) -> Dict[str, Any]: for instance in instances ], format="xyxy", - image_size=cast(Tuple[int, int], tuple(int(anns["size"][dim]) for dim in ("height", "width"))), + spatial_size=cast(Tuple[int, int], tuple(int(anns["size"][dim]) for dim in ("height", "width"))), ), labels=Label( [self._categories.index(instance["name"]) for instance in instances], categories=self._categories diff --git a/torchvision/prototype/features/_bounding_box.py b/torchvision/prototype/features/_bounding_box.py index 7b69af5f9bb..18c607d4d16 100644 --- a/torchvision/prototype/features/_bounding_box.py +++ b/torchvision/prototype/features/_bounding_box.py @@ -17,13 +17,13 @@ class BoundingBoxFormat(StrEnum): class BoundingBox(_Feature): format: BoundingBoxFormat - image_size: Tuple[int, int] + spatial_size: Tuple[int, int] @classmethod - def _wrap(cls, tensor: torch.Tensor, *, format: BoundingBoxFormat, image_size: Tuple[int, int]) -> BoundingBox: + def _wrap(cls, tensor: torch.Tensor, *, format: BoundingBoxFormat, spatial_size: Tuple[int, int]) -> BoundingBox: bounding_box = tensor.as_subclass(cls) bounding_box.format = format - bounding_box.image_size = image_size + bounding_box.spatial_size = spatial_size return bounding_box def __new__( @@ -31,7 +31,7 @@ def __new__( data: Any, *, format: Union[BoundingBoxFormat, str], - image_size: Tuple[int, int], + spatial_size: Tuple[int, int], dtype: Optional[torch.dtype] = None, device: Optional[Union[torch.device, str, int]] = None, requires_grad: bool = False, @@ -41,7 +41,7 @@ def __new__( if isinstance(format, str): format = BoundingBoxFormat.from_str(format.upper()) - return cls._wrap(tensor, format=format, image_size=image_size) + return cls._wrap(tensor, format=format, spatial_size=spatial_size) @classmethod def wrap_like( @@ -50,16 +50,16 @@ def wrap_like( tensor: torch.Tensor, *, format: Optional[BoundingBoxFormat] = None, - image_size: Optional[Tuple[int, int]] = None, + spatial_size: Optional[Tuple[int, int]] = None, ) -> BoundingBox: return cls._wrap( tensor, format=format if format is not None else other.format, - image_size=image_size if image_size is not None else other.image_size, + spatial_size=spatial_size if spatial_size is not None else other.spatial_size, ) def __repr__(self, *, tensor_contents: Any = None) -> str: # type: ignore[override] - return self._make_repr(format=self.format, image_size=self.image_size) + return self._make_repr(format=self.format, spatial_size=self.spatial_size) def to_format(self, format: Union[str, BoundingBoxFormat]) -> BoundingBox: if isinstance(format, str): @@ -70,11 +70,11 @@ def to_format(self, format: Union[str, BoundingBoxFormat]) -> BoundingBox: ) def horizontal_flip(self) -> BoundingBox: - output = self._F.horizontal_flip_bounding_box(self, format=self.format, image_size=self.image_size) + output = self._F.horizontal_flip_bounding_box(self, format=self.format, spatial_size=self.spatial_size) return BoundingBox.wrap_like(self, output) def vertical_flip(self) -> BoundingBox: - output = self._F.vertical_flip_bounding_box(self, format=self.format, image_size=self.image_size) + output = self._F.vertical_flip_bounding_box(self, format=self.format, spatial_size=self.spatial_size) return BoundingBox.wrap_like(self, output) def resize( # type: ignore[override] @@ -84,20 
+84,22 @@ def resize( # type: ignore[override] max_size: Optional[int] = None, antialias: bool = False, ) -> BoundingBox: - output, image_size = self._F.resize_bounding_box(self, image_size=self.image_size, size=size, max_size=max_size) - return BoundingBox.wrap_like(self, output, image_size=image_size) + output, spatial_size = self._F.resize_bounding_box( + self, spatial_size=self.spatial_size, size=size, max_size=max_size + ) + return BoundingBox.wrap_like(self, output, spatial_size=spatial_size) def crop(self, top: int, left: int, height: int, width: int) -> BoundingBox: - output, image_size = self._F.crop_bounding_box( + output, spatial_size = self._F.crop_bounding_box( self, self.format, top=top, left=left, height=height, width=width ) - return BoundingBox.wrap_like(self, output, image_size=image_size) + return BoundingBox.wrap_like(self, output, spatial_size=spatial_size) def center_crop(self, output_size: List[int]) -> BoundingBox: - output, image_size = self._F.center_crop_bounding_box( - self, format=self.format, image_size=self.image_size, output_size=output_size + output, spatial_size = self._F.center_crop_bounding_box( + self, format=self.format, spatial_size=self.spatial_size, output_size=output_size ) - return BoundingBox.wrap_like(self, output, image_size=image_size) + return BoundingBox.wrap_like(self, output, spatial_size=spatial_size) def resized_crop( self, @@ -109,8 +111,8 @@ def resized_crop( interpolation: InterpolationMode = InterpolationMode.BILINEAR, antialias: bool = False, ) -> BoundingBox: - output, image_size = self._F.resized_crop_bounding_box(self, self.format, top, left, height, width, size=size) - return BoundingBox.wrap_like(self, output, image_size=image_size) + output, spatial_size = self._F.resized_crop_bounding_box(self, self.format, top, left, height, width, size=size) + return BoundingBox.wrap_like(self, output, spatial_size=spatial_size) def pad( self, @@ -118,10 +120,10 @@ def pad( fill: FillTypeJIT = None, padding_mode: str = "constant", ) -> BoundingBox: - output, image_size = self._F.pad_bounding_box( - self, format=self.format, image_size=self.image_size, padding=padding, padding_mode=padding_mode + output, spatial_size = self._F.pad_bounding_box( + self, format=self.format, spatial_size=self.spatial_size, padding=padding, padding_mode=padding_mode ) - return BoundingBox.wrap_like(self, output, image_size=image_size) + return BoundingBox.wrap_like(self, output, spatial_size=spatial_size) def rotate( self, @@ -131,10 +133,10 @@ def rotate( fill: FillTypeJIT = None, center: Optional[List[float]] = None, ) -> BoundingBox: - output, image_size = self._F.rotate_bounding_box( - self, format=self.format, image_size=self.image_size, angle=angle, expand=expand, center=center + output, spatial_size = self._F.rotate_bounding_box( + self, format=self.format, spatial_size=self.spatial_size, angle=angle, expand=expand, center=center ) - return BoundingBox.wrap_like(self, output, image_size=image_size) + return BoundingBox.wrap_like(self, output, spatial_size=spatial_size) def affine( self, @@ -149,7 +151,7 @@ def affine( output = self._F.affine_bounding_box( self, self.format, - self.image_size, + self.spatial_size, angle, translate=translate, scale=scale, diff --git a/torchvision/prototype/features/_encoded.py b/torchvision/prototype/features/_encoded.py index 4b963986b4f..9347b4eca6e 100644 --- a/torchvision/prototype/features/_encoded.py +++ b/torchvision/prototype/features/_encoded.py @@ -49,12 +49,12 @@ def from_path(cls: Type[D], path: Union[str, 
os.PathLike], **kwargs: Any) -> D: class EncodedImage(EncodedData): # TODO: Use @functools.cached_property if we can depend on Python 3.8 @property - def image_size(self) -> Tuple[int, int]: - if not hasattr(self, "_image_size"): + def spatial_size(self) -> Tuple[int, int]: + if not hasattr(self, "_spatial_size"): with PIL.Image.open(ReadOnlyTensorBuffer(self)) as image: - self._image_size = image.height, image.width + self._spatial_size = image.height, image.width - return self._image_size + return self._spatial_size class EncodedVideo(EncodedData): diff --git a/torchvision/prototype/features/_image.py b/torchvision/prototype/features/_image.py index 23f81678d79..6d52a178b84 100644 --- a/torchvision/prototype/features/_image.py +++ b/torchvision/prototype/features/_image.py @@ -105,7 +105,7 @@ def __repr__(self, *, tensor_contents: Any = None) -> str: # type: ignore[overr return self._make_repr(color_space=self.color_space) @property - def image_size(self) -> Tuple[int, int]: + def spatial_size(self) -> Tuple[int, int]: return cast(Tuple[int, int], tuple(self.shape[-2:])) @property diff --git a/torchvision/prototype/features/_mask.py b/torchvision/prototype/features/_mask.py index 7b49ce8e85e..2da10195e80 100644 --- a/torchvision/prototype/features/_mask.py +++ b/torchvision/prototype/features/_mask.py @@ -33,7 +33,7 @@ def wrap_like( return cls._wrap(tensor) @property - def image_size(self) -> Tuple[int, int]: + def spatial_size(self) -> Tuple[int, int]: return cast(Tuple[int, int], tuple(self.shape[-2:])) def horizontal_flip(self) -> Mask: diff --git a/torchvision/prototype/features/_video.py b/torchvision/prototype/features/_video.py index e32c36d5d9f..ca4253c73bb 100644 --- a/torchvision/prototype/features/_video.py +++ b/torchvision/prototype/features/_video.py @@ -54,9 +54,8 @@ def wrap_like(cls, other: Video, tensor: torch.Tensor, *, color_space: Optional[ def __repr__(self, *, tensor_contents: Any = None) -> str: # type: ignore[override] return self._make_repr(color_space=self.color_space) - # TODO: rename this (and all instances of this term to spatial size) @property - def image_size(self) -> Tuple[int, int]: + def spatial_size(self) -> Tuple[int, int]: return cast(Tuple[int, int], tuple(self.shape[-2:])) @property diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index 4bfb5c9ed1e..f0e52738597 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -11,7 +11,7 @@ from torchvision.prototype.transforms import functional as F, InterpolationMode from ._transform import _RandomApplyTransform -from ._utils import has_any, query_chw +from ._utils import has_any, query_chw, query_spatial_size class RandomErasing(_RandomApplyTransform): @@ -153,7 +153,7 @@ class RandomCutmix(_BaseMixupCutmix): def _get_params(self, sample: Any) -> Dict[str, Any]: lam = float(self._dist.sample(())) - _, H, W = query_chw(sample) + H, W = query_spatial_size(sample) r_x = torch.randint(W, ()) r_y = torch.randint(H, ()) diff --git a/torchvision/prototype/transforms/_color.py b/torchvision/prototype/transforms/_color.py index 340e721dab9..616669cc8a3 100644 --- a/torchvision/prototype/transforms/_color.py +++ b/torchvision/prototype/transforms/_color.py @@ -100,7 +100,7 @@ def __init__( self.p = p def _get_params(self, sample: Any) -> Dict[str, Any]: - num_channels, _, _ = query_chw(sample) + num_channels, *_ = query_chw(sample) return dict( zip( ["brightness", "contrast1", "saturation", "hue", 
"contrast2"], diff --git a/torchvision/prototype/transforms/_deprecated.py b/torchvision/prototype/transforms/_deprecated.py index f8aec22b96c..0cc4a90c42f 100644 --- a/torchvision/prototype/transforms/_deprecated.py +++ b/torchvision/prototype/transforms/_deprecated.py @@ -78,7 +78,7 @@ def __init__(self, p: float = 0.1) -> None: super().__init__(p=p) def _get_params(self, sample: Any) -> Dict[str, Any]: - num_input_channels, _, _ = query_chw(sample) + num_input_channels, *_ = query_chw(sample) return dict(num_input_channels=num_input_channels) def _transform(self, inpt: features.ImageOrVideoType, params: Dict[str, Any]) -> features.ImageOrVideoType: diff --git a/torchvision/prototype/transforms/_geometry.py b/torchvision/prototype/transforms/_geometry.py index 371ea7f69c5..91d7c294ebc 100644 --- a/torchvision/prototype/transforms/_geometry.py +++ b/torchvision/prototype/transforms/_geometry.py @@ -24,7 +24,7 @@ has_all, has_any, query_bounding_box, - query_chw, + query_spatial_size, ) @@ -105,10 +105,7 @@ def __init__( self._log_ratio = torch.log(torch.tensor(self.ratio)) def _get_params(self, sample: Any) -> Dict[str, Any]: - # vfdev-5: techically, this op can work on bboxes/segm masks only inputs without image in samples - # What if we have multiple images/bboxes/masks of different sizes ? - # TODO: let's support bbox or mask in samples without image - _, height, width = query_chw(sample) + height, width = query_spatial_size(sample) area = height * width log_ratio = self._log_ratio @@ -263,7 +260,7 @@ def __init__( raise ValueError(f"Invalid canvas side range provided {side_range}.") def _get_params(self, sample: Any) -> Dict[str, Any]: - _, orig_h, orig_w = query_chw(sample) + orig_h, orig_w = query_spatial_size(sample) r = self.side_range[0] + torch.rand(1) * (self.side_range[1] - self.side_range[0]) canvas_width = int(orig_w * r) @@ -362,10 +359,7 @@ def __init__( self.center = center def _get_params(self, sample: Any) -> Dict[str, Any]: - - # Get image size - # TODO: make it work with bboxes and segm masks - _, height, width = query_chw(sample) + height, width = query_spatial_size(sample) angle = float(torch.empty(1).uniform_(float(self.degrees[0]), float(self.degrees[1])).item()) if self.translate is not None: @@ -427,7 +421,7 @@ def __init__( self.padding_mode = padding_mode def _get_params(self, sample: Any) -> Dict[str, Any]: - _, padded_height, padded_width = query_chw(sample) + padded_height, padded_width = query_spatial_size(sample) if self.padding is not None: pad_left, pad_right, pad_top, pad_bottom = self.padding @@ -515,9 +509,7 @@ def __init__( self.fill = _setup_fill_arg(fill) def _get_params(self, sample: Any) -> Dict[str, Any]: - # Get image size - # TODO: make it work with bboxes and segm masks - _, height, width = query_chw(sample) + height, width = query_spatial_size(sample) distortion_scale = self.distortion_scale @@ -571,9 +563,7 @@ def __init__( self.fill = _setup_fill_arg(fill) def _get_params(self, sample: Any) -> Dict[str, Any]: - # Get image size - # TODO: make it work with bboxes and segm masks - _, *size = query_chw(sample) + size = list(query_spatial_size(sample)) dx = torch.rand([1, 1] + size) * 2 - 1 if self.sigma[0] > 0.0: @@ -628,7 +618,7 @@ def __init__( self.trials = trials def _get_params(self, sample: Any) -> Dict[str, Any]: - _, orig_h, orig_w = query_chw(sample) + orig_h, orig_w = query_spatial_size(sample) bboxes = query_bounding_box(sample) while True: @@ -690,7 +680,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: if 
isinstance(output, features.BoundingBox): bboxes = output[is_within_crop_area] - bboxes = F.clamp_bounding_box(bboxes, output.format, output.image_size) + bboxes = F.clamp_bounding_box(bboxes, output.format, output.spatial_size) output = features.BoundingBox.wrap_like(output, bboxes) elif isinstance(output, features.Mask): # apply is_within_crop_area if mask is one-hot encoded @@ -727,7 +717,7 @@ def __init__( self.antialias = antialias def _get_params(self, sample: Any) -> Dict[str, Any]: - _, orig_height, orig_width = query_chw(sample) + orig_height, orig_width = query_spatial_size(sample) scale = self.scale_range[0] + torch.rand(1) * (self.scale_range[1] - self.scale_range[0]) r = min(self.target_size[1] / orig_height, self.target_size[0] / orig_width) * scale @@ -755,7 +745,7 @@ def __init__( self.antialias = antialias def _get_params(self, sample: Any) -> Dict[str, Any]: - _, orig_height, orig_width = query_chw(sample) + orig_height, orig_width = query_spatial_size(sample) min_size = self.min_size[int(torch.randint(len(self.min_size), ()))] r = min(min_size / min(orig_height, orig_width), self.max_size / max(orig_height, orig_width)) @@ -786,7 +776,7 @@ def __init__( self.padding_mode = padding_mode def _get_params(self, sample: Any) -> Dict[str, Any]: - _, height, width = query_chw(sample) + height, width = query_spatial_size(sample) new_height = min(height, self.crop_height) new_width = min(width, self.crop_width) @@ -811,7 +801,7 @@ def _get_params(self, sample: Any) -> Dict[str, Any]: bounding_boxes = features.BoundingBox.wrap_like( bounding_boxes, F.clamp_bounding_box( - bounding_boxes, format=bounding_boxes.format, image_size=bounding_boxes.image_size + bounding_boxes, format=bounding_boxes.format, spatial_size=bounding_boxes.spatial_size ), ) height_and_width = bounding_boxes.to_format(features.BoundingBoxFormat.XYWH)[..., 2:] @@ -851,7 +841,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: elif isinstance(inpt, features.BoundingBox): inpt = features.BoundingBox.wrap_like( inpt, - F.clamp_bounding_box(inpt[params["is_valid"]], format=inpt.format, image_size=inpt.image_size), + F.clamp_bounding_box(inpt[params["is_valid"]], format=inpt.format, spatial_size=inpt.spatial_size), ) if params["needs_pad"]: diff --git a/torchvision/prototype/transforms/_meta.py b/torchvision/prototype/transforms/_meta.py index e5c7d05b017..dc109269f79 100644 --- a/torchvision/prototype/transforms/_meta.py +++ b/torchvision/prototype/transforms/_meta.py @@ -68,5 +68,5 @@ class ClampBoundingBoxes(Transform): _transformed_types = (features.BoundingBox,) def _transform(self, inpt: features.BoundingBox, params: Dict[str, Any]) -> features.BoundingBox: - output = F.clamp_bounding_box(inpt, format=inpt.format, image_size=inpt.image_size) + output = F.clamp_bounding_box(inpt, format=inpt.format, spatial_size=inpt.spatial_size) return features.BoundingBox.wrap_like(inpt, output) diff --git a/torchvision/prototype/transforms/_utils.py b/torchvision/prototype/transforms/_utils.py index a3980fa2154..53b27f2e2a3 100644 --- a/torchvision/prototype/transforms/_utils.py +++ b/torchvision/prototype/transforms/_utils.py @@ -10,7 +10,7 @@ from torchvision.prototype import features from torchvision.prototype.features._feature import FillType -from torchvision.prototype.transforms.functional._meta import get_dimensions +from torchvision.prototype.transforms.functional._meta import get_dimensions, get_spatial_size from torchvision.transforms.transforms import _check_sequence_input, _setup_angle, 
_setup_size # noqa: F401 from typing_extensions import Literal @@ -98,6 +98,22 @@ def query_chw(sample: Any) -> Tuple[int, int, int]: return c, h, w +def query_spatial_size(sample: Any) -> Tuple[int, int]: + flat_sample, _ = tree_flatten(sample) + sizes = { + tuple(get_spatial_size(item)) + for item in flat_sample + if isinstance(item, (features.Image, PIL.Image.Image, features.Video, features.Mask, features.BoundingBox)) + or features.is_simple_tensor(item) + } + if not sizes: + raise TypeError("No image, video, mask or bounding box was found in the sample") + elif len(sizes) > 1: + raise ValueError(f"Found multiple HxW dimensions in the sample: {sequence_to_str(sorted(sizes))}") + h, w = sizes.pop() + return h, w + + def _isinstance(obj: Any, types_or_checks: Tuple[Union[Type, Callable[[Any], bool]], ...]) -> bool: for type_or_check in types_or_checks: if isinstance(obj, type_or_check) if isinstance(type_or_check, type) else type_or_check(obj): diff --git a/torchvision/prototype/transforms/functional/__init__.py b/torchvision/prototype/transforms/functional/__init__.py index 579442dc7b9..fb72e7b57a3 100644 --- a/torchvision/prototype/transforms/functional/__init__.py +++ b/torchvision/prototype/transforms/functional/__init__.py @@ -11,12 +11,18 @@ get_dimensions_image_tensor, get_dimensions_image_pil, get_dimensions, + get_num_frames_video, + get_num_frames, get_image_num_channels, get_num_channels_image_tensor, get_num_channels_image_pil, + get_num_channels_video, get_num_channels, + get_spatial_size_bounding_box, get_spatial_size_image_tensor, get_spatial_size_image_pil, + get_spatial_size_mask, + get_spatial_size_video, get_spatial_size, ) # usort: skip diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py index 44b4986aba0..590a13310a2 100644 --- a/torchvision/prototype/transforms/functional/_geometry.py +++ b/torchvision/prototype/transforms/functional/_geometry.py @@ -32,7 +32,7 @@ def horizontal_flip_mask(mask: torch.Tensor) -> torch.Tensor: def horizontal_flip_bounding_box( - bounding_box: torch.Tensor, format: features.BoundingBoxFormat, image_size: Tuple[int, int] + bounding_box: torch.Tensor, format: features.BoundingBoxFormat, spatial_size: Tuple[int, int] ) -> torch.Tensor: shape = bounding_box.shape @@ -40,7 +40,7 @@ def horizontal_flip_bounding_box( bounding_box, old_format=format, new_format=features.BoundingBoxFormat.XYXY ).view(-1, 4) - bounding_box[:, [0, 2]] = image_size[1] - bounding_box[:, [2, 0]] + bounding_box[:, [0, 2]] = spatial_size[1] - bounding_box[:, [2, 0]] return convert_format_bounding_box( bounding_box, old_format=features.BoundingBoxFormat.XYXY, new_format=format, copy=False @@ -69,7 +69,7 @@ def vertical_flip_mask(mask: torch.Tensor) -> torch.Tensor: def vertical_flip_bounding_box( - bounding_box: torch.Tensor, format: features.BoundingBoxFormat, image_size: Tuple[int, int] + bounding_box: torch.Tensor, format: features.BoundingBoxFormat, spatial_size: Tuple[int, int] ) -> torch.Tensor: shape = bounding_box.shape @@ -77,7 +77,7 @@ def vertical_flip_bounding_box( bounding_box, old_format=format, new_format=features.BoundingBoxFormat.XYXY ).view(-1, 4) - bounding_box[:, [1, 3]] = image_size[0] - bounding_box[:, [3, 1]] + bounding_box[:, [1, 3]] = spatial_size[0] - bounding_box[:, [3, 1]] return convert_format_bounding_box( bounding_box, old_format=features.BoundingBoxFormat.XYXY, new_format=format, copy=False @@ -104,11 +104,11 @@ def vertical_flip(inpt: features.InputTypeJIT) -> 
features.InputTypeJIT: def _compute_resized_output_size( - image_size: Tuple[int, int], size: List[int], max_size: Optional[int] = None + spatial_size: Tuple[int, int], size: List[int], max_size: Optional[int] = None ) -> List[int]: if isinstance(size, int): size = [size] - return __compute_resized_output_size(image_size, size=size, max_size=max_size) + return __compute_resized_output_size(spatial_size, size=size, max_size=max_size) def resize_image_tensor( @@ -162,10 +162,10 @@ def resize_mask(mask: torch.Tensor, size: List[int], max_size: Optional[int] = N def resize_bounding_box( - bounding_box: torch.Tensor, image_size: Tuple[int, int], size: List[int], max_size: Optional[int] = None + bounding_box: torch.Tensor, spatial_size: Tuple[int, int], size: List[int], max_size: Optional[int] = None ) -> Tuple[torch.Tensor, Tuple[int, int]]: - old_height, old_width = image_size - new_height, new_width = _compute_resized_output_size(image_size, size=size, max_size=max_size) + old_height, old_width = spatial_size + new_height, new_width = _compute_resized_output_size(spatial_size, size=size, max_size=max_size) ratios = torch.tensor((new_width / old_width, new_height / old_height), device=bounding_box.device) return ( bounding_box.view(-1, 2, 2).mul(ratios).to(bounding_box.dtype).view(bounding_box.shape), @@ -312,7 +312,7 @@ def affine_image_pil( def _affine_bounding_box_xyxy( bounding_box: torch.Tensor, - image_size: Tuple[int, int], + spatial_size: Tuple[int, int], angle: Union[int, float], translate: List[float], scale: float, @@ -325,7 +325,7 @@ def _affine_bounding_box_xyxy( ) if center is None: - height, width = image_size + height, width = spatial_size center = [width * 0.5, height * 0.5] dtype = bounding_box.dtype if torch.is_floating_point(bounding_box) else torch.float32 @@ -359,7 +359,7 @@ def _affine_bounding_box_xyxy( if expand: # Compute minimum point for transformed image frame: # Points are Top-Left, Top-Right, Bottom-Left, Bottom-Right points. 
- height, width = image_size + height, width = spatial_size points = torch.tensor( [ [0.0, 0.0, 1.0], @@ -378,15 +378,15 @@ def _affine_bounding_box_xyxy( # Estimate meta-data for image with inverted=True and with center=[0,0] affine_vector = _get_inverse_affine_matrix([0.0, 0.0], angle, translate, scale, shear) new_width, new_height = _FT._compute_affine_output_size(affine_vector, width, height) - image_size = (new_height, new_width) + spatial_size = (new_height, new_width) - return out_bboxes.to(bounding_box.dtype), image_size + return out_bboxes.to(bounding_box.dtype), spatial_size def affine_bounding_box( bounding_box: torch.Tensor, format: features.BoundingBoxFormat, - image_size: Tuple[int, int], + spatial_size: Tuple[int, int], angle: Union[int, float], translate: List[float], scale: float, @@ -398,7 +398,7 @@ def affine_bounding_box( bounding_box, old_format=format, new_format=features.BoundingBoxFormat.XYXY ).view(-1, 4) - out_bboxes, _ = _affine_bounding_box_xyxy(bounding_box, image_size, angle, translate, scale, shear, center) + out_bboxes, _ = _affine_bounding_box_xyxy(bounding_box, spatial_size, angle, translate, scale, shear, center) # out_bboxes should be of shape [N boxes, 4] @@ -573,7 +573,7 @@ def rotate_image_pil( def rotate_bounding_box( bounding_box: torch.Tensor, format: features.BoundingBoxFormat, - image_size: Tuple[int, int], + spatial_size: Tuple[int, int], angle: float, expand: bool = False, center: Optional[List[float]] = None, @@ -587,9 +587,9 @@ def rotate_bounding_box( bounding_box, old_format=format, new_format=features.BoundingBoxFormat.XYXY ).view(-1, 4) - out_bboxes, image_size = _affine_bounding_box_xyxy( + out_bboxes, spatial_size = _affine_bounding_box_xyxy( bounding_box, - image_size, + spatial_size, angle=-angle, translate=[0.0, 0.0], scale=1.0, @@ -602,7 +602,7 @@ def rotate_bounding_box( convert_format_bounding_box( out_bboxes, old_format=features.BoundingBoxFormat.XYXY, new_format=format, copy=False ).view(original_shape), - image_size, + spatial_size, ) @@ -756,7 +756,7 @@ def pad_mask( def pad_bounding_box( bounding_box: torch.Tensor, format: features.BoundingBoxFormat, - image_size: Tuple[int, int], + spatial_size: Tuple[int, int], padding: Union[int, List[int]], padding_mode: str = "constant", ) -> Tuple[torch.Tensor, Tuple[int, int]]: @@ -775,7 +775,7 @@ def pad_bounding_box( bounding_box[..., 2] += left bounding_box[..., 3] += top - height, width = image_size + height, width = spatial_size height += top + bottom width += left + right @@ -1066,10 +1066,10 @@ def elastic_bounding_box( ).view(-1, 4) # Question (vfdev-5): should we rely on good displacement shape and fetch image size from it - # Or add image_size arg and check displacement shape - image_size = displacement.shape[-3], displacement.shape[-2] + # Or add spatial_size arg and check displacement shape + spatial_size = displacement.shape[-3], displacement.shape[-2] - id_grid = _FT._create_identity_grid(list(image_size)).to(bounding_box.device) + id_grid = _FT._create_identity_grid(list(spatial_size)).to(bounding_box.device) # We construct an approximation of inverse grid as inv_grid = id_grid - displacement # This is not an exact inverse of the grid inv_grid = id_grid - displacement @@ -1079,7 +1079,7 @@ def elastic_bounding_box( index_x = torch.floor(points[:, 0] + 0.5).to(dtype=torch.long) index_y = torch.floor(points[:, 1] + 0.5).to(dtype=torch.long) # Transform points: - t_size = torch.tensor(image_size[::-1], device=displacement.device, dtype=displacement.dtype) + t_size = 
torch.tensor(spatial_size[::-1], device=displacement.device, dtype=displacement.dtype) transformed_points = (inv_grid[0, index_y, index_x, :] + 1) * 0.5 * t_size - 0.5 transformed_points = transformed_points.view(-1, 4, 2) @@ -1199,11 +1199,11 @@ def center_crop_image_pil(image: PIL.Image.Image, output_size: List[int]) -> PIL def center_crop_bounding_box( bounding_box: torch.Tensor, format: features.BoundingBoxFormat, - image_size: Tuple[int, int], + spatial_size: Tuple[int, int], output_size: List[int], ) -> Tuple[torch.Tensor, Tuple[int, int]]: crop_height, crop_width = _center_crop_parse_output_size(output_size) - crop_top, crop_left = _center_crop_compute_crop_anchor(crop_height, crop_width, *image_size) + crop_top, crop_left = _center_crop_compute_crop_anchor(crop_height, crop_width, *spatial_size) return crop_bounding_box(bounding_box, format, top=crop_top, left=crop_left, height=crop_height, width=crop_width) diff --git a/torchvision/prototype/transforms/functional/_meta.py b/torchvision/prototype/transforms/functional/_meta.py index c03d65c951b..a118784ebeb 100644 --- a/torchvision/prototype/transforms/functional/_meta.py +++ b/torchvision/prototype/transforms/functional/_meta.py @@ -18,7 +18,7 @@ def get_dimensions(image: features.ImageOrVideoTypeJIT) -> List[int]: return get_dimensions_image_tensor(image) elif isinstance(image, (features.Image, features.Video)): channels = image.num_channels - height, width = image.image_size + height, width = image.spatial_size return [channels, height, width] else: return get_dimensions_image_pil(image) @@ -28,6 +28,10 @@ def get_dimensions(image: features.ImageOrVideoTypeJIT) -> List[int]: get_num_channels_image_pil = _FP.get_image_num_channels +def get_num_channels_video(video: torch.Tensor) -> int: + return get_num_channels_image_tensor(video) + + def get_num_channels(image: features.ImageOrVideoTypeJIT) -> int: if isinstance(image, torch.Tensor) and ( torch.jit.is_scripting() or not isinstance(image, (features.Image, features.Video)) @@ -55,21 +59,39 @@ def get_spatial_size_image_pil(image: PIL.Image.Image) -> List[int]: return [height, width] -# TODO: Should we have get_spatial_size_video here? How about masks/bbox etc? What is the criterion for deciding when -# a kernel will be created? 
+def get_spatial_size_video(video: torch.Tensor) -> List[int]: + return get_spatial_size_image_tensor(video) + + +def get_spatial_size_mask(mask: torch.Tensor) -> List[int]: + return get_spatial_size_image_tensor(mask) + + +@torch.jit.unused +def get_spatial_size_bounding_box(bounding_box: features.BoundingBox) -> List[int]: + return list(bounding_box.spatial_size) def get_spatial_size(inpt: features.InputTypeJIT) -> List[int]: if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)): return get_spatial_size_image_tensor(inpt) - elif isinstance(inpt, features._Feature): - image_size = getattr(inpt, "image_size", None) - if image_size is not None: - return list(image_size) - else: - raise ValueError(f"Type {inpt.__class__} doesn't have spatial size.") + elif isinstance(inpt, (features.Image, features.Video, features.BoundingBox, features.Mask)): + return list(inpt.spatial_size) + else: + return get_spatial_size_image_pil(inpt) # type: ignore[no-any-return] + + +def get_num_frames_video(video: torch.Tensor) -> int: + return video.shape[-4] + + +def get_num_frames(inpt: features.VideoTypeJIT) -> int: + if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features.Video)): + return get_num_frames_video(inpt) + elif isinstance(inpt, features.Video): + return inpt.num_frames else: - return get_spatial_size_image_pil(inpt) + raise TypeError(f"The video should be a Tensor. Got {type(inpt)}") def _xywh_to_xyxy(xywh: torch.Tensor) -> torch.Tensor: @@ -125,13 +147,13 @@ def convert_format_bounding_box( def clamp_bounding_box( - bounding_box: torch.Tensor, format: BoundingBoxFormat, image_size: Tuple[int, int] + bounding_box: torch.Tensor, format: BoundingBoxFormat, spatial_size: Tuple[int, int] ) -> torch.Tensor: # TODO: (PERF) Possible speed up clamping if we have different implementations for each bbox format. # Not sure if they yield equivalent results. xyxy_boxes = convert_format_bounding_box(bounding_box, format, BoundingBoxFormat.XYXY) - xyxy_boxes[..., 0::2].clamp_(min=0, max=image_size[1]) - xyxy_boxes[..., 1::2].clamp_(min=0, max=image_size[0]) + xyxy_boxes[..., 0::2].clamp_(min=0, max=spatial_size[1]) + xyxy_boxes[..., 1::2].clamp_(min=0, max=spatial_size[0]) return convert_format_bounding_box(xyxy_boxes, BoundingBoxFormat.XYXY, format, copy=False)
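Usage notes (illustrative sketches, not part of the patch). First, the `BoundingBox` feature after the rename: `spatial_size` replaces `image_size` as the (height, width) of the image the boxes refer to, and `wrap_like` keeps it attached across the transform methods. The box values and the 480x640 size below are made up.

import torch
from torchvision.prototype import features

# Two XYXY boxes that refer to a hypothetical 480x640 (H, W) image.
boxes = features.BoundingBox(
    torch.tensor([[10.0, 20.0, 40.0, 80.0], [0.0, 0.0, 100.0, 100.0]]),
    format=features.BoundingBoxFormat.XYXY,
    spatial_size=(480, 640),  # previously `image_size`
)

flipped = boxes.horizontal_flip()   # format and spatial_size are carried over via wrap_like
resized = boxes.resize([240, 320])  # the kernel also returns the new size, which wrap_like re-attaches
print(resized.spatial_size)         # (240, 320)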
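The dense features are aligned with this: `Image`, `Video` and `Mask` (plus the lazy `EncodedImage` property) all expose the last two tensor dimensions as `spatial_size`. A rough sketch, assuming the constructors accept a bare tensor and infer the remaining metadata (the constructors themselves are not touched by this diff).

import torch
from torchvision.prototype import features

image = features.Image(torch.rand(3, 32, 48))                 # (C, H, W); color space inferred (assumption)
video = features.Video(torch.rand(8, 3, 32, 48))              # (T, C, H, W)
mask = features.Mask(torch.zeros(32, 48, dtype=torch.uint8))  # (H, W)

# All of them read the height/width pair off the underlying tensor shape.
assert image.spatial_size == video.spatial_size == mask.spatial_size == (32, 48)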
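The new `query_spatial_size` helper in `transforms/_utils.py` is what lets `_get_params` of transforms such as `RandomCutmix`, `RandomIoUCrop` or `FixedSizeCrop` work on samples that contain no image, since bounding boxes and masks now answer the size query as well. A sketch of its behaviour; the helper lives in a private module and the sample layout is made up.

import torch
from torchvision.prototype import features
from torchvision.prototype.transforms._utils import query_spatial_size  # internal helper

sample = {
    "image": torch.rand(3, 480, 640),  # plain tensors count as images
    "boxes": features.BoundingBox(
        torch.tensor([[0.0, 0.0, 10.0, 10.0]]),
        format="XYXY",
        spatial_size=(480, 640),
    ),
}

# Every spatially sized item in the flattened sample must agree on (H, W);
# differing sizes raise a ValueError, and no sized item at all raises a TypeError.
height, width = query_spatial_size(sample)
print(height, width)  # 480 640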
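On the functional side the rename is a keyword change, but note that bounding-box kernels which alter the spatial extent keep returning the new size so callers can re-attach it. A minimal sketch against plain tensors; the box values are arbitrary.

import torch
from torchvision.prototype import features
from torchvision.prototype.transforms import functional as F

boxes = torch.tensor([[10.0, 20.0, 40.0, 80.0]])
spatial_size = (480, 640)  # (H, W) of the image the boxes live in

flipped = F.horizontal_flip_bounding_box(
    boxes, format=features.BoundingBoxFormat.XYXY, spatial_size=spatial_size
)

resized, new_spatial_size = F.resize_bounding_box(
    boxes, spatial_size=spatial_size, size=[240, 320]
)
print(new_spatial_size)  # (240, 320)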
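Finally, the meta kernels added in `functional/_meta.py` make size queries uniform across input types, and `clamp_bounding_box` follows the same keyword rename. A short sketch with made-up shapes.

import torch
from torchvision.prototype import features
from torchvision.prototype.transforms import functional as F

video = torch.rand(8, 3, 32, 48)        # plain (T, C, H, W) tensor
print(F.get_spatial_size(video))        # [32, 48]
print(F.get_num_frames(video))          # 8, read from the -4 dimension
print(F.get_num_channels_video(video))  # 3

boxes = features.BoundingBox(
    torch.tensor([[-5.0, 10.0, 60.0, 100.0]]), format="XYXY", spatial_size=(32, 48)
)
print(F.get_spatial_size(boxes))        # [32, 48], read from the metadata, not the tensor shape
clamped = F.clamp_bounding_box(boxes, format=boxes.format, spatial_size=boxes.spatial_size)
# coordinates are clamped to [0, W] x [0, H] -> [[0., 10., 48., 32.]]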