diff --git a/torchvision/prototype/features/_bounding_box.py b/torchvision/prototype/features/_bounding_box.py
index fbf66b80efe..cd5cdc69836 100644
--- a/torchvision/prototype/features/_bounding_box.py
+++ b/torchvision/prototype/features/_bounding_box.py
@@ -64,7 +64,7 @@ def to_format(self, format: Union[str, BoundingBoxFormat]) -> BoundingBox:
         from torchvision.prototype.transforms.functional import convert_bounding_box_format
 
         if isinstance(format, str):
-            format = BoundingBoxFormat[format]
+            format = BoundingBoxFormat.from_str(format.upper())
 
         return BoundingBox.new_like(
             self, convert_bounding_box_format(self, old_format=self.format, new_format=format), format=format
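With from_str, string formats are now matched case-insensitively. A minimal sketch of the resulting behaviour, assuming the prototype BoundingBox constructor takes format and image_size keyword arguments (the box values are illustrative):

import torch
from torchvision.prototype import features

# One box in XYWH format on a 100x100 image.
box = features.BoundingBox(
    torch.tensor([10.0, 20.0, 30.0, 40.0]),
    format=features.BoundingBoxFormat.XYWH,
    image_size=(100, 100),
)

# Lower-case strings now work too, since the input is upper-cased before the lookup.
assert torch.allclose(box.to_format("xyxy"), box.to_format(features.BoundingBoxFormat.XYXY))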
diff --git a/torchvision/prototype/transforms/__init__.py b/torchvision/prototype/transforms/__init__.py
index 16369428e47..2b52a253820 100644
--- a/torchvision/prototype/transforms/__init__.py
+++ b/torchvision/prototype/transforms/__init__.py
@@ -7,7 +7,16 @@
 from ._augment import RandomErasing, RandomMixup, RandomCutmix
 from ._auto_augment import RandAugment, TrivialAugmentWide, AutoAugment, AugMix
 from ._container import Compose, RandomApply, RandomChoice, RandomOrder
-from ._geometry import HorizontalFlip, Resize, CenterCrop, RandomResizedCrop, FiveCrop, TenCrop, BatchMultiCrop
+from ._geometry import (
+    HorizontalFlip,
+    Resize,
+    CenterCrop,
+    RandomResizedCrop,
+    FiveCrop,
+    TenCrop,
+    BatchMultiCrop,
+    RandomZoomOut,
+)
 from ._meta import ConvertBoundingBoxFormat, ConvertImageDtype, ConvertImageColorSpace
 from ._misc import Identity, Normalize, ToDtype, Lambda
 from ._presets import (
diff --git a/torchvision/prototype/transforms/_geometry.py b/torchvision/prototype/transforms/_geometry.py
index e04e9f819f3..2a965959629 100644
--- a/torchvision/prototype/transforms/_geometry.py
+++ b/torchvision/prototype/transforms/_geometry.py
@@ -256,3 +256,88 @@ def apply_recursively(obj: Any) -> Any:
             return obj
 
     return apply_recursively(inputs if len(inputs) > 1 else inputs[0])
+
+
+class RandomZoomOut(Transform):
+    def __init__(
+        self, fill: Union[float, Sequence[float]] = 0.0, side_range: Tuple[float, float] = (1.0, 4.0), p: float = 0.5
+    ) -> None:
+        super().__init__()
+
+        if fill is None:
+            fill = 0.0
+        self.fill = fill
+
+        if side_range[0] < 1.0 or side_range[0] > side_range[1]:
+            raise ValueError(f"Invalid canvas side range provided: {side_range}.")
+        self.side_range = side_range
+
+        self.p = p
+
+    def _get_params(self, sample: Any) -> Dict[str, Any]:
+        image = query_image(sample)
+        orig_c, orig_h, orig_w = get_image_dimensions(image)
+
+        r = self.side_range[0] + torch.rand(1) * (self.side_range[1] - self.side_range[0])
+        canvas_width = int(orig_w * r)
+        canvas_height = int(orig_h * r)
+
+        r = torch.rand(2)
+        left = int((canvas_width - orig_w) * r[0])
+        top = int((canvas_height - orig_h) * r[1])
+        right = canvas_width - (left + orig_w)
+        bottom = canvas_height - (top + orig_h)
+        padding = [left, top, right, bottom]
+
+        fill = self.fill
+        if not isinstance(fill, collections.abc.Sequence):
+            fill = [fill] * orig_c
+
+        return dict(padding=padding, fill=fill)
+
+    def _transform(self, input: Any, params: Dict[str, Any]) -> Any:
+        if isinstance(input, features.Image) or is_simple_tensor(input):
+            # The tensor pad op only supports an int fill, so pad with zeros and paint the margins below.
+            output = F.pad_image_tensor(input, params["padding"], fill=0, padding_mode="constant")
+
+            left, top, right, bottom = params["padding"]
+            fill = torch.tensor(params["fill"], dtype=input.dtype, device=input.device).view(-1, 1, 1)
+
+            if top > 0:
+                output[..., :top, :] = fill
+            if left > 0:
+                output[..., :, :left] = fill
+            if bottom > 0:
+                output[..., -bottom:, :] = fill
+            if right > 0:
+                output[..., :, -right:] = fill
+
+            if isinstance(input, features.Image):
+                output = features.Image.new_like(input, output)
+
+            return output
+        elif isinstance(input, PIL.Image.Image):
+            return F.pad_image_pil(
+                input,
+                params["padding"],
+                fill=tuple(int(v) if input.mode != "F" else v for v in params["fill"]),
+                padding_mode="constant",
+            )
+        elif isinstance(input, features.BoundingBox):
+            output = F.pad_bounding_box(input, params["padding"], format=input.format)
+
+            left, top, right, bottom = params["padding"]
+            height, width = input.image_size
+            height += top + bottom
+            width += left + right
+
+            return features.BoundingBox.new_like(input, output, image_size=(height, width))
+        else:
+            return input
+
+    def forward(self, *inputs: Any) -> Any:
+        sample = inputs if len(inputs) > 1 else inputs[0]
+        if torch.rand(1) >= self.p:
+            return sample
+
+        return super().forward(sample)
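A minimal usage sketch of the new transform, assuming the prototype transforms namespace exported above (the fill values and image size are illustrative; query_image resolves the image inside structured samples, and plain tensors are handled via is_simple_tensor):

import torch
from torchvision.prototype import transforms

# Zoom out onto a canvas between 1x and 4x the input size, with probability 0.5.
transform = transforms.RandomZoomOut(fill=[124, 116, 103], side_range=(1.0, 4.0), p=0.5)

image = torch.randint(0, 256, (3, 224, 224), dtype=torch.uint8)
output = transform(image)
# When the transform fires, both sides grow by the same sampled factor r,
# i.e. the output is (3, int(224 * r), int(224 * r)); otherwise the input
# is returned unchanged.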
diff --git a/torchvision/prototype/transforms/functional/__init__.py b/torchvision/prototype/transforms/functional/__init__.py
index c0825784f66..ed6e9989328 100644
--- a/torchvision/prototype/transforms/functional/__init__.py
+++ b/torchvision/prototype/transforms/functional/__init__.py
@@ -54,6 +54,7 @@
     rotate_image_pil,
     pad_image_tensor,
     pad_image_pil,
+    pad_bounding_box,
     crop_image_tensor,
     crop_image_pil,
     perspective_image_tensor,
diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py
index 84d1fe963c9..1bff7a3f2e6 100644
--- a/torchvision/prototype/transforms/functional/_geometry.py
+++ b/torchvision/prototype/transforms/functional/_geometry.py
@@ -27,7 +27,7 @@ def horizontal_flip_bounding_box(
     bounding_box[:, [0, 2]] = image_size[1] - bounding_box[:, [2, 0]]
 
     return convert_bounding_box_format(
-        bounding_box, old_format=features.BoundingBoxFormat.XYXY, new_format=format
+        bounding_box, old_format=features.BoundingBoxFormat.XYXY, new_format=format, copy=False
     ).view(shape)
 
 
@@ -210,6 +210,26 @@ def rotate_image_pil(
 pad_image_tensor = _FT.pad
 pad_image_pil = _FP.pad
 
+
+def pad_bounding_box(
+    bounding_box: torch.Tensor, padding: List[int], format: features.BoundingBoxFormat
+) -> torch.Tensor:
+    left, _, top, _ = _FT._parse_pad_padding(padding)
+
+    shape = bounding_box.shape
+
+    bounding_box = convert_bounding_box_format(
+        bounding_box, old_format=format, new_format=features.BoundingBoxFormat.XYXY
+    ).view(-1, 4)
+
+    bounding_box[:, 0::2] += left
+    bounding_box[:, 1::2] += top
+
+    return convert_bounding_box_format(
+        bounding_box, old_format=features.BoundingBoxFormat.XYXY, new_format=format, copy=False
+    ).view(shape)
+
+
 crop_image_tensor = _FT.crop
 crop_image_pil = _FP.crop
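pad_bounding_box only needs the left and top offsets, since padding on the right/bottom grows the canvas without moving any pixels; _parse_pad_padding (factored out in functional_tensor.py below) returns [left, right, top, bottom], hence the "left, _, top, _" unpacking. A standalone sketch of the coordinate update on a plain XYXY tensor (illustrative values, no torchvision calls):

import torch

boxes = torch.tensor([[10.0, 20.0, 50.0, 60.0]])  # one XYXY box
left, top = 5, 8  # offsets taken from the padding

boxes[:, 0::2] += left  # x coordinates: columns 0 and 2
boxes[:, 1::2] += top   # y coordinates: columns 1 and 3
print(boxes)  # tensor([[15., 28., 55., 68.]])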
diff --git a/torchvision/prototype/transforms/functional/_meta.py b/torchvision/prototype/transforms/functional/_meta.py
index be38b687515..2386f47b226 100644
--- a/torchvision/prototype/transforms/functional/_meta.py
+++ b/torchvision/prototype/transforms/functional/_meta.py
@@ -40,10 +40,13 @@ def _xyxy_to_cxcywh(xyxy: torch.Tensor) -> torch.Tensor:
 
 
 def convert_bounding_box_format(
-    bounding_box: torch.Tensor, *, old_format: BoundingBoxFormat, new_format: BoundingBoxFormat
+    bounding_box: torch.Tensor, old_format: BoundingBoxFormat, new_format: BoundingBoxFormat, copy: bool = True
 ) -> torch.Tensor:
     if new_format == old_format:
-        return bounding_box.clone()
+        if copy:
+            return bounding_box.clone()
+        else:
+            return bounding_box
 
     if old_format == BoundingBoxFormat.XYWH:
         bounding_box = _xywh_to_xyxy(bounding_box)
@@ -89,10 +92,13 @@ def _gray_to_rgb(grayscale: torch.Tensor) -> torch.Tensor:
 
 
 def convert_image_color_space_tensor(
-    image: torch.Tensor, old_color_space: ColorSpace, new_color_space: ColorSpace
+    image: torch.Tensor, old_color_space: ColorSpace, new_color_space: ColorSpace, copy: bool = True
 ) -> torch.Tensor:
     if new_color_space == old_color_space:
-        return image.clone()
+        if copy:
+            return image.clone()
+        else:
+            return image
 
     if old_color_space == ColorSpace.OTHER or new_color_space == ColorSpace.OTHER:
         raise RuntimeError(f"Conversion to or from {ColorSpace.OTHER} is not supported.")
@@ -135,11 +141,16 @@ def convert_image_color_space_tensor(
 }
 
 
-def convert_image_color_space_pil(image: PIL.Image.Image, color_space: ColorSpace) -> PIL.Image.Image:
+def convert_image_color_space_pil(
+    image: PIL.Image.Image, color_space: ColorSpace, copy: bool = True
+) -> PIL.Image.Image:
     old_mode = image.mode
     try:
         new_mode = _COLOR_SPACE_TO_PIL_MODE[color_space]
     except KeyError:
         raise ValueError(f"Conversion from {ColorSpace.from_pil_mode(old_mode)} to {color_space} is not supported.")
 
+    if not copy and image.mode == new_mode:
+        return image
+
     return image.convert(new_mode)
diff --git a/torchvision/transforms/functional_tensor.py b/torchvision/transforms/functional_tensor.py
index d8671405b96..fba2ca1cad4 100644
--- a/torchvision/transforms/functional_tensor.py
+++ b/torchvision/transforms/functional_tensor.py
@@ -350,6 +350,26 @@ def _pad_symmetric(img: Tensor, padding: List[int]) -> Tensor:
         raise RuntimeError("Symmetric padding of N-D tensors are not supported yet")
 
 
+def _parse_pad_padding(padding: List[int]) -> List[int]:
+    if isinstance(padding, int):
+        if torch.jit.is_scripting():
+            # This may be unreachable
+            raise ValueError("padding can't be an int while torchscripting, set it as a list [value, ]")
+        pad_left = pad_right = pad_top = pad_bottom = padding
+    elif len(padding) == 1:
+        pad_left = pad_right = pad_top = pad_bottom = padding[0]
+    elif len(padding) == 2:
+        pad_left = pad_right = padding[0]
+        pad_top = pad_bottom = padding[1]
+    else:
+        pad_left = padding[0]
+        pad_top = padding[1]
+        pad_right = padding[2]
+        pad_bottom = padding[3]
+
+    return [pad_left, pad_right, pad_top, pad_bottom]
+
+
 def pad(img: Tensor, padding: List[int], fill: int = 0, padding_mode: str = "constant") -> Tensor:
     _assert_image_tensor(img)
 
@@ -369,23 +389,7 @@ def pad(img: Tensor, padding: List[int], fill: int = 0, padding_mode: str = "con
     if padding_mode not in ["constant", "edge", "reflect", "symmetric"]:
         raise ValueError("Padding mode should be either constant, edge, reflect or symmetric")
 
-    if isinstance(padding, int):
-        if torch.jit.is_scripting():
-            # This maybe unreachable
-            raise ValueError("padding can't be an int while torchscripting, set it as a list [value, ]")
-        pad_left = pad_right = pad_top = pad_bottom = padding
-    elif len(padding) == 1:
-        pad_left = pad_right = pad_top = pad_bottom = padding[0]
-    elif len(padding) == 2:
-        pad_left = pad_right = padding[0]
-        pad_top = pad_bottom = padding[1]
-    else:
-        pad_left = padding[0]
-        pad_top = padding[1]
-        pad_right = padding[2]
-        pad_bottom = padding[3]
-
-    p = [pad_left, pad_right, pad_top, pad_bottom]
+    p = _parse_pad_padding(padding)
 
     if padding_mode == "edge":
         # remap padding_mode str
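With the parsing factored out of pad(), other kernels such as pad_bounding_box can reuse it. A small demo of the normalisation it performs, assuming this patch is applied (the helper is private, so the import is for illustration only):

import torchvision.transforms.functional_tensor as _FT

# Every padding spec accepted by pad() is normalised to [left, right, top, bottom].
# Note the output order differs from the [left, top, right, bottom] input order
# of the 4-element form.
print(_FT._parse_pad_padding([2]))           # [2, 2, 2, 2]
print(_FT._parse_pad_padding([2, 4]))        # [2, 2, 4, 4]
print(_FT._parse_pad_padding([1, 2, 3, 4]))  # [1, 3, 2, 4]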