diff --git a/.github/workflows/test-linux-cpu.yml b/.github/workflows/test-linux-cpu.yml index 19521cdd011..8a9f7d33b49 100644 --- a/.github/workflows/test-linux-cpu.yml +++ b/.github/workflows/test-linux-cpu.yml @@ -39,7 +39,7 @@ jobs: fi # Create Conda Env - conda create -yp ci_env python="${PYTHON_VERSION}" numpy libpng jpeg scipy + conda create -yp ci_env --quiet python="${PYTHON_VERSION}" numpy libpng jpeg scipy 'ffmpeg<4.3' conda activate /work/ci_env # Install PyTorch, Torchvision, and testing libraries @@ -50,8 +50,8 @@ jobs: -c nvidia "pytorch-${CHANNEL}"::pytorch[build="*${VERSION}*"] \ "${CUDATOOLKIT}" python3 setup.py develop - python3 -m pip install pytest pytest-mock 'av<10' + python3 -m pip install --progress-bar=off pytest pytest-mock 'av<10' # Run Tests python3 -m torch.utils.collect_env - python3 -m pytest --junitxml=test-results/junit.xml -v --durations 20 + python3 -m pytest --junitxml=test-results/junit.xml --durations 20 diff --git a/.github/workflows/test-linux-gpu.yml b/.github/workflows/test-linux-gpu.yml index 831de27e350..d1275071bf7 100644 --- a/.github/workflows/test-linux-gpu.yml +++ b/.github/workflows/test-linux-gpu.yml @@ -43,7 +43,7 @@ jobs: fi # Create Conda Env - conda create -yp ci_env python="${PYTHON_VERSION}" numpy libpng jpeg scipy + conda create -yp ci_env --quiet python="${PYTHON_VERSION}" numpy libpng jpeg scipy 'ffmpeg<4.3' conda activate /work/ci_env # Install PyTorch, Torchvision, and testing libraries @@ -54,8 +54,8 @@ jobs: -c nvidia "pytorch-${CHANNEL}"::pytorch[build="*${VERSION}*"] \ "${CUDATOOLKIT}" python3 setup.py develop - python3 -m pip install pytest pytest-mock 'av<10' + python3 -m pip install --progress-bar=off pytest pytest-mock 'av<10' # Run Tests python3 -m torch.utils.collect_env - python3 -m pytest --junitxml=test-results/junit.xml -v --durations 20 + python3 -m pytest --junitxml=test-results/junit.xml --durations 20 diff --git a/docs/requirements.txt b/docs/requirements.txt index 09a11359ae7..2a50d9b8f45 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -5,3 +5,4 @@ sphinx-gallery>=0.11.1 sphinx==5.0.0 tabulate -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme +pycocotools diff --git a/docs/source/beta_status.py b/docs/source/beta_status.py index 925894df5c5..4a0fdc72c0f 100644 --- a/docs/source/beta_status.py +++ b/docs/source/beta_status.py @@ -4,15 +4,26 @@ class BetaStatus(Directive): has_content = True + text = "The {api_name} is in Beta stage, and backward compatibility is not guaranteed." def run(self): - api_name = " ".join(self.content) - text = f"The {api_name} is in Beta stage, and backward compatibility is not guaranteed." + text = self.text.format(api_name=" ".join(self.content)) return [nodes.warning("", nodes.paragraph("", "", nodes.Text(text)))] +class V2BetaStatus(BetaStatus): + text = ( + "The {api_name} is in Beta stage, and while we do not expect major breaking changes, " + "some APIs may still change according to user feedback. Please submit any feedback you may have " + "in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check " + "out https://github.com/pytorch/vision/issues/7319 to learn " + "more about the APIs that we suspect might involve future changes." 
+ ) + + def setup(app): app.add_directive("betastatus", BetaStatus) + app.add_directive("v2betastatus", V2BetaStatus) return { "version": "0.1", "parallel_read_safe": True, diff --git a/docs/source/conf.py b/docs/source/conf.py index 72c83d7893d..6d748f5b717 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -33,6 +33,9 @@ sys.path.append(os.path.abspath(".")) +torchvision.disable_beta_transforms_warning() +import torchvision.datapoints # Don't remove, otherwise the docs for datapoints aren't linked properly + # -- General configuration ------------------------------------------------ # Required version of sphinx is set from docs/requirements.txt @@ -60,6 +63,7 @@ "gallery_dirs": "auto_examples", # path to where to save gallery generated output "backreferences_dir": "gen_modules/backreferences", "doc_module": ("torchvision",), + "remove_config_comments": True, } napoleon_use_ivar = True diff --git a/docs/source/datapoints.rst b/docs/source/datapoints.rst new file mode 100644 index 00000000000..1cc62413e66 --- /dev/null +++ b/docs/source/datapoints.rst @@ -0,0 +1,19 @@ +Datapoints +========== + +.. currentmodule:: torchvision.datapoints + +Datapoints are tensor subclasses which the :mod:`~torchvision.transforms.v2` v2 transforms use under the hood to +dispatch their inputs to the appropriate lower-level kernels. Most users do not +need to manipulate datapoints directly and can simply rely on dataset wrapping - +see e.g. :ref:`sphx_glr_auto_examples_plot_transforms_v2_e2e.py`. + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + Image + Video + BoundingBoxFormat + BoundingBox + Mask diff --git a/docs/source/index.rst b/docs/source/index.rst index 79dbebdd047..bc38fdb0307 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -32,6 +32,7 @@ architectures, and common image transformations for computer vision. :caption: Package Reference transforms + datapoints models datasets utils diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst index 5909b68966b..0d6961bbe79 100644 --- a/docs/source/transforms.rst +++ b/docs/source/transforms.rst @@ -5,6 +5,22 @@ Transforming and augmenting images .. currentmodule:: torchvision.transforms + +.. note:: + In 0.15, we released a new set of transforms available in the + ``torchvision.transforms.v2`` namespace, which add support for transforming + not just images but also bounding boxes, masks, or videos. These transforms + are fully backward compatible with the current ones, and you'll see them + documented below with a `v2.` prefix. To get started with those new + transforms, you can check out + :ref:`sphx_glr_auto_examples_plot_transforms_v2_e2e.py`. + Note that these transforms are still BETA, and while we don't expect major + breaking changes in the future, some APIs may still change according to user + feedback. Please submit any feedback you may have `here + `_, and you can also check + out `this issue `_ to learn + more about the APIs that we suspect might involve future changes. + Transforms are common image transformations available in the ``torchvision.transforms`` module. They can be chained together using :class:`Compose`. @@ -14,11 +30,10 @@ transformations. This is useful if you have to build a more complex transformation pipeline (e.g. in the case of segmentation tasks). -Most transformations accept both `PIL `_ -images and tensor images, although some transformations are :ref:`PIL-only -` and some are :ref:`tensor-only -`. 
The :ref:`conversion_transforms` may be used to -convert to and from PIL images. +Most transformations accept both `PIL `_ images +and tensor images, although some transformations are PIL-only and some are +tensor-only. The :ref:`conversion_transforms` may be used to convert to and from +PIL images, or for converting dtypes and ranges. The transformations that accept tensor images also accept batches of tensor images. A Tensor Image is a tensor with ``(C, H, W)`` shape, where ``C`` is a @@ -70,8 +85,10 @@ The following examples illustrate the use of the available transforms: produce the same results. -Scriptable transforms ---------------------- +Transforms scriptability +------------------------ + +.. TODO: Add note about v2 scriptability (in next PR) In order to script the transformations, please use ``torch.nn.Sequential`` instead of :class:`Compose`. @@ -89,100 +106,141 @@ Make sure to use only scriptable transformations, i.e. that work with ``torch.Te For any custom transformations to be used with ``torch.jit.script``, they should be derived from ``torch.nn.Module``. -Compositions of transforms --------------------------- +Geometry +-------- .. autosummary:: :toctree: generated/ :template: class.rst - Compose + Resize + v2.Resize + v2.ScaleJitter + v2.RandomShortestSize + v2.RandomResize + RandomCrop + v2.RandomCrop + RandomResizedCrop + v2.RandomResizedCrop + v2.RandomIoUCrop + CenterCrop + v2.CenterCrop + FiveCrop + v2.FiveCrop + TenCrop + v2.TenCrop + Pad + v2.Pad + v2.RandomZoomOut + RandomRotation + v2.RandomRotation + RandomAffine + v2.RandomAffine + RandomPerspective + v2.RandomPerspective + ElasticTransform + v2.ElasticTransform + RandomHorizontalFlip + v2.RandomHorizontalFlip + RandomVerticalFlip + v2.RandomVerticalFlip -Transforms on PIL Image and torch.\*Tensor ------------------------------------------- +Color +----- .. autosummary:: :toctree: generated/ :template: class.rst - CenterCrop ColorJitter - FiveCrop + v2.ColorJitter + v2.RandomPhotometricDistort Grayscale - Pad - RandomAffine - RandomApply - RandomCrop + v2.Grayscale RandomGrayscale - RandomHorizontalFlip - RandomPerspective - RandomResizedCrop - RandomRotation - RandomVerticalFlip - Resize - TenCrop + v2.RandomGrayscale GaussianBlur + v2.GaussianBlur RandomInvert + v2.RandomInvert RandomPosterize + v2.RandomPosterize RandomSolarize + v2.RandomSolarize RandomAdjustSharpness + v2.RandomAdjustSharpness RandomAutocontrast + v2.RandomAutocontrast RandomEqualize + v2.RandomEqualize - -.. _transforms_pil_only: - -Transforms on PIL Image only ----------------------------- +Composition +----------- .. autosummary:: :toctree: generated/ :template: class.rst + Compose + v2.Compose + RandomApply + v2.RandomApply RandomChoice + v2.RandomChoice RandomOrder + v2.RandomOrder -.. _transforms_tensor_only: - -Transforms on torch.\*Tensor only ---------------------------------- +Miscellaneous +------------- .. autosummary:: :toctree: generated/ :template: class.rst LinearTransformation + v2.LinearTransformation Normalize + v2.Normalize RandomErasing - ConvertImageDtype + v2.RandomErasing + Lambda + v2.Lambda + v2.SanitizeBoundingBox + v2.ClampBoundingBox + v2.UniformTemporalSubsample .. _conversion_transforms: -Conversion Transforms ---------------------- +Conversion +---------- +.. note:: + Beware, some of these conversion transforms below will scale the values + while performing the conversion, while some may not do any scaling. By + scaling, we mean e.g. 
that a ``uint8`` -> ``float32`` would map the [0, + 255] range into [0, 1] (and vice-versa). + .. autosummary:: :toctree: generated/ :template: class.rst ToPILImage + v2.ToPILImage + v2.ToImagePIL ToTensor + v2.ToTensor PILToTensor + v2.PILToTensor + v2.ToImageTensor + ConvertImageDtype + v2.ConvertDtype + v2.ConvertImageDtype + v2.ToDtype + v2.ConvertBoundingBoxFormat - -Generic Transforms ------------------- - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - Lambda - - -Automatic Augmentation Transforms ---------------------------------- +Auto-Augmentation +----------------- `AutoAugment `_ is a common Data Augmentation technique that can improve the accuracy of Image Classification models. Though the data augmentation policies are directly linked to their trained dataset, empirical studies show that @@ -196,9 +254,13 @@ The new transform can be used standalone or mixed-and-matched with existing tran AutoAugmentPolicy AutoAugment + v2.AutoAugment RandAugment + v2.RandAugment TrivialAugmentWide + v2.TrivialAugmentWide AugMix + v2.AugMix .. _functional_transforms: @@ -207,6 +269,14 @@ Functional Transforms .. currentmodule:: torchvision.transforms.functional + +.. note:: + You'll find below the documentation for the existing + ``torchvision.transforms.functional`` namespace. The + ``torchvision.transforms.v2.functional`` namespace exists as well and can be + used! The same functionals are present, so you simply need to change your + import to rely on the ``v2`` namespace. + Functional transforms give you fine-grained control of the transformation pipeline. As opposed to the transformations above, functional transforms don't contain a random number generator for their parameters. diff --git a/gallery/assets/coco/images/000000000001.jpg b/gallery/assets/coco/images/000000000001.jpg new file mode 120000 index 00000000000..9be80c7c273 --- /dev/null +++ b/gallery/assets/coco/images/000000000001.jpg @@ -0,0 +1 @@ +../../astronaut.jpg \ No newline at end of file diff --git a/gallery/assets/coco/images/000000000002.jpg b/gallery/assets/coco/images/000000000002.jpg new file mode 120000 index 00000000000..9f8efef9928 --- /dev/null +++ b/gallery/assets/coco/images/000000000002.jpg @@ -0,0 +1 @@ +../../dog2.jpg \ No newline at end of file diff --git a/gallery/assets/coco/instances.json b/gallery/assets/coco/instances.json new file mode 100644 index 00000000000..fe0e09270bf --- /dev/null +++ b/gallery/assets/coco/instances.json @@ -0,0 +1 @@ +{"images": [{"file_name": "000000000001.jpg", "height": 512, "width": 512, "id": 1}, {"file_name": "000000000002.jpg", "height": 500, "width": 500, "id": 2}], "annotations": [{"segmentation": [[40.0, 511.0, 26.0, 487.0, 28.0, 438.0, 17.0, 397.0, 24.0, 346.0, 38.0, 306.0, 61.0, 250.0, 111.0, 206.0, 111.0, 187.0, 120.0, 183.0, 136.0, 159.0, 159.0, 150.0, 181.0, 148.0, 182.0, 132.0, 175.0, 132.0, 168.0, 120.0, 154.0, 102.0, 153.0, 62.0, 188.0, 35.0, 191.0, 29.0, 208.0, 20.0, 210.0, 22.0, 227.0, 16.0, 240.0, 16.0, 276.0, 31.0, 285.0, 39.0, 301.0, 88.0, 297.0, 108.0, 281.0, 128.0, 273.0, 138.0, 266.0, 138.0, 264.0, 153.0, 257.0, 162.0, 256.0, 174.0, 284.0, 197.0, 300.0, 221.0, 303.0, 236.0, 337.0, 258.0, 357.0, 306.0, 361.0, 351.0, 358.0, 511.0]], "iscrowd": 0, "image_id": 1, "bbox": [17.0, 16.0, 344.0, 495.0], "category_id": 1, "id": 1}, {"segmentation": [[0.0, 411.0, 43.0, 401.0, 99.0, 395.0, 105.0, 351.0, 124.0, 326.0, 181.0, 294.0, 227.0, 280.0, 245.0, 262.0, 259.0, 234.0, 262.0, 207.0, 271.0, 140.0, 283.0, 139.0, 301.0, 162.0, 309.0, 181.0, 
341.0, 175.0, 362.0, 139.0, 369.0, 139.0, 377.0, 163.0, 378.0, 203.0, 381.0, 212.0, 380.0, 220.0, 382.0, 242.0, 404.0, 264.0, 392.0, 293.0, 384.0, 295.0, 385.0, 316.0, 399.0, 343.0, 391.0, 448.0, 452.0, 475.0, 457.0, 494.0, 436.0, 498.0, 402.0, 491.0, 369.0, 488.0, 366.0, 496.0, 319.0, 496.0, 302.0, 485.0, 226.0, 469.0, 128.0, 456.0, 74.0, 458.0, 29.0, 439.0, 0.0, 445.0]], "iscrowd": 0, "image_id": 2, "bbox": [0.0, 139.0, 457.0, 359.0], "category_id": 18, "id": 2}]} diff --git a/gallery/plot_datapoints.py b/gallery/plot_datapoints.py new file mode 100644 index 00000000000..83ca6793598 --- /dev/null +++ b/gallery/plot_datapoints.py @@ -0,0 +1,132 @@ +""" +============== +Datapoints FAQ +============== + +The :mod:`torchvision.datapoints` namespace was introduced together with ``torchvision.transforms.v2``. This example +showcases what these datapoints are and how they behave. This is a fairly low-level topic that most users will not need +to worry about: you do not need to understand the internals of datapoints to efficiently rely on +``torchvision.transforms.v2``. It may however be useful for advanced users trying to implement their own datasets, +transforms, or work directly with the datapoints. +""" + +import PIL.Image + +import torch +import torchvision + +# We are using BETA APIs, so we deactivate the associated warning, thereby acknowledging that +# some APIs may slightly change in the future +torchvision.disable_beta_transforms_warning() + +from torchvision import datapoints + + +######################################################################################################################## +# What are datapoints? +# -------------------- +# +# Datapoints are zero-copy tensor subclasses: + +tensor = torch.rand(3, 256, 256) +image = datapoints.Image(tensor) + +assert isinstance(image, torch.Tensor) +assert image.data_ptr() == tensor.data_ptr() + + +######################################################################################################################## +# Under the hood, they are needed in :mod:`torchvision.transforms.v2` to correctly dispatch to the appropriate function +# for the input data. +# +# What datapoints are supported? +# ------------------------------ +# +# So far :mod:`torchvision.datapoints` supports four types of datapoints: +# +# * :class:`~torchvision.datapoints.Image` +# * :class:`~torchvision.datapoints.Video` +# * :class:`~torchvision.datapoints.BoundingBox` +# * :class:`~torchvision.datapoints.Mask` +# +# How do I construct a datapoint? +# ------------------------------- +# +# Each datapoint class takes any tensor-like data that can be turned into a :class:`~torch.Tensor` + +image = datapoints.Image([[[[0, 1], [1, 0]]]]) +print(image) + + +######################################################################################################################## +# Similar to other PyTorch creations ops, the constructor also takes the ``dtype``, ``device``, and ``requires_grad`` +# parameters. 
+ +float_image = datapoints.Image([[[0, 1], [1, 0]]], dtype=torch.float32, requires_grad=True) +print(float_image) + + +######################################################################################################################## +# In addition, :class:`~torchvision.datapoints.Image` and :class:`~torchvision.datapoints.Mask` also take a +# :class:`PIL.Image.Image` directly: + +image = datapoints.Image(PIL.Image.open("assets/astronaut.jpg")) +print(image.shape, image.dtype) + +######################################################################################################################## +# In general, the datapoints can also store additional metadata that complements the underlying tensor. For example, +# :class:`~torchvision.datapoints.BoundingBox` stores the coordinate format as well as the spatial size of the +# corresponding image alongside the actual values: + +bounding_box = datapoints.BoundingBox( + [17, 16, 344, 495], format=datapoints.BoundingBoxFormat.XYXY, spatial_size=image.shape[-2:] +) +print(bounding_box) + + +######################################################################################################################## +# Do I have to wrap the output of the datasets myself? +# ---------------------------------------------------- +# +# Only if you are using custom datasets. For the built-in ones, you can use +# :func:`torchvision.datasets.wrap_dataset_for_transforms_v2`. Note that the function also supports subclasses of the +# built-in datasets. Meaning, if your custom dataset subclasses from a built-in one and the output type is the same, you +# also don't have to wrap manually. +# +# How do the datapoints behave inside a computation? +# -------------------------------------------------- +# +# Datapoints look and feel just like regular tensors. Everything that is supported on a plain :class:`torch.Tensor` +# also works on datapoints. +# Since for most operations involving datapoints, it cannot be safely inferred whether the result should retain the +# datapoint type, we choose to return a plain tensor instead of a datapoint (this might change, see note below): + +assert isinstance(image, datapoints.Image) + +new_image = image + 0 + +assert isinstance(new_image, torch.Tensor) and not isinstance(new_image, datapoints.Image) + +######################################################################################################################## +# .. note:: +# +# This "unwrapping" behaviour is something we're actively seeking feedback on. If you find this surprising or if you +# have any suggestions on how to better support your use-cases, please reach out to us via this issue: +# https://github.com/pytorch/vision/issues/7319 +# +# There are two exceptions to this rule: +# +# 1. The operations :meth:`~torch.Tensor.clone`, :meth:`~torch.Tensor.to`, and :meth:`~torch.Tensor.requires_grad_` +# retain the datapoint type. +# 2. Inplace operations on datapoints cannot change the type of the datapoint they are called on. 
However, if you use +# the flow style, the returned value will be unwrapped: + +image = datapoints.Image([[[0, 1], [1, 0]]]) + +new_image = image.add_(1).mul_(2) + +assert isinstance(image, torch.Tensor) +print(image) + +assert isinstance(new_image, torch.Tensor) and not isinstance(new_image, datapoints.Image) +assert (new_image == image).all() diff --git a/gallery/plot_transforms_v2.py b/gallery/plot_transforms_v2.py new file mode 100644 index 00000000000..d1096bec1e7 --- /dev/null +++ b/gallery/plot_transforms_v2.py @@ -0,0 +1,109 @@ +""" +================================== +Getting started with transforms v2 +================================== + +Most computer vision tasks are not supported out of the box by ``torchvision.transforms`` v1, since it only supports +images. ``torchvision.transforms.v2`` enables jointly transforming images, videos, bounding boxes, and masks. This +example showcases the core functionality of the new ``torchvision.transforms.v2`` API. +""" + +import pathlib + +import torch +import torchvision + + +def load_data(): + from torchvision.io import read_image + from torchvision import datapoints + from torchvision.ops import masks_to_boxes + + assets_directory = pathlib.Path("assets") + + path = assets_directory / "FudanPed00054.png" + image = datapoints.Image(read_image(str(path))) + merged_masks = read_image(str(assets_directory / "FudanPed00054_mask.png")) + + labels = torch.unique(merged_masks)[1:] + + masks = datapoints.Mask(merged_masks == labels.view(-1, 1, 1)) + + bounding_boxes = datapoints.BoundingBox( + masks_to_boxes(masks), format=datapoints.BoundingBoxFormat.XYXY, spatial_size=image.shape[-2:] + ) + + return path, image, bounding_boxes, masks, labels + + +######################################################################################################################## +# The :mod:`torchvision.transforms.v2` API supports images, videos, bounding boxes, and instance and segmentation +# masks. Thus, it offers native support for many Computer Vision tasks, like image and video classification, object +# detection or instance and semantic segmentation. Still, the interface is the same, making +# :mod:`torchvision.transforms.v2` a drop-in replacement for the existing :mod:`torchvision.transforms` API, aka v1. + +# We are using BETA APIs, so we deactivate the associated warning, thereby acknowledging that +# some APIs may slightly change in the future +torchvision.disable_beta_transforms_warning() +import torchvision.transforms.v2 as transforms + +transform = transforms.Compose( + [ + transforms.ColorJitter(contrast=0.5), + transforms.RandomRotation(30), + transforms.CenterCrop(480), + ] +) + +######################################################################################################################## +# :mod:`torchvision.transforms.v2` natively supports jointly transforming multiple inputs while making sure that +# potential random behavior is consistent across all inputs. However, it doesn't enforce a specific input structure or +# order. 
+
+path, image, bounding_boxes, masks, labels = load_data()
+
+torch.manual_seed(0)
+new_image = transform(image)  # Image Classification
+new_image, new_bounding_boxes, new_labels = transform(image, bounding_boxes, labels)  # Object Detection
+new_image, new_bounding_boxes, new_masks, new_labels = transform(
+    image, bounding_boxes, masks, labels
+)  # Instance Segmentation
+new_image, new_target = transform((image, {"boxes": bounding_boxes, "labels": labels}))  # Arbitrary Structure
+
+########################################################################################################################
+# Under the hood, :mod:`torchvision.transforms.v2` relies on :mod:`torchvision.datapoints` for the dispatch to the
+# appropriate function for the input data: :ref:`sphx_glr_auto_examples_plot_datapoints.py`. Note, however, that as a
+# regular user, you likely don't have to touch this yourself. See
+# :ref:`sphx_glr_auto_examples_plot_transforms_v2_e2e.py`.
+#
+# All "foreign" types like :class:`str`'s or :class:`pathlib.Path`'s are passed through, allowing you to store extra
+# information directly with the sample:
+
+sample = {"path": path, "image": image}
+new_sample = transform(sample)
+
+assert new_sample["path"] is sample["path"]
+
+########################################################################################################################
+# As stated above, :mod:`torchvision.transforms.v2` is a drop-in replacement for :mod:`torchvision.transforms` and thus
+# also supports transforming plain :class:`torch.Tensor`'s as image or video if applicable. This is achieved with a
+# simple heuristic:
+#
+# * If we find an explicit image or video (:class:`torchvision.datapoints.Image`, :class:`torchvision.datapoints.Video`,
+#   or :class:`PIL.Image.Image`) in the input, all other plain tensors are passed through.
+# * If there is no explicit image or video, only the first plain :class:`torch.Tensor` will be transformed as image or
+#   video, while all others will be passed through.
+
+plain_tensor_image = torch.rand(image.shape)
+
+print(image.shape, plain_tensor_image.shape)
+
+# passing a plain tensor together with an explicit image will not transform the former
+plain_tensor_image, image = transform(plain_tensor_image, image)
+
+print(image.shape, plain_tensor_image.shape)
+
+# passing a plain tensor without an explicit image will transform the former
+plain_tensor_image, _ = transform(plain_tensor_image, bounding_boxes)
+
+print(image.shape, plain_tensor_image.shape)
diff --git a/gallery/plot_transforms_v2_e2e.py b/gallery/plot_transforms_v2_e2e.py
new file mode 100644
index 00000000000..aa25d214f31
--- /dev/null
+++ b/gallery/plot_transforms_v2_e2e.py
@@ -0,0 +1,152 @@
+"""
+==================================================
+Transforms v2: End-to-end object detection example
+==================================================
+
+Object detection is not supported out of the box by ``torchvision.transforms`` v1, since it only supports images.
+``torchvision.transforms.v2`` enables jointly transforming images, videos, bounding boxes, and masks. This example
+showcases end-to-end object detection training using the stable ``torchvision.datasets`` and ``torchvision.models`` as
+well as the new ``torchvision.transforms.v2`` API.
+""" + +import pathlib +from collections import defaultdict + +import PIL.Image + +import torch +import torch.utils.data + +import torchvision + + +def show(sample): + import matplotlib.pyplot as plt + + from torchvision.transforms.v2 import functional as F + from torchvision.utils import draw_bounding_boxes + + image, target = sample + if isinstance(image, PIL.Image.Image): + image = F.to_image_tensor(image) + image = F.convert_dtype(image, torch.uint8) + annotated_image = draw_bounding_boxes(image, target["boxes"], colors="yellow", width=3) + + fig, ax = plt.subplots() + ax.imshow(annotated_image.permute(1, 2, 0).numpy()) + ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) + fig.tight_layout() + + fig.show() + + +# We are using BETA APIs, so we deactivate the associated warning, thereby acknowledging that +# some APIs may slightly change in the future +torchvision.disable_beta_transforms_warning() + +from torchvision import models, datasets +import torchvision.transforms.v2 as transforms + + +######################################################################################################################## +# We start off by loading the :class:`~torchvision.datasets.CocoDetection` dataset to have a look at what it currently +# returns, and we'll see how to convert it to a format that is compatible with our new transforms. + + +def load_example_coco_detection_dataset(**kwargs): + # This loads fake data for illustration purposes of this example. In practice, you'll have + # to replace this with the proper data + root = pathlib.Path("assets") / "coco" + return datasets.CocoDetection(str(root / "images"), str(root / "instances.json"), **kwargs) + + +dataset = load_example_coco_detection_dataset() + +sample = dataset[0] +image, target = sample +print(type(image)) +print(type(target), type(target[0]), list(target[0].keys())) + + +######################################################################################################################## +# The dataset returns a two-tuple with the first item being a :class:`PIL.Image.Image` and second one a list of +# dictionaries, which each containing the annotations for a single object instance. As is, this format is not compatible +# with the ``torchvision.transforms.v2``, nor with the models. To overcome that, we provide the +# :func:`~torchvision.datasets.wrap_dataset_for_transforms_v2` function. For +# :class:`~torchvision.datasets.CocoDetection`, this changes the target structure to a single dictionary of lists. It +# also adds the key-value-pairs ``"boxes"``, ``"masks"``, and ``"labels"`` wrapped in the corresponding +# ``torchvision.datapoints``. + +dataset = datasets.wrap_dataset_for_transforms_v2(dataset) + +sample = dataset[0] +image, target = sample +print(type(image)) +print(type(target), list(target.keys())) +print(type(target["boxes"]), type(target["masks"]), type(target["labels"])) + +######################################################################################################################## +# As baseline, let's have a look at a sample without transformations: + +show(sample) + + +######################################################################################################################## +# With the dataset properly set up, we can now define the augmentation pipeline. This is done the same way it is done in +# ``torchvision.transforms`` v1, but now handles bounding boxes and masks without any extra configuration. 
+
+transform = transforms.Compose(
+    [
+        transforms.RandomPhotometricDistort(),
+        transforms.RandomZoomOut(
+            fill=defaultdict(lambda: 0, {PIL.Image.Image: (123, 117, 104)})
+        ),
+        transforms.RandomIoUCrop(),
+        transforms.RandomHorizontalFlip(),
+        transforms.ToImageTensor(),
+        transforms.ConvertImageDtype(torch.float32),
+        transforms.SanitizeBoundingBox(),
+    ]
+)
+
+########################################################################################################################
+# .. note::
+#    Although the :class:`~torchvision.transforms.v2.SanitizeBoundingBox` transform is a no-op in this example, it
+#    should be placed at least once at the end of a detection pipeline to remove degenerate bounding boxes as well as
+#    the corresponding labels and optionally masks. It is particularly critical to add it if
+#    :class:`~torchvision.transforms.v2.RandomIoUCrop` was used.
+#
+# Let's look at how the sample looks with our augmentation pipeline in place:
+
+dataset = load_example_coco_detection_dataset(transforms=transform)
+dataset = datasets.wrap_dataset_for_transforms_v2(dataset)
+
+torch.manual_seed(3141)
+sample = dataset[0]
+
+# sphinx_gallery_thumbnail_number = 2
+show(sample)
+
+
+########################################################################################################################
+# We can see that the color of the image was distorted, that we zoomed out on it (off center), and that it was flipped
+# horizontally. In all of this, the bounding box was transformed accordingly. And without any further ado, we can start
+# training.
+
+data_loader = torch.utils.data.DataLoader(
+    dataset,
+    batch_size=2,
+    # We need a custom collation function here, since the object detection models expect a
+    # sequence of images and target dictionaries. The default collation function tries to
+    # `torch.stack` the individual elements, which fails in general for object detection,
+    # because the number of object instances varies between the samples.
This is the same for + # `torchvision.transforms` v1 + collate_fn=lambda batch: tuple(zip(*batch)), +) + +model = models.get_model("ssd300_vgg16", weights=None, weights_backbone=None).train() + +for images, targets in data_loader: + loss_dict = model(images, targets) + print(loss_dict) + # Put your training logic here + break diff --git a/test/test_datapoints.py b/test/test_datapoints.py index 5b875a6ef20..39c05123333 100644 --- a/test/test_datapoints.py +++ b/test/test_datapoints.py @@ -28,5 +28,5 @@ def test_bbox_instance(data, format): assert isinstance(bboxes, torch.Tensor) assert bboxes.ndim == 2 and bboxes.shape[1] == 4 if isinstance(format, str): - format = datapoints.BoundingBoxFormat.from_str(format.upper()) + format = datapoints.BoundingBoxFormat[(format.upper())] assert bboxes.format == format diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index 2e43c86f91d..f5ca976963a 100644 --- a/test/test_transforms_v2.py +++ b/test/test_transforms_v2.py @@ -136,14 +136,14 @@ class TestSmoke: (transforms.RandomCrop([16, 16], pad_if_needed=True), None), (transforms.RandomHorizontalFlip(p=1.0), None), (transforms.RandomPerspective(p=1.0), None), - (transforms.RandomResize(min_size=10, max_size=20), None), - (transforms.RandomResizedCrop([16, 16]), None), + (transforms.RandomResize(min_size=10, max_size=20, antialias=True), None), + (transforms.RandomResizedCrop([16, 16], antialias=True), None), (transforms.RandomRotation(degrees=30), None), - (transforms.RandomShortestSize(min_size=10), None), + (transforms.RandomShortestSize(min_size=10, antialias=True), None), (transforms.RandomVerticalFlip(p=1.0), None), (transforms.RandomZoomOut(p=1.0), None), (transforms.Resize([16, 16], antialias=True), None), - (transforms.ScaleJitter((16, 16), scale_range=(0.8, 1.2)), None), + (transforms.ScaleJitter((16, 16), scale_range=(0.8, 1.2), antialias=True), None), (transforms.ClampBoundingBox(), None), (transforms.ConvertBoundingBoxFormat(datapoints.BoundingBoxFormat.CXCYWH), None), (transforms.ConvertDtype(), None), @@ -275,7 +275,7 @@ def test_common(self, transform, adapter, container_type, image_or_video, device boxes=datapoints.BoundingBox([[0, 0, 0, 0]], format=format, spatial_size=(224, 244)), labels=torch.tensor([3]), ) - assert transforms.SanitizeBoundingBoxes()(sample)["boxes"].shape == (0, 4) + assert transforms.SanitizeBoundingBox()(sample)["boxes"].shape == (0, 4) @parametrize( [ @@ -1359,11 +1359,8 @@ def test_ctor(self, transform_cls, trfms): class TestRandomChoice: def test_assertions(self): - with pytest.warns(UserWarning, match="Argument p is deprecated and will be removed"): - transforms.RandomChoice([transforms.Pad(2), transforms.RandomCrop(28)], p=[1, 2]) - - with pytest.raises(ValueError, match="The number of probabilities doesn't match the number of transforms"): - transforms.RandomChoice([transforms.Pad(2), transforms.RandomCrop(28)], probabilities=[1]) + with pytest.raises(ValueError, match="Length of p doesn't match the number of transforms"): + transforms.RandomChoice([transforms.Pad(2), transforms.RandomCrop(28)], p=[1]) class TestRandomIoUCrop: @@ -1517,7 +1514,7 @@ class TestRandomShortestSize: def test__get_params(self, min_size, max_size, mocker): spatial_size = (3, 10) - transform = transforms.RandomShortestSize(min_size=min_size, max_size=max_size) + transform = transforms.RandomShortestSize(min_size=min_size, max_size=max_size, antialias=True) sample = mocker.MagicMock(spec=datapoints.Image, num_channels=3, spatial_size=spatial_size) params = 
transform._get_params([sample]) @@ -1598,7 +1595,7 @@ def test__get_params(self): min_size = 3 max_size = 6 - transform = transforms.RandomResize(min_size=min_size, max_size=max_size) + transform = transforms.RandomResize(min_size=min_size, max_size=max_size, antialias=True) for _ in range(10): params = transform._get_params([]) @@ -1794,15 +1791,21 @@ def test_classif_preset(image_type, label_type, dataset_return_type, to_tensor): else: sample = image, label + if to_tensor is transforms.ToTensor: + with pytest.warns(UserWarning, match="deprecated and will be removed"): + to_tensor = to_tensor() + else: + to_tensor = to_tensor() + t = transforms.Compose( [ - transforms.RandomResizedCrop((224, 224)), + transforms.RandomResizedCrop((224, 224), antialias=True), transforms.RandomHorizontalFlip(p=1), transforms.RandAugment(), transforms.TrivialAugmentWide(), transforms.AugMix(), transforms.AutoAugment(), - to_tensor(), + to_tensor, # TODO: ConvertImageDtype is a pass-through on PIL images, is that # intended? This results in a failure if we convert to tensor after # it, because the image would still be uint8 which make Normalize @@ -1833,10 +1836,17 @@ def test_classif_preset(image_type, label_type, dataset_return_type, to_tensor): @pytest.mark.parametrize("sanitize", (True, False)) def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): torch.manual_seed(0) + + if to_tensor is transforms.ToTensor: + with pytest.warns(UserWarning, match="deprecated and will be removed"): + to_tensor = to_tensor() + else: + to_tensor = to_tensor() + if data_augmentation == "hflip": t = [ transforms.RandomHorizontalFlip(p=1), - to_tensor(), + to_tensor, transforms.ConvertImageDtype(torch.float), ] elif data_augmentation == "lsj": @@ -1850,7 +1860,7 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): # ), transforms.RandomCrop((1024, 1024), pad_if_needed=True), transforms.RandomHorizontalFlip(p=1), - to_tensor(), + to_tensor, transforms.ConvertImageDtype(torch.float), ] elif data_augmentation == "multiscale": @@ -1859,7 +1869,7 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): min_size=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), max_size=1333, antialias=True ), transforms.RandomHorizontalFlip(p=1), - to_tensor(), + to_tensor, transforms.ConvertImageDtype(torch.float), ] elif data_augmentation == "ssd": @@ -1868,18 +1878,18 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): transforms.RandomZoomOut(fill=defaultdict(lambda: (123.0, 117.0, 104.0), {datapoints.Mask: 0})), transforms.RandomIoUCrop(), transforms.RandomHorizontalFlip(p=1), - to_tensor(), + to_tensor, transforms.ConvertImageDtype(torch.float), ] elif data_augmentation == "ssdlite": t = [ transforms.RandomIoUCrop(), transforms.RandomHorizontalFlip(p=1), - to_tensor(), + to_tensor, transforms.ConvertImageDtype(torch.float), ] if sanitize: - t += [transforms.SanitizeBoundingBoxes()] + t += [transforms.SanitizeBoundingBox()] t = transforms.Compose(t) num_boxes = 5 @@ -1910,7 +1920,7 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): out = t(sample) - if to_tensor is transforms.ToTensor and image_type is not datapoints.Image: + if isinstance(to_tensor, transforms.ToTensor) and image_type is not datapoints.Image: assert is_simple_tensor(out["image"]) else: assert isinstance(out["image"], datapoints.Image) @@ -1920,7 +1930,7 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): 
# ssd and ssdlite contain RandomIoUCrop which may "remove" some bbox. It # doesn't remove them strictly speaking, it just marks some boxes as # degenerate and those boxes will be later removed by - # SanitizeBoundingBoxes(), which we add to the pipelines if the sanitize + # SanitizeBoundingBox(), which we add to the pipelines if the sanitize # param is True. # Note that the values below are probably specific to the random seed # set above (which is fine). @@ -1935,7 +1945,14 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): @pytest.mark.parametrize( "labels_getter", ("default", "labels", lambda inputs: inputs["labels"], None, lambda inputs: None) ) -def test_sanitize_bounding_boxes(min_size, labels_getter): +@pytest.mark.parametrize("sample_type", (tuple, dict)) +def test_sanitize_bounding_boxes(min_size, labels_getter, sample_type): + + if sample_type is tuple and not isinstance(labels_getter, str): + # The "lambda inputs: inputs["labels"]" labels_getter used in this test + # doesn't work if the input is a tuple. + return + H, W = 256, 128 boxes_and_validity = [ @@ -1970,41 +1987,62 @@ def test_sanitize_bounding_boxes(min_size, labels_getter): ) masks = datapoints.Mask(torch.randint(0, 2, size=(boxes.shape[0], H, W))) - + whatever = torch.rand(10) + input_img = torch.randint(0, 256, size=(1, 3, H, W), dtype=torch.uint8) sample = { - "image": torch.randint(0, 256, size=(1, 3, H, W), dtype=torch.uint8), + "image": input_img, "labels": labels, "boxes": boxes, - "whatever": torch.rand(10), + "whatever": whatever, "None": None, "masks": masks, } - out = transforms.SanitizeBoundingBoxes(min_size=min_size, labels_getter=labels_getter)(sample) + if sample_type is tuple: + img = sample.pop("image") + sample = (img, sample) + + out = transforms.SanitizeBoundingBox(min_size=min_size, labels_getter=labels_getter)(sample) + + if sample_type is tuple: + out_image = out[0] + out_labels = out[1]["labels"] + out_boxes = out[1]["boxes"] + out_masks = out[1]["masks"] + out_whatever = out[1]["whatever"] + else: + out_image = out["image"] + out_labels = out["labels"] + out_boxes = out["boxes"] + out_masks = out["masks"] + out_whatever = out["whatever"] - assert out["image"] is sample["image"] - assert out["whatever"] is sample["whatever"] + assert out_image is input_img + assert out_whatever is whatever if labels_getter is None or (callable(labels_getter) and labels_getter({"labels": "blah"}) is None): - assert out["labels"] is sample["labels"] + assert out_labels is labels else: - assert isinstance(out["labels"], torch.Tensor) - assert out["boxes"].shape[0] == out["labels"].shape[0] == out["masks"].shape[0] + assert isinstance(out_labels, torch.Tensor) + assert out_boxes.shape[0] == out_labels.shape[0] == out_masks.shape[0] # This works because we conveniently set labels to arange(num_boxes) - assert out["labels"].tolist() == valid_indices + assert out_labels.tolist() == valid_indices @pytest.mark.parametrize("key", ("labels", "LABELS", "LaBeL", "SOME_WEIRD_KEY_THAT_HAS_LABeL_IN_IT")) -def test_sanitize_bounding_boxes_default_heuristic(key): +@pytest.mark.parametrize("sample_type", (tuple, dict)) +def test_sanitize_bounding_boxes_default_heuristic(key, sample_type): labels = torch.arange(10) - d = {key: labels} - assert transforms.SanitizeBoundingBoxes._find_labels_default_heuristic(d) is labels + sample = {key: labels, "another_key": "whatever"} + if sample_type is tuple: + sample = (None, sample, "whatever_again") + assert 
transforms.SanitizeBoundingBox._find_labels_default_heuristic(sample) is labels if key.lower() != "labels": # If "labels" is in the dict (case-insensitive), # it takes precedence over other keys which would otherwise be a match d = {key: "something_else", "labels": labels} - assert transforms.SanitizeBoundingBoxes._find_labels_default_heuristic(d) is labels + assert transforms.SanitizeBoundingBox._find_labels_default_heuristic(d) is labels def test_sanitize_bounding_boxes_errors(): @@ -2016,25 +2054,25 @@ def test_sanitize_bounding_boxes_errors(): ) with pytest.raises(ValueError, match="min_size must be >= 1"): - transforms.SanitizeBoundingBoxes(min_size=0) + transforms.SanitizeBoundingBox(min_size=0) with pytest.raises(ValueError, match="labels_getter should either be a str"): - transforms.SanitizeBoundingBoxes(labels_getter=12) + transforms.SanitizeBoundingBox(labels_getter=12) with pytest.raises(ValueError, match="Could not infer where the labels are"): bad_labels_key = {"bbox": good_bbox, "BAD_KEY": torch.arange(good_bbox.shape[0])} - transforms.SanitizeBoundingBoxes()(bad_labels_key) + transforms.SanitizeBoundingBox()(bad_labels_key) with pytest.raises(ValueError, match="If labels_getter is a str or 'default'"): not_a_dict = (good_bbox, torch.arange(good_bbox.shape[0])) - transforms.SanitizeBoundingBoxes()(not_a_dict) + transforms.SanitizeBoundingBox()(not_a_dict) with pytest.raises(ValueError, match="must be a tensor"): not_a_tensor = {"bbox": good_bbox, "labels": torch.arange(good_bbox.shape[0]).tolist()} - transforms.SanitizeBoundingBoxes()(not_a_tensor) + transforms.SanitizeBoundingBox()(not_a_tensor) with pytest.raises(ValueError, match="Number of boxes"): different_sizes = {"bbox": good_bbox, "labels": torch.arange(good_bbox.shape[0] + 3)} - transforms.SanitizeBoundingBoxes()(different_sizes) + transforms.SanitizeBoundingBox()(different_sizes) with pytest.raises(ValueError, match="boxes must be of shape"): bad_bbox = datapoints.BoundingBox( # batch with 2 elements @@ -2046,7 +2084,7 @@ def test_sanitize_bounding_boxes_errors(): spatial_size=(20, 20), ) different_sizes = {"bbox": bad_bbox, "labels": torch.arange(bad_bbox.shape[0])} - transforms.SanitizeBoundingBoxes()(different_sizes) + transforms.SanitizeBoundingBox()(different_sizes) @pytest.mark.parametrize( diff --git a/test/test_transforms_v2_consistency.py b/test/test_transforms_v2_consistency.py index 125d7ec7a3f..a8a87cd43dd 100644 --- a/test/test_transforms_v2_consistency.py +++ b/test/test_transforms_v2_consistency.py @@ -540,9 +540,12 @@ def test_signature_consistency(config): f"not. Please add a default value." 
) - legacy_kinds = {name: param.kind for name, param in legacy_params.items()} - prototype_kinds = {name: prototype_params[name].kind for name in legacy_kinds.keys()} - assert prototype_kinds == legacy_kinds + legacy_signature = list(legacy_params.keys()) + # Since we made sure that we don't have any extra parameters without default above, we clamp the prototype signature + # to the same number of parameters as the legacy one + prototype_signature = list(prototype_params.keys())[: len(legacy_signature)] + + assert prototype_signature == legacy_signature def check_call_consistency( @@ -819,7 +822,7 @@ def test_random_choice(self, probabilities): v2_transforms.Resize(256), legacy_transforms.CenterCrop(224), ], - probabilities=probabilities, + p=probabilities, ) legacy_transform = legacy_transforms.RandomChoice( [ @@ -1096,7 +1099,7 @@ def make_label(extra_dims, categories): v2_transforms.Compose( [ v2_transforms.RandomIoUCrop(), - v2_transforms.SanitizeBoundingBoxes(labels_getter=lambda sample: sample[1]["labels"]), + v2_transforms.SanitizeBoundingBox(labels_getter=lambda sample: sample[1]["labels"]), ] ), {"with_mask": False}, diff --git a/torchvision/__init__.py b/torchvision/__init__.py index f29da9cf644..eed24091a52 100644 --- a/torchvision/__init__.py +++ b/torchvision/__init__.py @@ -100,10 +100,11 @@ def _is_tracing(): _WARN_ABOUT_BETA_TRANSFORMS = True _BETA_TRANSFORMS_WARNING = ( "The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. " - "While we will try our best to maintain backward compatibility, " - "some APIs or behaviors might change without a deprecation cycle. " - "To help us improve these new features, please provide your feedback " - "here: https://github.com/pytorch/vision/issues/6753." + "While we do not expect major breaking changes, some APIs may still change " + "according to user feedback. Please submit any feedback you may have in " + "this issue: https://github.com/pytorch/vision/issues/6753, and you can also " + "check out https://github.com/pytorch/vision/issues/7319 to learn more about " + "the APIs that we suspect might involve future changes. " "You can silence this warning by calling torchvision.disable_beta_transform_warning()." ) diff --git a/torchvision/datapoints/_bounding_box.py b/torchvision/datapoints/_bounding_box.py index 1dc46f8f21a..11d42f171e4 100644 --- a/torchvision/datapoints/_bounding_box.py +++ b/torchvision/datapoints/_bounding_box.py @@ -1,21 +1,44 @@ from __future__ import annotations +from enum import Enum from typing import Any, List, Optional, Sequence, Tuple, Union import torch -from torchvision._utils import StrEnum from torchvision.transforms import InterpolationMode # TODO: this needs to be moved out of transforms from ._datapoint import _FillTypeJIT, Datapoint -class BoundingBoxFormat(StrEnum): - XYXY = StrEnum.auto() - XYWH = StrEnum.auto() - CXCYWH = StrEnum.auto() +class BoundingBoxFormat(Enum): + """[BETA] Coordinate format of a bounding box. + + Available formats are + + * ``XYXY`` + * ``XYWH`` + * ``CXCYWH`` + """ + + XYXY = "XYXY" + XYWH = "XYWH" + CXCYWH = "CXCYWH" class BoundingBox(Datapoint): + """[BETA] :class:`torch.Tensor` subclass for bounding boxes. + + Args: + data: Any data that can be turned into a tensor with :func:`torch.as_tensor`. + format (BoundingBoxFormat, str): Format of the bounding box. + spatial_size (two-tuple of ints): Height and width of the corresponding image or video. + dtype (torch.dtype, optional): Desired data type of the bounding box. 
If omitted, will be inferred from + ``data``. + device (torch.device, optional): Desired device of the bounding box. If omitted and ``data`` is a + :class:`torch.Tensor`, the device is taken from it. Otherwise, the bounding box is constructed on the CPU. + requires_grad (bool, optional): Whether autograd should record operations on the bounding box. If omitted and + ``data`` is a :class:`torch.Tensor`, the value is taken from it. Otherwise, defaults to ``False``. + """ + format: BoundingBoxFormat spatial_size: Tuple[int, int] @@ -39,7 +62,7 @@ def __new__( tensor = cls._to_tensor(data, dtype=dtype, device=device, requires_grad=requires_grad) if isinstance(format, str): - format = BoundingBoxFormat.from_str(format.upper()) + format = BoundingBoxFormat[format.upper()] return cls._wrap(tensor, format=format, spatial_size=spatial_size) @@ -52,6 +75,20 @@ def wrap_like( format: Optional[BoundingBoxFormat] = None, spatial_size: Optional[Tuple[int, int]] = None, ) -> BoundingBox: + """Wrap a :class:`torch.Tensor` as :class:`BoundingBox` from a reference. + + Args: + other (BoundingBox): Reference bounding box. + tensor (Tensor): Tensor to be wrapped as :class:`BoundingBox` + format (BoundingBoxFormat, str, optional): Format of the bounding box. If omitted, it is taken from the + reference. + spatial_size (two-tuple of ints, optional): Height and width of the corresponding image or video. If + omitted, it is taken from the reference. + + """ + if isinstance(format, str): + format = BoundingBoxFormat[format.upper()] + return cls._wrap( tensor, format=format if format is not None else other.format, diff --git a/torchvision/datapoints/_image.py b/torchvision/datapoints/_image.py index 21dfe5a5cd6..e47a6c10fc3 100644 --- a/torchvision/datapoints/_image.py +++ b/torchvision/datapoints/_image.py @@ -10,6 +10,19 @@ class Image(Datapoint): + """[BETA] :class:`torch.Tensor` subclass for images. + + Args: + data (tensor-like, PIL.Image.Image): Any data that can be turned into a tensor with :func:`torch.as_tensor` as + well as PIL images. + dtype (torch.dtype, optional): Desired data type of the bounding box. If omitted, will be inferred from + ``data``. + device (torch.device, optional): Desired device of the bounding box. If omitted and ``data`` is a + :class:`torch.Tensor`, the device is taken from it. Otherwise, the bounding box is constructed on the CPU. + requires_grad (bool, optional): Whether autograd should record operations on the bounding box. If omitted and + ``data`` is a :class:`torch.Tensor`, the value is taken from it. Otherwise, defaults to ``False``. + """ + @classmethod def _wrap(cls, tensor: torch.Tensor) -> Image: image = tensor.as_subclass(cls) diff --git a/torchvision/datapoints/_mask.py b/torchvision/datapoints/_mask.py index bb70ec12224..0135d793d32 100644 --- a/torchvision/datapoints/_mask.py +++ b/torchvision/datapoints/_mask.py @@ -10,6 +10,19 @@ class Mask(Datapoint): + """[BETA] :class:`torch.Tensor` subclass for segmentation and detection masks. + + Args: + data (tensor-like, PIL.Image.Image): Any data that can be turned into a tensor with :func:`torch.as_tensor` as + well as PIL images. + dtype (torch.dtype, optional): Desired data type of the bounding box. If omitted, will be inferred from + ``data``. + device (torch.device, optional): Desired device of the bounding box. If omitted and ``data`` is a + :class:`torch.Tensor`, the device is taken from it. Otherwise, the bounding box is constructed on the CPU. 
+ requires_grad (bool, optional): Whether autograd should record operations on the bounding box. If omitted and + ``data`` is a :class:`torch.Tensor`, the value is taken from it. Otherwise, defaults to ``False``. + """ + @classmethod def _wrap(cls, tensor: torch.Tensor) -> Mask: return tensor.as_subclass(cls) diff --git a/torchvision/datapoints/_video.py b/torchvision/datapoints/_video.py index ab51c10233d..a6fbe2bd473 100644 --- a/torchvision/datapoints/_video.py +++ b/torchvision/datapoints/_video.py @@ -9,6 +9,18 @@ class Video(Datapoint): + """[BETA] :class:`torch.Tensor` subclass for videos. + + Args: + data (tensor-like): Any data that can be turned into a tensor with :func:`torch.as_tensor`. + dtype (torch.dtype, optional): Desired data type of the bounding box. If omitted, will be inferred from + ``data``. + device (torch.device, optional): Desired device of the bounding box. If omitted and ``data`` is a + :class:`torch.Tensor`, the device is taken from it. Otherwise, the bounding box is constructed on the CPU. + requires_grad (bool, optional): Whether autograd should record operations on the bounding box. If omitted and + ``data`` is a :class:`torch.Tensor`, the value is taken from it. Otherwise, defaults to ``False``. + """ + @classmethod def _wrap(cls, tensor: torch.Tensor) -> Video: video = tensor.as_subclass(cls) diff --git a/torchvision/transforms/transforms.py b/torchvision/transforms/transforms.py index 90cb0374eee..95eb9199ef3 100644 --- a/torchvision/transforms/transforms.py +++ b/torchvision/transforms/transforms.py @@ -105,7 +105,9 @@ def __repr__(self) -> str: class ToTensor: - """Convert a ``PIL Image`` or ``numpy.ndarray`` to tensor. This transform does not support torchscript. + """Convert a PIL Image or ndarray to tensor and scale the values accordingly. + + This transform does not support torchscript. Converts a PIL Image or numpy.ndarray (H x W x C) in the range [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] @@ -139,7 +141,9 @@ def __repr__(self) -> str: class PILToTensor: - """Convert a ``PIL Image`` to a tensor of the same type. This transform does not support torchscript. + """Convert a PIL Image to a tensor of the same type - this does not scale values. + + This transform does not support torchscript. Converts a PIL Image (H x W x C) to a Tensor of shape (C x H x W). """ @@ -166,7 +170,8 @@ def __repr__(self) -> str: class ConvertImageDtype(torch.nn.Module): - """Convert a tensor image to the given ``dtype`` and scale the values accordingly + """Convert a tensor image to the given ``dtype`` and scale the values accordingly. + This function does not support PIL Image. Args: @@ -194,7 +199,9 @@ def forward(self, image): class ToPILImage: - """Convert a tensor or an ndarray to PIL Image. This transform does not support torchscript. + """Convert a tensor or an ndarray to PIL Image - this does not scale values. + + This transform does not support torchscript. Converts a torch.*Tensor of shape C x H x W or a numpy ndarray of shape H x W x C to a PIL Image while preserving the value range. 
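The docstring updates above distinguish the conversion transforms that rescale values from those that keep the raw range. Below is a minimal illustrative sketch of that distinction, assuming a small uint8 RGB PIL image as input; the variable names are purely for illustration.

import torch
from PIL import Image
from torchvision import transforms

pil_image = Image.new("RGB", (4, 4), color=(255, 0, 0))

kept = transforms.PILToTensor()(pil_image)  # uint8 tensor, values stay in [0, 255]
scaled = transforms.ToTensor()(pil_image)  # float32 tensor, values scaled to [0.0, 1.0]
rescaled = transforms.ConvertImageDtype(torch.float32)(kept)  # uint8 -> float32, also scaled to [0.0, 1.0]
back = transforms.ToPILImage()(kept)  # back to a PIL image, value range preserved

assert kept.dtype == torch.uint8 and int(kept.max()) == 255
assert scaled.dtype == torch.float32 and float(scaled.max()) == 1.0
assert torch.allclose(scaled, rescaled)
assert isinstance(back, Image.Image)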
diff --git a/torchvision/transforms/v2/__init__.py b/torchvision/transforms/v2/__init__.py index 7ad72c00934..6573446a33a 100644 --- a/torchvision/transforms/v2/__init__.py +++ b/torchvision/transforms/v2/__init__.py @@ -40,7 +40,7 @@ TenCrop, ) from ._meta import ClampBoundingBox, ConvertBoundingBoxFormat, ConvertDtype, ConvertImageDtype -from ._misc import GaussianBlur, Identity, Lambda, LinearTransformation, Normalize, SanitizeBoundingBoxes, ToDtype +from ._misc import GaussianBlur, Identity, Lambda, LinearTransformation, Normalize, SanitizeBoundingBox, ToDtype from ._temporal import UniformTemporalSubsample from ._type_conversion import PILToTensor, ToImagePIL, ToImageTensor, ToPILImage diff --git a/torchvision/transforms/v2/_augment.py b/torchvision/transforms/v2/_augment.py index 157605d6f3c..937e3508a87 100644 --- a/torchvision/transforms/v2/_augment.py +++ b/torchvision/transforms/v2/_augment.py @@ -13,6 +13,38 @@ class RandomErasing(_RandomApplyTransform): + """[BETA] Randomly select a rectangle region in the input image or video and erase its pixels. + + .. v2betastatus:: RandomErasing transform + + This transform does not support PIL Image. + 'Random Erasing Data Augmentation' by Zhong et al. See https://arxiv.org/abs/1708.04896 + + Args: + p (float, optional): probability that the random erasing operation will be performed. + scale (tuple of float, optional): range of proportion of erased area against input image. + ratio (tuple of float, optional): range of aspect ratio of erased area. + value (number or tuple of numbers): erasing value. Default is 0. If a single int, it is used to + erase all pixels. If a tuple of length 3, it is used to erase + R, G, B channels respectively. + If a str of 'random', erasing each pixel with random values. + inplace (bool, optional): boolean to make this transform inplace. Default set to False. + + Returns: + Erased input. + + Example: + >>> from torchvision.transforms import v2 as transforms + >>> + >>> transform = transforms.Compose([ + >>> transforms.RandomHorizontalFlip(), + >>> transforms.PILToTensor(), + >>> transforms.ConvertImageDtype(torch.float), + >>> transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + >>> transforms.RandomErasing(), + >>> ]) + """ + _v1_transform_cls = _transforms.RandomErasing def _extract_params_for_v1_transform(self) -> Dict[str, Any]: diff --git a/torchvision/transforms/v2/_auto_augment.py b/torchvision/transforms/v2/_auto_augment.py index b4791755dc5..34c0ced43d2 100644 --- a/torchvision/transforms/v2/_auto_augment.py +++ b/torchvision/transforms/v2/_auto_augment.py @@ -162,6 +162,26 @@ def _apply_image_or_video_transform( class AutoAugment(_AutoAugmentBase): + r"""[BETA] AutoAugment data augmentation method based on + `"AutoAugment: Learning Augmentation Strategies from Data" `_. + + .. v2betastatus:: AutoAugment transform + + This transformation works on images and videos only. + + If the input is :class:`torch.Tensor`, it should be of type ``torch.uint8``, and it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + policy (AutoAugmentPolicy, optional): Desired policy enum defined by + :class:`torchvision.transforms.autoaugment.AutoAugmentPolicy`. Default is ``AutoAugmentPolicy.IMAGENET``. + interpolation (InterpolationMode, optional): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. 
Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + fill (sequence or number, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + """ _v1_transform_cls = _transforms.AutoAugment _AUGMENTATION_SPACE = { @@ -318,6 +338,29 @@ def forward(self, *inputs: Any) -> Any: class RandAugment(_AutoAugmentBase): + r"""[BETA] RandAugment data augmentation method based on + `"RandAugment: Practical automated data augmentation with a reduced search space" + `_. + + .. v2betastatus:: RandAugment transform + + This transformation works on images and videos only. + + If the input is :class:`torch.Tensor`, it should be of type ``torch.uint8``, and it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + num_ops (int, optional): Number of augmentation transformations to apply sequentially. + magnitude (int, optional): Magnitude for all the transformations. + num_magnitude_bins (int, optional): The number of different magnitude values. + interpolation (InterpolationMode, optional): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + fill (sequence or number, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + """ + _v1_transform_cls = _transforms.RandAugment _AUGMENTATION_SPACE = { "Identity": (lambda num_bins, height, width: None, False), @@ -379,6 +422,26 @@ def forward(self, *inputs: Any) -> Any: class TrivialAugmentWide(_AutoAugmentBase): + r"""[BETA] Dataset-independent data-augmentation with TrivialAugment Wide, as described in + `"TrivialAugment: Tuning-free Yet State-of-the-Art Data Augmentation" `_. + + .. v2betastatus:: TrivialAugmentWide transform + + This transformation works on images and videos only. + + If the input is :class:`torch.Tensor`, it should be of type ``torch.uint8``, and it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + num_magnitude_bins (int, optional): The number of different magnitude values. + interpolation (InterpolationMode, optional): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + fill (sequence or number, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + """ + _v1_transform_cls = _transforms.TrivialAugmentWide _AUGMENTATION_SPACE = { "Identity": (lambda num_bins, height, width: None, False), @@ -430,6 +493,31 @@ def forward(self, *inputs: Any) -> Any: class AugMix(_AutoAugmentBase): + r"""[BETA] AugMix data augmentation method based on + `"AugMix: A Simple Data Processing Method to Improve Robustness and Uncertainty" `_. + + .. v2betastatus:: AugMix transform + + This transformation works on images and videos only. 
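As a quick, hedged illustration of the input contract these auto-augmentation docstrings describe (a ``torch.uint8`` tensor of shape ``[..., 1 or 3, H, W]``), a minimal sketch that is not part of the patch:

import torch
from torchvision.transforms import v2

img = torch.randint(0, 256, (3, 224, 224), dtype=torch.uint8)  # uint8, [C, H, W]

augment = v2.RandAugment(num_ops=2, magnitude=9)
out = augment(img)
print(out.shape, out.dtype)  # torch.Size([3, 224, 224]) torch.uint8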
+ + If the input is :class:`torch.Tensor`, it should be of type ``torch.uint8``, and it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + severity (int, optional): The severity of base augmentation operators. Default is ``3``. + mixture_width (int, optional): The number of augmentation chains. Default is ``3``. + chain_depth (int, optional): The depth of augmentation chains. A negative value denotes stochastic depth sampled from the interval [1, 3]. + Default is ``-1``. + alpha (float, optional): The hyperparameter for the probability distributions. Default is ``1.0``. + all_ops (bool, optional): Use all operations (including brightness, contrast, color and sharpness). Default is ``True``. + interpolation (InterpolationMode, optional): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + fill (sequence or number, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + """ + _v1_transform_cls = _transforms.AugMix _PARTIAL_AUGMENTATION_SPACE = { diff --git a/torchvision/transforms/v2/_color.py b/torchvision/transforms/v2/_color.py index 64796e16ca4..4ad534c988b 100644 --- a/torchvision/transforms/v2/_color.py +++ b/torchvision/transforms/v2/_color.py @@ -11,6 +11,17 @@ class Grayscale(Transform): + """[BETA] Convert images or videos to grayscale. + + .. v2betastatus:: Grayscale transform + + If the input is a :class:`torch.Tensor`, it is expected + to have [..., 3 or 1, H, W] shape, where ... means an arbitrary number of leading dimensions + + Args: + num_output_channels (int): (1 or 3) number of channels desired for output image + """ + _v1_transform_cls = _transforms.Grayscale _transformed_types = ( @@ -29,6 +40,19 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomGrayscale(_RandomApplyTransform): + """[BETA] Randomly convert image or videos to grayscale with a probability of p (default 0.1). + + .. v2betastatus:: RandomGrayscale transform + + If the input is a :class:`torch.Tensor`, it is expected to have [..., 3 or 1, H, W] shape, + where ... means an arbitrary number of leading dimensions + + The output has the same number of channels as the input. + + Args: + p (float): probability that image should be converted to grayscale. + """ + _v1_transform_cls = _transforms.RandomGrayscale _transformed_types = ( @@ -50,6 +74,32 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class ColorJitter(Transform): + """[BETA] Randomly change the brightness, contrast, saturation and hue of an image or video. + + .. v2betastatus:: ColorJitter transform + + If the input is a :class:`torch.Tensor`, it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, mode "1", "I", "F" and modes with transparency (alpha channel) are not supported. + + Args: + brightness (float or tuple of float (min, max)): How much to jitter brightness. + brightness_factor is chosen uniformly from [max(0, 1 - brightness), 1 + brightness] + or the given [min, max]. Should be non negative numbers. + contrast (float or tuple of float (min, max)): How much to jitter contrast. 
+ contrast_factor is chosen uniformly from [max(0, 1 - contrast), 1 + contrast] + or the given [min, max]. Should be non-negative numbers. + saturation (float or tuple of float (min, max)): How much to jitter saturation. + saturation_factor is chosen uniformly from [max(0, 1 - saturation), 1 + saturation] + or the given [min, max]. Should be non-negative numbers. + hue (float or tuple of float (min, max)): How much to jitter hue. + hue_factor is chosen uniformly from [-hue, hue] or the given [min, max]. + Should have 0 <= hue <= 0.5 or -0.5 <= min <= max <= 0.5. + To jitter hue, the pixel values of the input image have to be non-negative for conversion to HSV space; + thus it does not work if you normalize your image to an interval with negative values, + or use an interpolation that generates negative values before using this function. + """ + _v1_transform_cls = _transforms.ColorJitter def _extract_params_for_v1_transform(self) -> Dict[str, Any]: @@ -129,6 +179,31 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: # TODO: This class seems to be untested class RandomPhotometricDistort(Transform): + """[BETA] Randomly distorts the image or video as used in `SSD: Single Shot + MultiBox Detector `_. + + .. v2betastatus:: RandomPhotometricDistort transform + + This transform relies on :class:`~torchvision.transforms.v2.ColorJitter` + under the hood to adjust the contrast, saturation, hue, brightness, and also + randomly permutes channels. + + Args: + brightness (tuple of float (min, max), optional): How much to jitter brightness. + brightness_factor is chosen uniformly from [min, max]. Should be non-negative numbers. + contrast (tuple of float (min, max), optional): How much to jitter contrast. + contrast_factor is chosen uniformly from [min, max]. Should be non-negative numbers. + saturation (tuple of float (min, max), optional): How much to jitter saturation. + saturation_factor is chosen uniformly from [min, max]. Should be non-negative numbers. + hue (tuple of float (min, max), optional): How much to jitter hue. + hue_factor is chosen uniformly from [min, max]. Should have -0.5 <= min <= max <= 0.5. + To jitter hue, the pixel values of the input image have to be non-negative for conversion to HSV space; + thus it does not work if you normalize your image to an interval with negative values, + or use an interpolation that generates negative values before using this function. + p (float, optional): probability that each distortion operation (contrast, saturation, ...) is applied. + Default is 0.5. + """ + _transformed_types = ( datapoints.Image, PIL.Image.Image, @@ -138,10 +213,10 @@ class RandomPhotometricDistort(Transform): def __init__( self, + brightness: Tuple[float, float] = (0.875, 1.125), contrast: Tuple[float, float] = (0.5, 1.5), saturation: Tuple[float, float] = (0.5, 1.5), hue: Tuple[float, float] = (-0.05, 0.05), - brightness: Tuple[float, float] = (0.875, 1.125), p: float = 0.5, ): super().__init__() @@ -205,6 +280,18 @@ def _transform( class RandomEqualize(_RandomApplyTransform): + """[BETA] Equalize the histogram of the given image or video with a given probability. + + .. v2betastatus:: RandomEqualize transform + + If the input is a :class:`torch.Tensor`, it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "P", "L" or "RGB". + + Args: + p (float): probability of the image being equalized.
Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomEqualize def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: @@ -212,6 +299,18 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomInvert(_RandomApplyTransform): + """[BETA] Inverts the colors of the given image or video with a given probability. + + .. v2betastatus:: RandomInvert transform + + If img is a Tensor, it is expected to be in [..., 1 or 3, H, W] format, + where ... means it can have an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + p (float): probability of the image being color inverted. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomInvert def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: @@ -219,6 +318,20 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomPosterize(_RandomApplyTransform): + """[BETA] Posterize the image or video with a given probability by reducing the + number of bits for each color channel. + + .. v2betastatus:: RandomPosterize transform + + If the input is a :class:`torch.Tensor`, it should be of type torch.uint8, + and it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + bits (int): number of bits to keep for each channel (0-8) + p (float): probability of the image being posterized. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomPosterize def __init__(self, bits: int, p: float = 0.5) -> None: @@ -230,6 +343,20 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomSolarize(_RandomApplyTransform): + """[BETA] Solarize the image or video with a given probability by inverting all pixel + values above a threshold. + + .. v2betastatus:: RandomSolarize transform + + If img is a Tensor, it is expected to be in [..., 1 or 3, H, W] format, + where ... means it can have an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + threshold (float): all pixels equal or above this value are inverted. + p (float): probability of the image being solarized. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomSolarize def __init__(self, threshold: float, p: float = 0.5) -> None: @@ -241,6 +368,18 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomAutocontrast(_RandomApplyTransform): + """[BETA] Autocontrast the pixels of the given image or video with a given probability. + + .. v2betastatus:: RandomAutocontrast transform + + If the input is a :class:`torch.Tensor`, it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + p (float): probability of the image being autocontrasted. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomAutocontrast def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: @@ -248,6 +387,20 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomAdjustSharpness(_RandomApplyTransform): + """[BETA] Adjust the sharpness of the image or video with a given probability. + + .. v2betastatus:: RandomAdjustSharpness transform + + If the input is a :class:`torch.Tensor`, + it is expected to have [..., 1 or 3, H, W] shape, where ... 
means an arbitrary number of leading dimensions. + + Args: + sharpness_factor (float): How much to adjust the sharpness. Can be + any non-negative number. 0 gives a blurred image, 1 gives the + original image while 2 increases the sharpness by a factor of 2. + p (float): probability of the image being sharpened. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomAdjustSharpness def __init__(self, sharpness_factor: float, p: float = 0.5) -> None: diff --git a/torchvision/transforms/v2/_container.py b/torchvision/transforms/v2/_container.py index 555010fda1e..fffef4157bd 100644 --- a/torchvision/transforms/v2/_container.py +++ b/torchvision/transforms/v2/_container.py @@ -1,4 +1,3 @@ -import warnings from typing import Any, Callable, Dict, List, Optional, Sequence, Union import torch @@ -9,6 +8,37 @@ class Compose(Transform): + """[BETA] Composes several transforms together. + + .. v2betastatus:: Compose transform + + This transform does not support torchscript. + Please, see the note below. + + Args: + transforms (list of ``Transform`` objects): list of transforms to compose. + + Example: + >>> transforms.Compose([ + >>> transforms.CenterCrop(10), + >>> transforms.PILToTensor(), + >>> transforms.ConvertImageDtype(torch.float), + >>> ]) + + .. note:: + In order to script the transformations, please use ``torch.nn.Sequential`` as below. + + >>> transforms = torch.nn.Sequential( + >>> transforms.CenterCrop(10), + >>> transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + >>> ) + >>> scripted_transforms = torch.jit.script(transforms) + + Make sure to use only scriptable transformations, i.e. that work with ``torch.Tensor``, does not require + `lambda` functions or ``PIL.Image``. + + """ + def __init__(self, transforms: Sequence[Callable]) -> None: super().__init__() if not isinstance(transforms, Sequence): @@ -29,6 +59,27 @@ def extra_repr(self) -> str: class RandomApply(Transform): + """[BETA] Apply randomly a list of transformations with a given probability. + + .. v2betastatus:: RandomApply transform + + .. note:: + In order to script the transformation, please use ``torch.nn.ModuleList`` as input instead of list/tuple of + transforms as shown below: + + >>> transforms = transforms.RandomApply(torch.nn.ModuleList([ + >>> transforms.ColorJitter(), + >>> ]), p=0.3) + >>> scripted_transforms = torch.jit.script(transforms) + + Make sure to use only scriptable transformations, i.e. that work with ``torch.Tensor``, does not require + `lambda` functions or ``PIL.Image``. + + Args: + transforms (sequence or torch.nn.Module): list of transformations + p (float): probability of applying the list of transforms + """ + _v1_transform_cls = _transforms.RandomApply def __init__(self, transforms: Union[Sequence[Callable], nn.ModuleList], p: float = 0.5) -> None: @@ -63,42 +114,55 @@ def extra_repr(self) -> str: class RandomChoice(Transform): + """[BETA] Apply single transformation randomly picked from a list. + + .. v2betastatus:: RandomChoice transform + + This transform does not support torchscript. + + Args: + transforms (sequence or torch.nn.Module): list of transformations + p (list of floats or None, optional): probability of each transform being picked. + If ``p`` doesn't sum to 1, it is automatically normalized. If ``None`` + (default), all transforms have the same probability. 
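The normalization behaviour of ``p`` described above (and the rename from ``probabilities`` shown in the next hunk) can be exercised like this; a minimal sketch, not part of the patch:

import torch
from torchvision.transforms import v2

# The weights don't need to sum to 1; RandomChoice normalizes them internally.
choice = v2.RandomChoice(
    [v2.RandomHorizontalFlip(p=1.0), v2.RandomVerticalFlip(p=1.0)],
    p=[3, 1],  # horizontal flip is picked three times as often as vertical flip
)

img = torch.randint(0, 256, (3, 32, 32), dtype=torch.uint8)
out = choice(img)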
+ """ + def __init__( self, transforms: Sequence[Callable], - probabilities: Optional[List[float]] = None, p: Optional[List[float]] = None, ) -> None: if not isinstance(transforms, Sequence): raise TypeError("Argument transforms should be a sequence of callables") - if p is not None: - warnings.warn( - "Argument p is deprecated and will be removed in a future release. " - "Please use probabilities argument instead." - ) - probabilities = p - - if probabilities is None: - probabilities = [1] * len(transforms) - elif len(probabilities) != len(transforms): - raise ValueError( - f"The number of probabilities doesn't match the number of transforms: " - f"{len(probabilities)} != {len(transforms)}" - ) + + if p is None: + p = [1] * len(transforms) + elif len(p) != len(transforms): + raise ValueError(f"Length of p doesn't match the number of transforms: {len(p)} != {len(transforms)}") super().__init__() self.transforms = transforms - total = sum(probabilities) - self.probabilities = [prob / total for prob in probabilities] + total = sum(p) + self.p = [prob / total for prob in p] def forward(self, *inputs: Any) -> Any: - idx = int(torch.multinomial(torch.tensor(self.probabilities), 1)) + idx = int(torch.multinomial(torch.tensor(self.p), 1)) transform = self.transforms[idx] return transform(*inputs) class RandomOrder(Transform): + """[BETA] Apply a list of transformations in a random order. + + .. v2betastatus:: RandomOrder transform + + This transform does not support torchscript. + + Args: + transforms (sequence or torch.nn.Module): list of transformations + """ + def __init__(self, transforms: Sequence[Callable]) -> None: if not isinstance(transforms, Sequence): raise TypeError("Argument transforms should be a sequence of callables") diff --git a/torchvision/transforms/v2/_deprecated.py b/torchvision/transforms/v2/_deprecated.py index bfb0d06239f..e900e853d2b 100644 --- a/torchvision/transforms/v2/_deprecated.py +++ b/torchvision/transforms/v2/_deprecated.py @@ -10,6 +10,31 @@ class ToTensor(Transform): + """[BETA] Convert a PIL Image or ndarray to tensor and scale the values accordingly. + + .. v2betastatus:: ToTensor transform + + .. warning:: + :class:`v2.ToTensor` is deprecated and will be removed in a future release. + Please use instead ``transforms.Compose([transforms.ToImageTensor(), transforms.ConvertImageDtype()])``. + + This transform does not support torchscript. + + + Converts a PIL Image or numpy.ndarray (H x W x C) in the range + [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] + if the PIL Image belongs to one of the modes (L, LA, P, I, F, RGB, YCbCr, RGBA, CMYK, 1) + or if the numpy.ndarray has dtype = np.uint8 + + In the other cases, tensors are returned without scaling. + + .. note:: + Because the input image is scaled to [0.0, 1.0], this transformation should not be used when + transforming target image masks. See the `references`_ for implementing the transforms for image masks. + + .. _references: https://github.com/pytorch/vision/tree/main/references/segmentation + """ + _transformed_types = (PIL.Image.Image, np.ndarray) def __init__(self) -> None: diff --git a/torchvision/transforms/v2/_geometry.py b/torchvision/transforms/v2/_geometry.py index f1eed87b9c0..59791c30b9d 100644 --- a/torchvision/transforms/v2/_geometry.py +++ b/torchvision/transforms/v2/_geometry.py @@ -26,6 +26,19 @@ class RandomHorizontalFlip(_RandomApplyTransform): + """[BETA] Horizontally flip the input with a given probability. + + .. 
v2betastatus:: RandomHorizontalFlip transform + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. + + Args: + p (float, optional): probability of the input being flipped. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomHorizontalFlip def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: @@ -33,6 +46,19 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomVerticalFlip(_RandomApplyTransform): + """[BETA] Vertically flip the input with a given probability. + + .. v2betastatus:: RandomVerticalFlip transform + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. + + Args: + p (float, optional): probability of the input being flipped. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomVerticalFlip def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: @@ -40,6 +66,64 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class Resize(Transform): + """[BETA] Resize the input to the given size. + + .. v2betastatus:: Resize transform + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. + + .. warning:: + The output image might be different depending on its type: when downsampling, the interpolation of PIL images + and tensors is slightly different, because PIL applies antialiasing. This may lead to significant differences + in the performance of a network. Therefore, it is preferable to train and serve a model with the same input + types. See also below the ``antialias`` parameter, which can help making the output of PIL images and tensors + closer. + + Args: + size (sequence or int): Desired output size. If size is a sequence like + (h, w), output size will be matched to this. If size is an int, + smaller edge of the image will be matched to this number. + i.e, if height > width, then image will be rescaled to + (size * height / width, size). + + .. note:: + In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``. + interpolation (InterpolationMode, optional): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``, + ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. 
+ max_size (int, optional): The maximum allowed for the longer edge of + the resized image: if the longer edge of the image is greater + than ``max_size`` after being resized according to ``size``, then + the image is resized again so that the longer edge is equal to + ``max_size``. As a result, ``size`` might be overruled, i.e. the + smaller edge may be shorter than ``size``. This is only supported + if ``size`` is an int (or a sequence of length 1 in torchscript + mode). + antialias (bool, optional): Whether to apply antialiasing. + It only affects **tensors** with bilinear or bicubic modes and it is + ignored otherwise: on PIL images, antialiasing is always applied on + bilinear or bicubic modes; on other modes (for PIL images and + tensors), antialiasing makes no sense and this parameter is ignored. + Possible values are: + + - ``True``: will apply antialiasing for bilinear or bicubic modes. + Other mode aren't affected. This is probably what you want to use. + - ``False``: will not apply antialiasing for tensors on any mode. PIL + images are still antialiased on bilinear or bicubic modes, because + PIL doesn't support no antialias. + - ``None``: equivalent to ``False`` for tensors and ``True`` for + PIL images. This value exists for legacy reasons and you probably + don't want to use it unless you really know what you are doing. + + The current default is ``None`` **but will change to** ``True`` **in + v0.17** for the PIL and Tensor backends to be consistent. + """ + _v1_transform_cls = _transforms.Resize def __init__( @@ -76,6 +160,23 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class CenterCrop(Transform): + """[BETA] Crop the input at the center. + + .. v2betastatus:: CenterCrop transform + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. + + If image size is smaller than output size along any edge, image is padded with 0 and then center cropped. + + Args: + size (sequence or int): Desired output size of the crop. If size is an + int instead of sequence like (h, w), a square crop (size, size) is + made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + """ + _v1_transform_cls = _transforms.CenterCrop def __init__(self, size: Union[int, Sequence[int]]): @@ -87,6 +188,55 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomResizedCrop(Transform): + """[BETA] Crop a random portion of the input and resize it to a given size. + + .. v2betastatus:: RandomResizedCrop transform + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. + + A crop of the original input is made: the crop has a random area (H * W) + and a random aspect ratio. This crop is finally resized to the given + size. This is popularly used to train the Inception networks. + + Args: + size (int or sequence): expected output size of the crop, for each edge. 
If size is an + int instead of sequence like (h, w), a square output size ``(size, size)`` is + made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + + .. note:: + In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``. + scale (tuple of float, optional): Specifies the lower and upper bounds for the random area of the crop, + before resizing. The scale is defined with respect to the area of the original image. + ratio (tuple of float, optional): lower and upper bounds for the random aspect ratio of the crop, before + resizing. + interpolation (InterpolationMode, optional): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``, + ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. + antialias (bool, optional): Whether to apply antialiasing. + It only affects **tensors** with bilinear or bicubic modes and it is + ignored otherwise: on PIL images, antialiasing is always applied on + bilinear or bicubic modes; on other modes (for PIL images and + tensors), antialiasing makes no sense and this parameter is ignored. + Possible values are: + + - ``True``: will apply antialiasing for bilinear or bicubic modes. + Other mode aren't affected. This is probably what you want to use. + - ``False``: will not apply antialiasing for tensors on any mode. PIL + images are still antialiased on bilinear or bicubic modes, because + PIL doesn't support no antialias. + - ``None``: equivalent to ``False`` for tensors and ``True`` for + PIL images. This value exists for legacy reasons and you probably + don't want to use it unless you really know what you are doing. + + The current default is ``None`` **but will change to** ``True`` **in + v0.17** for the PIL and Tensor backends to be consistent. + """ + _v1_transform_cls = _transforms.RandomResizedCrop def __init__( @@ -164,7 +314,24 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class FiveCrop(Transform): - """ + """[BETA] Crop the image or video into four corners and the central crop. + + .. v2betastatus:: FiveCrop transform + + If the input is a :class:`torch.Tensor` or a :class:`~torchvision.datapoints.Image` or a + :class:`~torchvision.datapoints.Video` it can have arbitrary number of leading batch dimensions. + For example, the image can have ``[..., C, H, W]`` shape. + + .. Note:: + This transform returns a tuple of images and there may be a mismatch in the number of + inputs and targets your Dataset returns. See below for an example of how to deal with + this. + + Args: + size (sequence or int): Desired output size of the crop. If size is an ``int`` + instead of sequence like (h, w), a square crop of size (size, size) is made. + If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + Example: >>> class BatchMultiCrop(transforms.Transform): ... def forward(self, sample: Tuple[Tuple[Union[datapoints.Image, datapoints.Video], ...], int]): @@ -209,8 +376,27 @@ def _check_inputs(self, flat_inputs: List[Any]) -> None: class TenCrop(Transform): - """ + """[BETA] Crop the image or video into four corners and the central crop plus the flipped version of + these (horizontal flipping is used by default). + + .. 
v2betastatus:: TenCrop transform + + If the input is a :class:`torch.Tensor` or a :class:`~torchvision.datapoints.Image` or a + :class:`~torchvision.datapoints.Video` it can have arbitrary number of leading batch dimensions. + For example, the image can have ``[..., C, H, W]`` shape. + See :class:`~torchvision.transforms.v2.FiveCrop` for an example. + + .. Note:: + This transform returns a tuple of images and there may be a mismatch in the number of + inputs and targets your Dataset returns. See below for an example of how to deal with + this. + + Args: + size (sequence or int): Desired output size of the crop. If size is an + int instead of sequence like (h, w), a square crop (size, size) is + made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + vertical_flip (bool, optional): Use vertical flipping instead of horizontal """ _v1_transform_cls = _transforms.TenCrop @@ -249,6 +435,45 @@ def _transform( class Pad(Transform): + """[BETA] Pad the input on all sides with the given "pad" value. + + .. v2betastatus:: Pad transform + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. + + Args: + padding (int or sequence): Padding on each border. If a single int is provided this + is used to pad all borders. If sequence of length 2 is provided this is the padding + on left/right and top/bottom respectively. If a sequence of length 4 is provided + this is the padding for the left, top, right and bottom borders respectively. + + .. note:: + In torchscript mode padding as single int is not supported, use a sequence of + length 1: ``[padding, ]``. + fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant. + Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively. + Fill value can be also a dictionary mapping data type to the fill value, e.g. + ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and + ``Mask`` will be filled with 0. + padding_mode (str, optional): Type of padding. Should be: constant, edge, reflect or symmetric. + Default is "constant". + + - constant: pads with a constant value, this value is specified with fill + + - edge: pads with the last value at the edge of the image. + + - reflect: pads with reflection of image without repeating the last value on the edge. + For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode + will result in [3, 2, 1, 2, 3, 4, 3, 2] + + - symmetric: pads with reflection of image repeating the last value on the edge. + For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode + will result in [2, 1, 1, 2, 3, 4, 4, 3] + """ + _v1_transform_cls = _transforms.Pad def _extract_params_for_v1_transform(self) -> Dict[str, Any]: @@ -284,6 +509,37 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomZoomOut(_RandomApplyTransform): + """[BETA] "Zoom out" transformation from + `"SSD: Single Shot MultiBox Detector" `_. + + .. v2betastatus:: RandomZoomOut transform + + This transformation randomly pads images, videos, bounding boxes and masks creating a zoom out effect. 
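The per-type ``fill`` dictionary introduced in the ``Pad`` docstring above can be used as follows; a minimal sketch assuming the 0.15 ``datapoints`` API referenced in this diff, not part of the patch:

import torch
from torchvision import datapoints
from torchvision.transforms import v2

img = datapoints.Image(torch.randint(0, 256, (3, 32, 32), dtype=torch.uint8))
mask = datapoints.Mask(torch.zeros(32, 32, dtype=torch.uint8))

# Pad the image with 127 and the mask with 0, mirroring the docstring example.
pad = v2.Pad(padding=4, fill={datapoints.Image: 127, datapoints.Mask: 0})
padded_img, padded_mask = pad(img, mask)
print(padded_img.shape, padded_mask.shape)  # torch.Size([3, 40, 40]) torch.Size([40, 40])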
+ Output spatial size is randomly sampled from original size up to a maximum size configured + with ``side_range`` parameter: + + .. code-block:: python + + r = uniform_sample(side_range[0], side_range[1]) + output_width = input_width * r + output_height = input_height * r + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. + + Args: + fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant. + Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively. + Fill value can be also a dictionary mapping data type to the fill value, e.g. + ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and + ``Mask`` will be filled with 0. + side_range (sequence of floats, optional): tuple of two floats defines minimum and maximum factors to + scale the input size. + p (float, optional): probability of the input being flipped. Default value is 0.5 + """ + def __init__( self, fill: Union[datapoints._FillType, Dict[Type, datapoints._FillType]] = 0, @@ -323,6 +579,39 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomRotation(Transform): + """[BETA] Rotate the input by angle. + + .. v2betastatus:: RandomRotation transform + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. + + Args: + degrees (sequence or number): Range of degrees to select from. + If degrees is a number instead of sequence like (min, max), the range of degrees + will be (-degrees, +degrees). + interpolation (InterpolationMode, optional): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. + expand (bool, optional): Optional expansion flag. + If true, expands the output to make it large enough to hold the entire rotated image. + If false or omitted, make the output image the same size as the input image. + Note that the expand flag assumes rotation around the center and no translation. + center (sequence, optional): Optional center of rotation, (x, y). Origin is the upper left corner. + Default is the center of the image. + fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant. + Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively. + Fill value can be also a dictionary mapping data type to the fill value, e.g. + ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and + ``Mask`` will be filled with 0. + + .. 
_filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters + + """ + _v1_transform_cls = _transforms.RandomRotation def __init__( @@ -330,8 +619,8 @@ def __init__( degrees: Union[numbers.Number, Sequence], interpolation: Union[InterpolationMode, int] = InterpolationMode.NEAREST, expand: bool = False, - fill: Union[datapoints._FillType, Dict[Type, datapoints._FillType]] = 0, center: Optional[List[float]] = None, + fill: Union[datapoints._FillType, Dict[Type, datapoints._FillType]] = 0, ) -> None: super().__init__() self.degrees = _setup_angle(degrees, name="degrees", req_sizes=(2,)) @@ -363,6 +652,47 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomAffine(Transform): + """[BETA] Random affine transformation the input keeping center invariant. + + .. v2betastatus:: RandomAffine transform + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. + + Args: + degrees (sequence or number): Range of degrees to select from. + If degrees is a number instead of sequence like (min, max), the range of degrees + will be (-degrees, +degrees). Set to 0 to deactivate rotations. + translate (tuple, optional): tuple of maximum absolute fraction for horizontal + and vertical translations. For example translate=(a, b), then horizontal shift + is randomly sampled in the range -img_width * a < dx < img_width * a and vertical shift is + randomly sampled in the range -img_height * b < dy < img_height * b. Will not translate by default. + scale (tuple, optional): scaling factor interval, e.g (a, b), then scale is + randomly sampled from the range a <= scale <= b. Will keep original scale by default. + shear (sequence or number, optional): Range of degrees to select from. + If shear is a number, a shear parallel to the x-axis in the range (-shear, +shear) + will be applied. Else if shear is a sequence of 2 values a shear parallel to the x-axis in the + range (shear[0], shear[1]) will be applied. Else if shear is a sequence of 4 values, + an x-axis shear in (shear[0], shear[1]) and y-axis shear in (shear[2], shear[3]) will be applied. + Will not apply shear by default. + interpolation (InterpolationMode, optional): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. + fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant. + Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively. + Fill value can be also a dictionary mapping data type to the fill value, e.g. + ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and + ``Mask`` will be filled with 0. + center (sequence, optional): Optional center of rotation, (x, y). Origin is the upper left corner. + Default is the center of the image. + + .. 
_filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters + + """ + _v1_transform_cls = _transforms.RandomAffine def __init__( @@ -443,6 +773,52 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomCrop(Transform): + """[BETA] Crop the input at a random location. + + .. v2betastatus:: RandomCrop transform + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. + + Args: + size (sequence or int): Desired output size of the crop. If size is an + int instead of sequence like (h, w), a square crop (size, size) is + made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + padding (int or sequence, optional): Optional padding on each border + of the image. Default is None. If a single int is provided this + is used to pad all borders. If sequence of length 2 is provided this is the padding + on left/right and top/bottom respectively. If a sequence of length 4 is provided + this is the padding for the left, top, right and bottom borders respectively. + + .. note:: + In torchscript mode padding as single int is not supported, use a sequence of + length 1: ``[padding, ]``. + pad_if_needed (boolean, optional): It will pad the image if smaller than the + desired size to avoid raising an exception. Since cropping is done + after padding, the padding seems to be done at a random offset. + fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant. + Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively. + Fill value can be also a dictionary mapping data type to the fill value, e.g. + ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and + ``Mask`` will be filled with 0. + padding_mode (str, optional): Type of padding. Should be: constant, edge, reflect or symmetric. + Default is constant. + + - constant: pads with a constant value, this value is specified with fill + + - edge: pads with the last value at the edge of the image. + + - reflect: pads with reflection of image without repeating the last value on the edge. + For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode + will result in [3, 2, 1, 2, 3, 4, 3, 2] + + - symmetric: pads with reflection of image repeating the last value on the edge. + For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode + will result in [2, 1, 1, 2, 3, 4, 4, 3] + """ + _v1_transform_cls = _transforms.RandomCrop def _extract_params_for_v1_transform(self) -> Dict[str, Any]: @@ -552,14 +928,38 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomPerspective(_RandomApplyTransform): + """[BETA] Perform a random perspective transformation of the input with a given probability. + + .. v2betastatus:: RandomPerspective transform + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. 
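To illustrate the ``pad_if_needed`` behaviour documented for ``RandomCrop`` above, a minimal sketch (not part of the patch):

import torch
from torchvision.transforms import v2

img = torch.randint(0, 256, (3, 48, 64), dtype=torch.uint8)

# The input height (48) is smaller than the crop size (56), so the image is
# padded first instead of raising an exception.
crop = v2.RandomCrop(size=(56, 56), pad_if_needed=True, fill=0, padding_mode="constant")
out = crop(img)
print(out.shape)  # torch.Size([3, 56, 56])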
+ + Args: + distortion_scale (float, optional): argument to control the degree of distortion and ranges from 0 to 1. + Default is 0.5. + p (float, optional): probability of the input being transformed. Default is 0.5. + interpolation (InterpolationMode, optional): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. + fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant. + Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively. + Fill value can be also a dictionary mapping data type to the fill value, e.g. + ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and + ``Mask`` will be filled with 0. + """ + _v1_transform_cls = _transforms.RandomPerspective def __init__( self, distortion_scale: float = 0.5, - fill: Union[datapoints._FillType, Dict[Type, datapoints._FillType]] = 0, - interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR, p: float = 0.5, + interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR, + fill: Union[datapoints._FillType, Dict[Type, datapoints._FillType]] = 0, ) -> None: super().__init__(p=p) @@ -614,14 +1014,54 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class ElasticTransform(Transform): + """[BETA] Transform the input with elastic transformations. + + .. v2betastatus:: ElasticTransform transform + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. + + Given alpha and sigma, it will generate displacement + vectors for all pixels based on random offsets. Alpha controls the strength + and sigma controls the smoothness of the displacements. + The displacements are added to an identity grid and the resulting grid is + used to transform the input. + + .. note:: + The implementation used to transform bounding boxes is approximate (not exact). + We construct an approximation of the inverse grid as ``inverse_grid = identity - displacement``. + This is not an exact inverse of the grid used to transform images, i.e. ``grid = identity + displacement``. + Our assumption is that ``displacement * displacement`` is small and can be ignored. + Large displacements would lead to large errors in the approximation. + + Applications: + Randomly transforms the morphology of objects in images and produces a + see-through-water-like effect. + + Args: + alpha (float or sequence of floats, optional): Magnitude of displacements. Default is 50.0. + sigma (float or sequence of floats, optional): Smoothness of displacements. Default is 5.0. + interpolation (InterpolationMode, optional): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
+ fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant. + Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively. + Fill value can be also a dictionary mapping data type to the fill value, e.g. + ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and + ``Mask`` will be filled with 0. + """ + _v1_transform_cls = _transforms.ElasticTransform def __init__( self, alpha: Union[float, Sequence[float]] = 50.0, sigma: Union[float, Sequence[float]] = 5.0, - fill: Union[datapoints._FillType, Dict[Type, datapoints._FillType]] = 0, interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR, + fill: Union[datapoints._FillType, Dict[Type, datapoints._FillType]] = 0, ) -> None: super().__init__() self.alpha = _setup_float_or_seq(alpha, "alpha", 2) @@ -665,6 +1105,34 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomIoUCrop(Transform): + """[BETA] Random IoU crop transformation from + `"SSD: Single Shot MultiBox Detector" `_. + + .. v2betastatus:: RandomIoUCrop transform + + This transformation requires an image or video data and ``datapoints.BoundingBox`` in the input. + + .. warning:: + In order to properly remove the bounding boxes below the IoU threshold, `RandomIoUCrop` + must be followed by :class:`~torchvision.transforms.v2.SanitizeBoundingBox`, either immediately + after or later in the transforms pipeline. + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. + + Args: + min_scale (float, optional): Minimum factors to scale the input size. + max_scale (float, optional): Maximum factors to scale the input size. + min_aspect_ratio (float, optional): Minimum aspect ratio for the cropped image or video. + max_aspect_ratio (float, optional): Maximum aspect ratio for the cropped image or video. + sampler_options (list of float, optional): List of minimal IoU (Jaccard) overlap between all the boxes and + a cropped image or video. Default, ``None`` which corresponds to ``[0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]`` + trials (int, optional): Number of trials to find a crop for a given value of minimal IoU (Jaccard) overlap. + Default, 40. + """ + def __init__( self, min_scale: float = 0.3, @@ -754,13 +1222,52 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: if isinstance(output, datapoints.BoundingBox): # We "mark" the invalid boxes as degenreate, and they can be - # removed by a later call to SanitizeBoundingBoxes() + # removed by a later call to SanitizeBoundingBox() output[~params["is_within_crop_area"]] = 0 return output class ScaleJitter(Transform): + """[BETA] Perform Large Scale Jitter on the input according to + `"Simple Copy-Paste is a Strong Data Augmentation Method for Instance Segmentation" `_. + + .. v2betastatus:: ScaleJitter transform + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. 
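The warning in the ``RandomIoUCrop`` docstring above can be illustrated with a small detection-style pipeline; a hedged sketch, not part of the patch, that assumes the 0.15 ``datapoints`` API and that the default ``labels_getter`` of ``SanitizeBoundingBox`` finds the ``"labels"`` key in the target dict:

import torch
from torchvision import datapoints
from torchvision.transforms import v2

img = datapoints.Image(torch.randint(0, 256, (3, 100, 100), dtype=torch.uint8))
boxes = datapoints.BoundingBox(
    [[10.0, 10.0, 50.0, 50.0], [60.0, 60.0, 90.0, 90.0]],
    format=datapoints.BoundingBoxFormat.XYXY,
    spatial_size=(100, 100),
)
target = {"boxes": boxes, "labels": torch.tensor([1, 2])}

# Boxes zeroed out by RandomIoUCrop are dropped by the follow-up SanitizeBoundingBox,
# together with their labels, as the warning above recommends.
pipeline = v2.Compose([v2.RandomIoUCrop(), v2.SanitizeBoundingBox()])
out_img, out_target = pipeline(img, target)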
+ + Args: + target_size (tuple of int): Target size. This parameter defines base scale for jittering, + e.g. ``min(target_size[0] / width, target_size[1] / height)``. + scale_range (tuple of float, optional): Minimum and maximum of the scale range. Default, ``(0.1, 2.0)``. + interpolation (InterpolationMode, optional): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``, + ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. + antialias (bool, optional): Whether to apply antialiasing. + It only affects **tensors** with bilinear or bicubic modes and it is + ignored otherwise: on PIL images, antialiasing is always applied on + bilinear or bicubic modes; on other modes (for PIL images and + tensors), antialiasing makes no sense and this parameter is ignored. + Possible values are: + + - ``True``: will apply antialiasing for bilinear or bicubic modes. + Other mode aren't affected. This is probably what you want to use. + - ``False``: will not apply antialiasing for tensors on any mode. PIL + images are still antialiased on bilinear or bicubic modes, because + PIL doesn't support no antialias. + - ``None``: equivalent to ``False`` for tensors and ``True`` for + PIL images. This value exists for legacy reasons and you probably + don't want to use it unless you really know what you are doing. + + The current default is ``None`` **but will change to** ``True`` **in + v0.17** for the PIL and Tensor backends to be consistent. + """ + def __init__( self, target_size: Tuple[int, int], @@ -789,6 +1296,43 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomShortestSize(Transform): + """[BETA] Randomly resize the input. + + .. v2betastatus:: RandomShortestSize transform + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. + + Args: + min_size (int or sequence of int): Minimum spatial size. Single integer value or a sequence of integer values. + max_size (int, optional): Maximum spatial size. Default, None. + interpolation (InterpolationMode, optional): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``, + ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. + antialias (bool, optional): Whether to apply antialiasing. + It only affects **tensors** with bilinear or bicubic modes and it is + ignored otherwise: on PIL images, antialiasing is always applied on + bilinear or bicubic modes; on other modes (for PIL images and + tensors), antialiasing makes no sense and this parameter is ignored. + Possible values are: + + - ``True``: will apply antialiasing for bilinear or bicubic modes. + Other mode aren't affected. This is probably what you want to use. 
+ - ``False``: will not apply antialiasing for tensors on any mode. PIL + images are still antialiased on bilinear or bicubic modes, because + PIL doesn't support no antialias. + - ``None``: equivalent to ``False`` for tensors and ``True`` for + PIL images. This value exists for legacy reasons and you probably + don't want to use it unless you really know what you are doing. + + The current default is ``None`` **but will change to** ``True`` **in + v0.17** for the PIL and Tensor backends to be consistent. + """ + def __init__( self, min_size: Union[List[int], Tuple[int], int], @@ -820,6 +1364,54 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomResize(Transform): + """[BETA] Randomly resize the input. + + .. v2betastatus:: RandomResize transform + + This transformation can be used together with ``RandomCrop`` as data augmentations to train + models on image segmentation task. + + Output spatial size is randomly sampled from the interval ``[min_size, max_size]``: + + .. code-block:: python + + size = uniform_sample(min_size, max_size) + output_width = size + output_height = size + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. + + Args: + min_size (int): Minimum output size for random sampling + max_size (int): Maximum output size for random sampling + interpolation (InterpolationMode, optional): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``, + ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. + antialias (bool, optional): Whether to apply antialiasing. + It only affects **tensors** with bilinear or bicubic modes and it is + ignored otherwise: on PIL images, antialiasing is always applied on + bilinear or bicubic modes; on other modes (for PIL images and + tensors), antialiasing makes no sense and this parameter is ignored. + Possible values are: + + - ``True``: will apply antialiasing for bilinear or bicubic modes. + Other mode aren't affected. This is probably what you want to use. + - ``False``: will not apply antialiasing for tensors on any mode. PIL + images are still antialiased on bilinear or bicubic modes, because + PIL doesn't support no antialias. + - ``None``: equivalent to ``False`` for tensors and ``True`` for + PIL images. This value exists for legacy reasons and you probably + don't want to use it unless you really know what you are doing. + + The current default is ``None`` **but will change to** ``True`` **in + v0.17** for the PIL and Tensor backends to be consistent. + """ + def __init__( self, min_size: int, diff --git a/torchvision/transforms/v2/_meta.py b/torchvision/transforms/v2/_meta.py index 0d1544094ca..b7e2a42259f 100644 --- a/torchvision/transforms/v2/_meta.py +++ b/torchvision/transforms/v2/_meta.py @@ -9,6 +9,16 @@ class ConvertBoundingBoxFormat(Transform): + """[BETA] Convert bounding box coordinates to the given ``format``, eg from "CXCYWH" to "XYXY". + + .. 
+
+    Args:
+        format (str or datapoints.BoundingBoxFormat): Output bounding box format.
+            Possible values are defined by :class:`~torchvision.datapoints.BoundingBoxFormat` and
+            string values match the enums, e.g. "XYXY" or "XYWH" etc.
+    """
+
     _transformed_types = (datapoints.BoundingBox,)

     def __init__(self, format: Union[str, datapoints.BoundingBoxFormat]) -> None:
@@ -22,6 +32,27 @@ def _transform(self, inpt: datapoints.BoundingBox, params: Dict[str, Any]) -> da


 class ConvertDtype(Transform):
+    """[BETA] Convert input image or video to the given ``dtype`` and scale the values accordingly.
+
+    .. v2betastatus:: ConvertDtype transform
+
+    This function does not support PIL Image.
+
+    Args:
+        dtype (torch.dtype): Desired data type of the output.
+
+    .. note::
+
+        When converting from a smaller to a larger integer ``dtype``, the maximum values are **not** mapped exactly.
+        If converted back and forth, this mismatch has no effect.
+
+    Raises:
+        RuntimeError: When trying to cast :class:`torch.float32` to :class:`torch.int32` or :class:`torch.int64` as
+            well as for trying to cast :class:`torch.float64` to :class:`torch.int64`. These conversions might lead to
+            overflow errors since the floating point ``dtype`` cannot store consecutive integers over the whole range
+            of the integer ``dtype``.
+    """
+
     _v1_transform_cls = _transforms.ConvertImageDtype

     _transformed_types = (is_simple_tensor, datapoints.Image, datapoints.Video)
@@ -42,6 +73,14 @@ def _transform(


 class ClampBoundingBox(Transform):
+    """[BETA] Clamp bounding boxes to their corresponding image dimensions.
+
+    The clamping is done according to the bounding boxes' ``spatial_size`` meta-data.
+
+    .. v2betastatus:: ClampBoundingBox transform
+
+    """
+
     _transformed_types = (datapoints.BoundingBox,)

     def _transform(self, inpt: datapoints.BoundingBox, params: Dict[str, Any]) -> datapoints.BoundingBox:
diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py
index 6dd0755cfbb..c9b9025ebd9 100644
--- a/torchvision/transforms/v2/_misc.py
+++ b/torchvision/transforms/v2/_misc.py
@@ -1,7 +1,7 @@
 import collections
 import warnings
 from contextlib import suppress
-from typing import Any, Callable, cast, Dict, List, Optional, Sequence, Type, Union
+from typing import Any, Callable, cast, Dict, List, Mapping, Optional, Sequence, Type, Union

 import PIL.Image

@@ -15,12 +15,23 @@
 from .utils import has_any, is_simple_tensor, query_bounding_box


+# TODO: do we want/need to expose this?
 class Identity(Transform):
     def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
         return inpt


 class Lambda(Transform):
+    """[BETA] Apply a user-defined function as a transform.
+
+    .. v2betastatus:: Lambda transform
+
+    This transform does not support torchscript.
+
+    Args:
+        lambd (function): Lambda/function to be used for transform.
+    """
+
     def __init__(self, lambd: Callable[[Any], Any], *types: Type):
         super().__init__()
         self.lambd = lambd
@@ -42,6 +53,26 @@ def extra_repr(self) -> str:


 class LinearTransformation(Transform):
+    """[BETA] Transform a tensor image or video with a square transformation matrix and a mean_vector computed offline.
+
+    .. v2betastatus:: LinearTransformation transform
+
+    This transform does not support PIL Image.
+    Given transformation_matrix and mean_vector, will flatten the torch.*Tensor and
+    subtract mean_vector from it, which is then followed by computing the dot
+    product with the transformation matrix and then reshaping the tensor to its
+    original shape.
+
+    Applications:
+        whitening transformation: Suppose X is a column vector of zero-centered data.
+        Then compute the data covariance matrix [D x D] with torch.mm(X.t(), X),
+        perform SVD on this matrix and pass it as transformation_matrix.
+
+    Args:
+        transformation_matrix (Tensor): tensor [D x D], D = C x H x W
+        mean_vector (Tensor): tensor [D], D = C x H x W
+    """
+
     _v1_transform_cls = _transforms.LinearTransformation

     _transformed_types = (is_simple_tensor, datapoints.Image, datapoints.Video)
@@ -105,6 +136,26 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:


 class Normalize(Transform):
+    """[BETA] Normalize a tensor image or video with mean and standard deviation.
+
+    .. v2betastatus:: Normalize transform
+
+    This transform does not support PIL Image.
+    Given mean: ``(mean[1],...,mean[n])`` and std: ``(std[1],..,std[n])`` for ``n``
+    channels, this transform will normalize each channel of the input
+    ``torch.*Tensor`` i.e.,
+    ``output[channel] = (input[channel] - mean[channel]) / std[channel]``
+
+    .. note::
+        This transform acts out of place, i.e., it does not mutate the input tensor.
+
+    Args:
+        mean (sequence): Sequence of means for each channel.
+        std (sequence): Sequence of standard deviations for each channel.
+        inplace (bool, optional): Bool to make this operation in-place.
+
+    """
+
     _v1_transform_cls = _transforms.Normalize

     _transformed_types = (datapoints.Image, is_simple_tensor, datapoints.Video)
@@ -125,6 +176,21 @@ def _transform(


 class GaussianBlur(Transform):
+    """[BETA] Blurs the image with a randomly chosen Gaussian blur.
+
+    .. v2betastatus:: GaussianBlur transform
+
+    If the input is a Tensor, it is expected
+    to have [..., C, H, W] shape, where ... means an arbitrary number of leading dimensions.
+
+    Args:
+        kernel_size (int or sequence): Size of the Gaussian kernel.
+        sigma (float or tuple of float (min, max)): Standard deviation to be used for
+            creating the kernel to perform blurring. If float, sigma is fixed. If it is a tuple
+            of float (min, max), sigma is chosen uniformly at random to lie in the
+            given range.
+    """
+
     _v1_transform_cls = _transforms.GaussianBlur

     def __init__(
@@ -157,6 +223,17 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:


 class ToDtype(Transform):
+    """[BETA] Converts the input to a specific dtype - this does not scale values.
+
+    .. v2betastatus:: ToDtype transform
+
+    Args:
+        dtype (``torch.dtype`` or dict of ``Datapoint`` -> ``torch.dtype``): The dtype to convert to.
+            A dict can be passed to specify per-datapoint conversions, e.g.
+            ``dtype={datapoints.Image: torch.float32, datapoints.Video:
+            torch.float64}``.
+    """
+
     _transformed_types = (torch.Tensor,)

     def __init__(self, dtype: Union[torch.dtype, Dict[Type, Optional[torch.dtype]]]) -> None:
@@ -178,10 +255,34 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
         return inpt.to(dtype=dtype)


-class SanitizeBoundingBoxes(Transform):
-    # This removes boxes and their corresponding labels:
-    # - small or degenerate bboxes based on min_size (this includes those where X2 <= X1 or Y2 <= Y1)
-    # - boxes with any coordinate outside the range of the image (negative, or > spatial_size)
+class SanitizeBoundingBox(Transform):
+    """[BETA] Remove degenerate/invalid bounding boxes and their corresponding labels and masks.
+
+    .. v2betastatus:: SanitizeBoundingBox transform
+
+    This transform removes bounding boxes and their associated labels/masks that:
+
+    - are below a given ``min_size``: by default this also removes degenerate boxes that have e.g. X2 <= X1.
+    - have any coordinate outside of their corresponding image. You may want to
+      call :class:`~torchvision.transforms.v2.ClampBoundingBox` first to avoid undesired removals.
+
+    It is recommended to call it at the end of a pipeline, before passing the
+    input to the models. It is critical to call this transform if
+    :class:`~torchvision.transforms.v2.RandomIoUCrop` was called.
+    If you want to be extra careful, you may call it after all transforms that
+    may modify bounding boxes, but once at the end should be enough in most
+    cases.
+
+    Args:
+        min_size (float, optional): The size below which bounding boxes are removed. Default is 1.
+        labels_getter (callable or str or None, optional): Indicates how to identify the labels in the input.
+            It can be a str in which case the input is expected to be a dict, and ``labels_getter`` then specifies
+            the key whose value corresponds to the labels. It can also be a callable that takes the same input
+            as the transform, and returns the labels.
+            By default, this will try to find a "labels" key in the input, if
+            the input is a dict or a tuple whose second element is a dict.
+            This heuristic should work well with a lot of datasets, including the built-in torchvision datasets.
+    """

     def __init__(
         self,
@@ -201,7 +302,9 @@ def __init__(
         elif callable(labels_getter):
             self._labels_getter = labels_getter
         elif isinstance(labels_getter, str):
-            self._labels_getter = lambda inputs: inputs[labels_getter]
+            self._labels_getter = lambda inputs: SanitizeBoundingBox._get_dict_or_second_tuple_entry(inputs)[
+                labels_getter  # type: ignore[index]
+            ]
         elif labels_getter is None:
             self._labels_getter = None
         else:
@@ -210,10 +313,27 @@ def __init__(
                 f"Got {labels_getter} of type {type(labels_getter)}."
             )

+    @staticmethod
+    def _get_dict_or_second_tuple_entry(inputs: Any) -> Mapping[str, Any]:
+        # datasets outputs may be plain dicts like {"img": ..., "labels": ..., "bbox": ...}
+        # or tuples like (img, {"labels":..., "bbox": ...})
+        # This hacky helper accounts for both structures.
+        if isinstance(inputs, tuple):
+            inputs = inputs[1]
+
+        if not isinstance(inputs, collections.abc.Mapping):
+            raise ValueError(
+                f"If labels_getter is a str or 'default', "
+                f"then the input to forward() must be a dict or a tuple whose second element is a dict."
+                f" Got {type(inputs)} instead."
+            )
+        return inputs
+
     @staticmethod
     def _find_labels_default_heuristic(inputs: Dict[str, Any]) -> Optional[torch.Tensor]:
-        # Tries to find a "label" key, otherwise tries for the first key that contains "label" - case insensitive
+        # Tries to find a "labels" key, otherwise tries for the first key that contains "label" - case insensitive
         # Returns None if nothing is found
+        inputs = SanitizeBoundingBox._get_dict_or_second_tuple_entry(inputs)
         candidate_key = None
         with suppress(StopIteration):
             candidate_key = next(key for key in inputs.keys() if key.lower() == "labels")
@@ -230,12 +350,6 @@ def _find_labels_default_heuristic(inputs: Dict[str, Any]) -> Optional[torch.Ten
     def forward(self, *inputs: Any) -> Any:
         inputs = inputs if len(inputs) > 1 else inputs[0]

-        if isinstance(self.labels_getter, str) and not isinstance(inputs, collections.abc.Mapping):
-            raise ValueError(
-                f"If labels_getter is a str or 'default' (got {self.labels_getter}), "
-                f"then the input to forward() must be a dict. Got {type(inputs)} instead."
-            )
-
         if self._labels_getter is None:
             labels = None
         else:
diff --git a/torchvision/transforms/v2/_temporal.py b/torchvision/transforms/v2/_temporal.py
index b26d6b0450f..df4ad66643a 100644
--- a/torchvision/transforms/v2/_temporal.py
+++ b/torchvision/transforms/v2/_temporal.py
@@ -7,6 +7,19 @@


 class UniformTemporalSubsample(Transform):
+    """[BETA] Uniformly subsample ``num_samples`` indices from the temporal dimension of the video.
+
+    .. v2betastatus:: UniformTemporalSubsample transform
+
+    Videos are expected to be of shape ``[..., T, C, H, W]``, where ``T`` denotes the temporal dimension.
+
+    When ``num_samples`` is larger than the size of the temporal dimension of the video, it
+    will sample frames based on nearest neighbor interpolation.
+
+    Args:
+        num_samples (int): The number of equispaced samples to be selected.
+    """
+
     _transformed_types = (is_simple_tensor, datapoints.Video)

     def __init__(self, num_samples: int):
diff --git a/torchvision/transforms/v2/_type_conversion.py b/torchvision/transforms/v2/_type_conversion.py
index 984d5ba50c0..60f44c5d3db 100644
--- a/torchvision/transforms/v2/_type_conversion.py
+++ b/torchvision/transforms/v2/_type_conversion.py
@@ -11,6 +11,15 @@


 class PILToTensor(Transform):
+    """[BETA] Convert a PIL Image to a tensor of the same type - this does not scale values.
+
+    .. v2betastatus:: PILToTensor transform
+
+    This transform does not support torchscript.
+
+    Converts a PIL Image (H x W x C) to a Tensor of shape (C x H x W).
+    """
+
     _transformed_types = (PIL.Image.Image,)

     def _transform(self, inpt: PIL.Image.Image, params: Dict[str, Any]) -> torch.Tensor:
@@ -18,6 +27,14 @@ def _transform(self, inpt: PIL.Image.Image, params: Dict[str, Any]) -> torch.Ten


 class ToImageTensor(Transform):
+    """[BETA] Convert a tensor, ndarray, or PIL Image to :class:`~torchvision.datapoints.Image`;
+    this does not scale values.
+
+    .. v2betastatus:: ToImageTensor transform
+
+    This transform does not support torchscript.
+    """
+
     _transformed_types = (is_simple_tensor, PIL.Image.Image, np.ndarray)

     def _transform(
@@ -27,6 +44,27 @@ def _transform(


 class ToImagePIL(Transform):
+    """[BETA] Convert a tensor or an ndarray to PIL Image - this does not scale values.
+
+    .. v2betastatus:: ToImagePIL transform
+
+    This transform does not support torchscript.
+
+    Converts a torch.*Tensor of shape C x H x W or a numpy ndarray of shape
+    H x W x C to a PIL Image while preserving the value range.
+
+    Args:
+        mode (`PIL.Image mode`_): Color space and pixel depth of input data (optional).
+            If ``mode`` is ``None`` (default), there are some assumptions made about the input data:
+            - If the input has 4 channels, the ``mode`` is assumed to be ``RGBA``.
+            - If the input has 3 channels, the ``mode`` is assumed to be ``RGB``.
+            - If the input has 2 channels, the ``mode`` is assumed to be ``LA``.
+            - If the input has 1 channel, the ``mode`` is determined by the data type (i.e. ``int``, ``float``,
+              ``short``).
+
+    .. _PIL.Image mode: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#concept-modes
+    """
+
     _transformed_types = (is_simple_tensor, datapoints.Image, np.ndarray)

     def __init__(self, mode: Optional[str] = None) -> None:
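
The docstrings added above describe each v2 transform in isolation; the sketch below is not part of the patch, but it illustrates how a few of the documented transforms compose on a detection-style sample and how the ``labels_getter`` / degenerate-box behaviour of ``SanitizeBoundingBox`` plays out. It assumes the 0.15-era datapoints constructors (in particular ``datapoints.BoundingBox(..., format=..., spatial_size=...)``) and a sample layout (a dict carrying a "labels" entry) matching the default heuristic described in the docstring; treat it as an illustration rather than part of the API surface.

    # Minimal usage sketch, assuming torchvision 0.15 with the v2 transforms and datapoints API.
    import torch
    from torchvision import datapoints
    from torchvision.transforms import v2

    H, W = 256, 256
    sample = {
        "img": datapoints.Image(torch.randint(0, 256, (3, H, W), dtype=torch.uint8)),
        "boxes": datapoints.BoundingBox(
            # The second box is degenerate (X2 <= X1), so SanitizeBoundingBox should drop it
            # together with its label.
            torch.tensor([[10, 10, 100, 100], [50, 50, 40, 60]]),
            format=datapoints.BoundingBoxFormat.XYXY,
            spatial_size=(H, W),  # assumed 0.15 constructor keyword
        ),
        "labels": torch.tensor([1, 2]),
    }

    pipeline = v2.Compose(
        [
            v2.ConvertDtype(torch.float32),  # uint8 [0, 255] -> float32 [0, 1]; leaves boxes untouched
            v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
            v2.ClampBoundingBox(),  # clamp to the boxes' spatial_size first, as the docstring suggests
            v2.SanitizeBoundingBox(labels_getter="labels"),  # drop degenerate boxes and their labels
        ]
    )

    out = pipeline(sample)
    print(out["boxes"].shape, out["labels"])  # only the valid box and its label remain

Placing ``SanitizeBoundingBox`` last follows the recommendation in its docstring; the ``labels_getter="labels"`` argument is only needed here for explicitness, since the dict layout already matches the default heuristic.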