diff --git a/.github/workflows/test-linux-cpu.yml b/.github/workflows/test-linux-cpu.yml index 19521cdd011..8a9f7d33b49 100644 --- a/.github/workflows/test-linux-cpu.yml +++ b/.github/workflows/test-linux-cpu.yml @@ -39,7 +39,7 @@ jobs: fi # Create Conda Env - conda create -yp ci_env python="${PYTHON_VERSION}" numpy libpng jpeg scipy + conda create -yp ci_env --quiet python="${PYTHON_VERSION}" numpy libpng jpeg scipy 'ffmpeg<4.3' conda activate /work/ci_env # Install PyTorch, Torchvision, and testing libraries @@ -50,8 +50,8 @@ jobs: -c nvidia "pytorch-${CHANNEL}"::pytorch[build="*${VERSION}*"] \ "${CUDATOOLKIT}" python3 setup.py develop - python3 -m pip install pytest pytest-mock 'av<10' + python3 -m pip install --progress-bar=off pytest pytest-mock 'av<10' # Run Tests python3 -m torch.utils.collect_env - python3 -m pytest --junitxml=test-results/junit.xml -v --durations 20 + python3 -m pytest --junitxml=test-results/junit.xml --durations 20 diff --git a/.github/workflows/test-linux-gpu.yml b/.github/workflows/test-linux-gpu.yml index 831de27e350..d1275071bf7 100644 --- a/.github/workflows/test-linux-gpu.yml +++ b/.github/workflows/test-linux-gpu.yml @@ -43,7 +43,7 @@ jobs: fi # Create Conda Env - conda create -yp ci_env python="${PYTHON_VERSION}" numpy libpng jpeg scipy + conda create -yp ci_env --quiet python="${PYTHON_VERSION}" numpy libpng jpeg scipy 'ffmpeg<4.3' conda activate /work/ci_env # Install PyTorch, Torchvision, and testing libraries @@ -54,8 +54,8 @@ jobs: -c nvidia "pytorch-${CHANNEL}"::pytorch[build="*${VERSION}*"] \ "${CUDATOOLKIT}" python3 setup.py develop - python3 -m pip install pytest pytest-mock 'av<10' + python3 -m pip install --progress-bar=off pytest pytest-mock 'av<10' # Run Tests python3 -m torch.utils.collect_env - python3 -m pytest --junitxml=test-results/junit.xml -v --durations 20 + python3 -m pytest --junitxml=test-results/junit.xml --durations 20 diff --git a/docs/requirements.txt b/docs/requirements.txt index 09a11359ae7..2a50d9b8f45 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -5,3 +5,4 @@ sphinx-gallery>=0.11.1 sphinx==5.0.0 tabulate -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme +pycocotools diff --git a/docs/source/beta_status.py b/docs/source/beta_status.py index 925894df5c5..4a0fdc72c0f 100644 --- a/docs/source/beta_status.py +++ b/docs/source/beta_status.py @@ -4,15 +4,26 @@ class BetaStatus(Directive): has_content = True + text = "The {api_name} is in Beta stage, and backward compatibility is not guaranteed." def run(self): - api_name = " ".join(self.content) - text = f"The {api_name} is in Beta stage, and backward compatibility is not guaranteed." + text = self.text.format(api_name=" ".join(self.content)) return [nodes.warning("", nodes.paragraph("", "", nodes.Text(text)))] +class V2BetaStatus(BetaStatus): + text = ( + "The {api_name} is in Beta stage, and while we do not expect major breaking changes, " + "some APIs may still change according to user feedback. Please submit any feedback you may have " + "in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check " + "out https://github.com/pytorch/vision/issues/7319 to learn " + "more about the APIs that we suspect might involve future changes." 
+ ) + + def setup(app): app.add_directive("betastatus", BetaStatus) + app.add_directive("v2betastatus", V2BetaStatus) return { "version": "0.1", "parallel_read_safe": True, diff --git a/docs/source/conf.py b/docs/source/conf.py index 72c83d7893d..6d748f5b717 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -33,6 +33,9 @@ sys.path.append(os.path.abspath(".")) +torchvision.disable_beta_transforms_warning() +import torchvision.datapoints # Don't remove, otherwise the docs for datapoints aren't linked properly + # -- General configuration ------------------------------------------------ # Required version of sphinx is set from docs/requirements.txt @@ -60,6 +63,7 @@ "gallery_dirs": "auto_examples", # path to where to save gallery generated output "backreferences_dir": "gen_modules/backreferences", "doc_module": ("torchvision",), + "remove_config_comments": True, } napoleon_use_ivar = True diff --git a/docs/source/datapoints.rst b/docs/source/datapoints.rst new file mode 100644 index 00000000000..1cc62413e66 --- /dev/null +++ b/docs/source/datapoints.rst @@ -0,0 +1,19 @@ +Datapoints +========== + +.. currentmodule:: torchvision.datapoints + +Datapoints are tensor subclasses which the :mod:`~torchvision.transforms.v2` v2 transforms use under the hood to +dispatch their inputs to the appropriate lower-level kernels. Most users do not +need to manipulate datapoints directly and can simply rely on dataset wrapping - +see e.g. :ref:`sphx_glr_auto_examples_plot_transforms_v2_e2e.py`. + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + Image + Video + BoundingBoxFormat + BoundingBox + Mask diff --git a/docs/source/index.rst b/docs/source/index.rst index 79dbebdd047..bc38fdb0307 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -32,6 +32,7 @@ architectures, and common image transformations for computer vision. :caption: Package Reference transforms + datapoints models datasets utils diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst index 5909b68966b..0d6961bbe79 100644 --- a/docs/source/transforms.rst +++ b/docs/source/transforms.rst @@ -5,6 +5,22 @@ Transforming and augmenting images .. currentmodule:: torchvision.transforms + +.. note:: + In 0.15, we released a new set of transforms available in the + ``torchvision.transforms.v2`` namespace, which add support for transforming + not just images but also bounding boxes, masks, or videos. These transforms + are fully backward compatible with the current ones, and you'll see them + documented below with a `v2.` prefix. To get started with those new + transforms, you can check out + :ref:`sphx_glr_auto_examples_plot_transforms_v2_e2e.py`. + Note that these transforms are still BETA, and while we don't expect major + breaking changes in the future, some APIs may still change according to user + feedback. Please submit any feedback you may have `here + `_, and you can also check + out `this issue `_ to learn + more about the APIs that we suspect might involve future changes. + Transforms are common image transformations available in the ``torchvision.transforms`` module. They can be chained together using :class:`Compose`. @@ -14,11 +30,10 @@ transformations. This is useful if you have to build a more complex transformation pipeline (e.g. in the case of segmentation tasks). -Most transformations accept both `PIL `_ -images and tensor images, although some transformations are :ref:`PIL-only -` and some are :ref:`tensor-only -`. 
The :ref:`conversion_transforms` may be used to -convert to and from PIL images. +Most transformations accept both `PIL `_ images +and tensor images, although some transformations are PIL-only and some are +tensor-only. The :ref:`conversion_transforms` may be used to convert to and from +PIL images, or for converting dtypes and ranges. The transformations that accept tensor images also accept batches of tensor images. A Tensor Image is a tensor with ``(C, H, W)`` shape, where ``C`` is a @@ -70,8 +85,10 @@ The following examples illustrate the use of the available transforms: produce the same results. -Scriptable transforms ---------------------- +Transforms scriptability +------------------------ + +.. TODO: Add note about v2 scriptability (in next PR) In order to script the transformations, please use ``torch.nn.Sequential`` instead of :class:`Compose`. @@ -89,100 +106,141 @@ Make sure to use only scriptable transformations, i.e. that work with ``torch.Te For any custom transformations to be used with ``torch.jit.script``, they should be derived from ``torch.nn.Module``. -Compositions of transforms --------------------------- +Geometry +-------- .. autosummary:: :toctree: generated/ :template: class.rst - Compose + Resize + v2.Resize + v2.ScaleJitter + v2.RandomShortestSize + v2.RandomResize + RandomCrop + v2.RandomCrop + RandomResizedCrop + v2.RandomResizedCrop + v2.RandomIoUCrop + CenterCrop + v2.CenterCrop + FiveCrop + v2.FiveCrop + TenCrop + v2.TenCrop + Pad + v2.Pad + v2.RandomZoomOut + RandomRotation + v2.RandomRotation + RandomAffine + v2.RandomAffine + RandomPerspective + v2.RandomPerspective + ElasticTransform + v2.ElasticTransform + RandomHorizontalFlip + v2.RandomHorizontalFlip + RandomVerticalFlip + v2.RandomVerticalFlip -Transforms on PIL Image and torch.\*Tensor ------------------------------------------- +Color +----- .. autosummary:: :toctree: generated/ :template: class.rst - CenterCrop ColorJitter - FiveCrop + v2.ColorJitter + v2.RandomPhotometricDistort Grayscale - Pad - RandomAffine - RandomApply - RandomCrop + v2.Grayscale RandomGrayscale - RandomHorizontalFlip - RandomPerspective - RandomResizedCrop - RandomRotation - RandomVerticalFlip - Resize - TenCrop + v2.RandomGrayscale GaussianBlur + v2.GaussianBlur RandomInvert + v2.RandomInvert RandomPosterize + v2.RandomPosterize RandomSolarize + v2.RandomSolarize RandomAdjustSharpness + v2.RandomAdjustSharpness RandomAutocontrast + v2.RandomAutocontrast RandomEqualize + v2.RandomEqualize - -.. _transforms_pil_only: - -Transforms on PIL Image only ----------------------------- +Composition +----------- .. autosummary:: :toctree: generated/ :template: class.rst + Compose + v2.Compose + RandomApply + v2.RandomApply RandomChoice + v2.RandomChoice RandomOrder + v2.RandomOrder -.. _transforms_tensor_only: - -Transforms on torch.\*Tensor only ---------------------------------- +Miscellaneous +------------- .. autosummary:: :toctree: generated/ :template: class.rst LinearTransformation + v2.LinearTransformation Normalize + v2.Normalize RandomErasing - ConvertImageDtype + v2.RandomErasing + Lambda + v2.Lambda + v2.SanitizeBoundingBox + v2.ClampBoundingBox + v2.UniformTemporalSubsample .. _conversion_transforms: -Conversion Transforms ---------------------- +Conversion +---------- +.. note:: + Beware, some of these conversion transforms below will scale the values + while performing the conversion, while some may not do any scaling. By + scaling, we mean e.g. 
that a ``uint8`` -> ``float32`` would map the [0, + 255] range into [0, 1] (and vice-versa). + .. autosummary:: :toctree: generated/ :template: class.rst ToPILImage + v2.ToPILImage + v2.ToImagePIL ToTensor + v2.ToTensor PILToTensor + v2.PILToTensor + v2.ToImageTensor + ConvertImageDtype + v2.ConvertDtype + v2.ConvertImageDtype + v2.ToDtype + v2.ConvertBoundingBoxFormat - -Generic Transforms ------------------- - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - Lambda - - -Automatic Augmentation Transforms ---------------------------------- +Auto-Augmentation +----------------- `AutoAugment `_ is a common Data Augmentation technique that can improve the accuracy of Image Classification models. Though the data augmentation policies are directly linked to their trained dataset, empirical studies show that @@ -196,9 +254,13 @@ The new transform can be used standalone or mixed-and-matched with existing tran AutoAugmentPolicy AutoAugment + v2.AutoAugment RandAugment + v2.RandAugment TrivialAugmentWide + v2.TrivialAugmentWide AugMix + v2.AugMix .. _functional_transforms: @@ -207,6 +269,14 @@ Functional Transforms .. currentmodule:: torchvision.transforms.functional + +.. note:: + You'll find below the documentation for the existing + ``torchvision.transforms.functional`` namespace. The + ``torchvision.transforms.v2.functional`` namespace exists as well and can be + used! The same functionals are present, so you simply need to change your + import to rely on the ``v2`` namespace. + Functional transforms give you fine-grained control of the transformation pipeline. As opposed to the transformations above, functional transforms don't contain a random number generator for their parameters. diff --git a/gallery/assets/coco/images/000000000001.jpg b/gallery/assets/coco/images/000000000001.jpg new file mode 120000 index 00000000000..9be80c7c273 --- /dev/null +++ b/gallery/assets/coco/images/000000000001.jpg @@ -0,0 +1 @@ +../../astronaut.jpg \ No newline at end of file diff --git a/gallery/assets/coco/images/000000000002.jpg b/gallery/assets/coco/images/000000000002.jpg new file mode 120000 index 00000000000..9f8efef9928 --- /dev/null +++ b/gallery/assets/coco/images/000000000002.jpg @@ -0,0 +1 @@ +../../dog2.jpg \ No newline at end of file diff --git a/gallery/assets/coco/instances.json b/gallery/assets/coco/instances.json new file mode 100644 index 00000000000..fe0e09270bf --- /dev/null +++ b/gallery/assets/coco/instances.json @@ -0,0 +1 @@ +{"images": [{"file_name": "000000000001.jpg", "height": 512, "width": 512, "id": 1}, {"file_name": "000000000002.jpg", "height": 500, "width": 500, "id": 2}], "annotations": [{"segmentation": [[40.0, 511.0, 26.0, 487.0, 28.0, 438.0, 17.0, 397.0, 24.0, 346.0, 38.0, 306.0, 61.0, 250.0, 111.0, 206.0, 111.0, 187.0, 120.0, 183.0, 136.0, 159.0, 159.0, 150.0, 181.0, 148.0, 182.0, 132.0, 175.0, 132.0, 168.0, 120.0, 154.0, 102.0, 153.0, 62.0, 188.0, 35.0, 191.0, 29.0, 208.0, 20.0, 210.0, 22.0, 227.0, 16.0, 240.0, 16.0, 276.0, 31.0, 285.0, 39.0, 301.0, 88.0, 297.0, 108.0, 281.0, 128.0, 273.0, 138.0, 266.0, 138.0, 264.0, 153.0, 257.0, 162.0, 256.0, 174.0, 284.0, 197.0, 300.0, 221.0, 303.0, 236.0, 337.0, 258.0, 357.0, 306.0, 361.0, 351.0, 358.0, 511.0]], "iscrowd": 0, "image_id": 1, "bbox": [17.0, 16.0, 344.0, 495.0], "category_id": 1, "id": 1}, {"segmentation": [[0.0, 411.0, 43.0, 401.0, 99.0, 395.0, 105.0, 351.0, 124.0, 326.0, 181.0, 294.0, 227.0, 280.0, 245.0, 262.0, 259.0, 234.0, 262.0, 207.0, 271.0, 140.0, 283.0, 139.0, 301.0, 162.0, 309.0, 181.0, 
341.0, 175.0, 362.0, 139.0, 369.0, 139.0, 377.0, 163.0, 378.0, 203.0, 381.0, 212.0, 380.0, 220.0, 382.0, 242.0, 404.0, 264.0, 392.0, 293.0, 384.0, 295.0, 385.0, 316.0, 399.0, 343.0, 391.0, 448.0, 452.0, 475.0, 457.0, 494.0, 436.0, 498.0, 402.0, 491.0, 369.0, 488.0, 366.0, 496.0, 319.0, 496.0, 302.0, 485.0, 226.0, 469.0, 128.0, 456.0, 74.0, 458.0, 29.0, 439.0, 0.0, 445.0]], "iscrowd": 0, "image_id": 2, "bbox": [0.0, 139.0, 457.0, 359.0], "category_id": 18, "id": 2}]} diff --git a/gallery/plot_datapoints.py b/gallery/plot_datapoints.py new file mode 100644 index 00000000000..83ca6793598 --- /dev/null +++ b/gallery/plot_datapoints.py @@ -0,0 +1,132 @@ +""" +============== +Datapoints FAQ +============== + +The :mod:`torchvision.datapoints` namespace was introduced together with ``torchvision.transforms.v2``. This example +showcases what these datapoints are and how they behave. This is a fairly low-level topic that most users will not need +to worry about: you do not need to understand the internals of datapoints to efficiently rely on +``torchvision.transforms.v2``. It may however be useful for advanced users trying to implement their own datasets, +transforms, or work directly with the datapoints. +""" + +import PIL.Image + +import torch +import torchvision + +# We are using BETA APIs, so we deactivate the associated warning, thereby acknowledging that +# some APIs may slightly change in the future +torchvision.disable_beta_transforms_warning() + +from torchvision import datapoints + + +######################################################################################################################## +# What are datapoints? +# -------------------- +# +# Datapoints are zero-copy tensor subclasses: + +tensor = torch.rand(3, 256, 256) +image = datapoints.Image(tensor) + +assert isinstance(image, torch.Tensor) +assert image.data_ptr() == tensor.data_ptr() + + +######################################################################################################################## +# Under the hood, they are needed in :mod:`torchvision.transforms.v2` to correctly dispatch to the appropriate function +# for the input data. +# +# What datapoints are supported? +# ------------------------------ +# +# So far :mod:`torchvision.datapoints` supports four types of datapoints: +# +# * :class:`~torchvision.datapoints.Image` +# * :class:`~torchvision.datapoints.Video` +# * :class:`~torchvision.datapoints.BoundingBox` +# * :class:`~torchvision.datapoints.Mask` +# +# How do I construct a datapoint? +# ------------------------------- +# +# Each datapoint class takes any tensor-like data that can be turned into a :class:`~torch.Tensor` + +image = datapoints.Image([[[[0, 1], [1, 0]]]]) +print(image) + + +######################################################################################################################## +# Similar to other PyTorch creations ops, the constructor also takes the ``dtype``, ``device``, and ``requires_grad`` +# parameters. 
+ +float_image = datapoints.Image([[[0, 1], [1, 0]]], dtype=torch.float32, requires_grad=True) +print(float_image) + + +######################################################################################################################## +# In addition, :class:`~torchvision.datapoints.Image` and :class:`~torchvision.datapoints.Mask` also take a +# :class:`PIL.Image.Image` directly: + +image = datapoints.Image(PIL.Image.open("assets/astronaut.jpg")) +print(image.shape, image.dtype) + +######################################################################################################################## +# In general, the datapoints can also store additional metadata that complements the underlying tensor. For example, +# :class:`~torchvision.datapoints.BoundingBox` stores the coordinate format as well as the spatial size of the +# corresponding image alongside the actual values: + +bounding_box = datapoints.BoundingBox( + [17, 16, 344, 495], format=datapoints.BoundingBoxFormat.XYXY, spatial_size=image.shape[-2:] +) +print(bounding_box) + + +######################################################################################################################## +# Do I have to wrap the output of the datasets myself? +# ---------------------------------------------------- +# +# Only if you are using custom datasets. For the built-in ones, you can use +# :func:`torchvision.datasets.wrap_dataset_for_transforms_v2`. Note that the function also supports subclasses of the +# built-in datasets. Meaning, if your custom dataset subclasses from a built-in one and the output type is the same, you +# also don't have to wrap manually. +# +# How do the datapoints behave inside a computation? +# -------------------------------------------------- +# +# Datapoints look and feel just like regular tensors. Everything that is supported on a plain :class:`torch.Tensor` +# also works on datapoints. +# Since for most operations involving datapoints, it cannot be safely inferred whether the result should retain the +# datapoint type, we choose to return a plain tensor instead of a datapoint (this might change, see note below): + +assert isinstance(image, datapoints.Image) + +new_image = image + 0 + +assert isinstance(new_image, torch.Tensor) and not isinstance(new_image, datapoints.Image) + +######################################################################################################################## +# .. note:: +# +# This "unwrapping" behaviour is something we're actively seeking feedback on. If you find this surprising or if you +# have any suggestions on how to better support your use-cases, please reach out to us via this issue: +# https://github.com/pytorch/vision/issues/7319 +# +# There are two exceptions to this rule: +# +# 1. The operations :meth:`~torch.Tensor.clone`, :meth:`~torch.Tensor.to`, and :meth:`~torch.Tensor.requires_grad_` +# retain the datapoint type. +# 2. Inplace operations on datapoints cannot change the type of the datapoint they are called on. 
However, if you use +# the flow style, the returned value will be unwrapped: + +image = datapoints.Image([[[0, 1], [1, 0]]]) + +new_image = image.add_(1).mul_(2) + +assert isinstance(image, torch.Tensor) +print(image) + +assert isinstance(new_image, torch.Tensor) and not isinstance(new_image, datapoints.Image) +assert (new_image == image).all() diff --git a/gallery/plot_transforms_v2.py b/gallery/plot_transforms_v2.py new file mode 100644 index 00000000000..d1096bec1e7 --- /dev/null +++ b/gallery/plot_transforms_v2.py @@ -0,0 +1,109 @@ +""" +================================== +Getting started with transforms v2 +================================== + +Most computer vision tasks are not supported out of the box by ``torchvision.transforms`` v1, since it only supports +images. ``torchvision.transforms.v2`` enables jointly transforming images, videos, bounding boxes, and masks. This +example showcases the core functionality of the new ``torchvision.transforms.v2`` API. +""" + +import pathlib + +import torch +import torchvision + + +def load_data(): + from torchvision.io import read_image + from torchvision import datapoints + from torchvision.ops import masks_to_boxes + + assets_directory = pathlib.Path("assets") + + path = assets_directory / "FudanPed00054.png" + image = datapoints.Image(read_image(str(path))) + merged_masks = read_image(str(assets_directory / "FudanPed00054_mask.png")) + + labels = torch.unique(merged_masks)[1:] + + masks = datapoints.Mask(merged_masks == labels.view(-1, 1, 1)) + + bounding_boxes = datapoints.BoundingBox( + masks_to_boxes(masks), format=datapoints.BoundingBoxFormat.XYXY, spatial_size=image.shape[-2:] + ) + + return path, image, bounding_boxes, masks, labels + + +######################################################################################################################## +# The :mod:`torchvision.transforms.v2` API supports images, videos, bounding boxes, and instance and segmentation +# masks. Thus, it offers native support for many Computer Vision tasks, like image and video classification, object +# detection or instance and semantic segmentation. Still, the interface is the same, making +# :mod:`torchvision.transforms.v2` a drop-in replacement for the existing :mod:`torchvision.transforms` API, aka v1. + +# We are using BETA APIs, so we deactivate the associated warning, thereby acknowledging that +# some APIs may slightly change in the future +torchvision.disable_beta_transforms_warning() +import torchvision.transforms.v2 as transforms + +transform = transforms.Compose( + [ + transforms.ColorJitter(contrast=0.5), + transforms.RandomRotation(30), + transforms.CenterCrop(480), + ] +) + +######################################################################################################################## +# :mod:`torchvision.transforms.v2` natively supports jointly transforming multiple inputs while making sure that +# potential random behavior is consistent across all inputs. However, it doesn't enforce a specific input structure or +# order. 
+
+path, image, bounding_boxes, masks, labels = load_data()
+
+torch.manual_seed(0)
+new_image = transform(image)  # Image Classification
+new_image, new_bounding_boxes, new_labels = transform(image, bounding_boxes, labels)  # Object Detection
+new_image, new_bounding_boxes, new_masks, new_labels = transform(
+    image, bounding_boxes, masks, labels
+)  # Instance Segmentation
+new_image, new_target = transform((image, {"boxes": bounding_boxes, "labels": labels}))  # Arbitrary Structure
+
+########################################################################################################################
+# Under the hood, :mod:`torchvision.transforms.v2` relies on :mod:`torchvision.datapoints` for the dispatch to the
+# appropriate function for the input data: :ref:`sphx_glr_auto_examples_plot_datapoints.py`. Note, however, that as a
+# regular user, you likely don't have to touch this yourself. See
+# :ref:`sphx_glr_auto_examples_plot_transforms_v2_e2e.py`.
+#
+# All "foreign" types like :class:`str`'s or :class:`pathlib.Path`'s are passed through, allowing you to store extra
+# information directly with the sample:
+
+sample = {"path": path, "image": image}
+new_sample = transform(sample)
+
+assert new_sample["path"] is sample["path"]
+
+########################################################################################################################
+# As stated above, :mod:`torchvision.transforms.v2` is a drop-in replacement for :mod:`torchvision.transforms` and thus
+# also supports transforming plain :class:`torch.Tensor`'s as image or video if applicable. This is achieved with a
+# simple heuristic:
+#
+# * If we find an explicit image or video (:class:`torchvision.datapoints.Image`, :class:`torchvision.datapoints.Video`,
+#   or :class:`PIL.Image.Image`) in the input, all other plain tensors are passed through.
+# * If there is no explicit image or video, only the first plain :class:`torch.Tensor` will be transformed as image or
+#   video, while all others will be passed through.
+
+plain_tensor_image = torch.rand(image.shape)
+
+print(image.shape, plain_tensor_image.shape)
+
+# passing a plain tensor together with an explicit image will not transform the former
+plain_tensor_image, image = transform(plain_tensor_image, image)
+
+print(image.shape, plain_tensor_image.shape)
+
+# passing a plain tensor without an explicit image will transform the former
+plain_tensor_image, _ = transform(plain_tensor_image, bounding_boxes)
+
+print(image.shape, plain_tensor_image.shape)
diff --git a/gallery/plot_transforms_v2_e2e.py b/gallery/plot_transforms_v2_e2e.py
new file mode 100644
index 00000000000..aa25d214f31
--- /dev/null
+++ b/gallery/plot_transforms_v2_e2e.py
@@ -0,0 +1,152 @@
+"""
+==================================================
+Transforms v2: End-to-end object detection example
+==================================================
+
+Object detection is not supported out of the box by ``torchvision.transforms`` v1, since it only supports images.
+``torchvision.transforms.v2`` enables jointly transforming images, videos, bounding boxes, and masks. This example
+showcases end-to-end object detection training using the stable ``torchvision.datasets`` and ``torchvision.models`` as
+well as the new ``torchvision.transforms.v2`` API.
+""" + +import pathlib +from collections import defaultdict + +import PIL.Image + +import torch +import torch.utils.data + +import torchvision + + +def show(sample): + import matplotlib.pyplot as plt + + from torchvision.transforms.v2 import functional as F + from torchvision.utils import draw_bounding_boxes + + image, target = sample + if isinstance(image, PIL.Image.Image): + image = F.to_image_tensor(image) + image = F.convert_dtype(image, torch.uint8) + annotated_image = draw_bounding_boxes(image, target["boxes"], colors="yellow", width=3) + + fig, ax = plt.subplots() + ax.imshow(annotated_image.permute(1, 2, 0).numpy()) + ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) + fig.tight_layout() + + fig.show() + + +# We are using BETA APIs, so we deactivate the associated warning, thereby acknowledging that +# some APIs may slightly change in the future +torchvision.disable_beta_transforms_warning() + +from torchvision import models, datasets +import torchvision.transforms.v2 as transforms + + +######################################################################################################################## +# We start off by loading the :class:`~torchvision.datasets.CocoDetection` dataset to have a look at what it currently +# returns, and we'll see how to convert it to a format that is compatible with our new transforms. + + +def load_example_coco_detection_dataset(**kwargs): + # This loads fake data for illustration purposes of this example. In practice, you'll have + # to replace this with the proper data + root = pathlib.Path("assets") / "coco" + return datasets.CocoDetection(str(root / "images"), str(root / "instances.json"), **kwargs) + + +dataset = load_example_coco_detection_dataset() + +sample = dataset[0] +image, target = sample +print(type(image)) +print(type(target), type(target[0]), list(target[0].keys())) + + +######################################################################################################################## +# The dataset returns a two-tuple with the first item being a :class:`PIL.Image.Image` and second one a list of +# dictionaries, which each containing the annotations for a single object instance. As is, this format is not compatible +# with the ``torchvision.transforms.v2``, nor with the models. To overcome that, we provide the +# :func:`~torchvision.datasets.wrap_dataset_for_transforms_v2` function. For +# :class:`~torchvision.datasets.CocoDetection`, this changes the target structure to a single dictionary of lists. It +# also adds the key-value-pairs ``"boxes"``, ``"masks"``, and ``"labels"`` wrapped in the corresponding +# ``torchvision.datapoints``. + +dataset = datasets.wrap_dataset_for_transforms_v2(dataset) + +sample = dataset[0] +image, target = sample +print(type(image)) +print(type(target), list(target.keys())) +print(type(target["boxes"]), type(target["masks"]), type(target["labels"])) + +######################################################################################################################## +# As baseline, let's have a look at a sample without transformations: + +show(sample) + + +######################################################################################################################## +# With the dataset properly set up, we can now define the augmentation pipeline. This is done the same way it is done in +# ``torchvision.transforms`` v1, but now handles bounding boxes and masks without any extra configuration. 
+
+transform = transforms.Compose(
+    [
+        transforms.RandomPhotometricDistort(),
+        transforms.RandomZoomOut(
+            fill=defaultdict(lambda: 0, {PIL.Image.Image: (123, 117, 104)})
+        ),
+        transforms.RandomIoUCrop(),
+        transforms.RandomHorizontalFlip(),
+        transforms.ToImageTensor(),
+        transforms.ConvertImageDtype(torch.float32),
+        transforms.SanitizeBoundingBox(),
+    ]
+)
+
+########################################################################################################################
+# .. note::
+#    Although the :class:`~torchvision.transforms.v2.SanitizeBoundingBox` transform is a no-op in this example, it
+#    should be placed at least once at the end of a detection pipeline to remove degenerate bounding boxes as well as
+#    the corresponding labels and optionally masks. It is particularly critical to add it if
+#    :class:`~torchvision.transforms.v2.RandomIoUCrop` was used.
+#
+# Let's look at how the sample looks with our augmentation pipeline in place:
+
+dataset = load_example_coco_detection_dataset(transforms=transform)
+dataset = datasets.wrap_dataset_for_transforms_v2(dataset)
+
+torch.manual_seed(3141)
+sample = dataset[0]
+
+# sphinx_gallery_thumbnail_number = 2
+show(sample)
+
+
+########################################################################################################################
+# We can see that the color of the image was distorted, that we zoomed out on it (off center), and that it was flipped
+# horizontally. In all of this, the bounding box was transformed accordingly. And without any further ado, we can start
+# training.
+
+data_loader = torch.utils.data.DataLoader(
+    dataset,
+    batch_size=2,
+    # We need a custom collation function here, since the object detection models expect a
+    # sequence of images and target dictionaries. The default collation function tries to
+    # `torch.stack` the individual elements, which fails in general for object detection,
+    # because the number of object instances varies between the samples.
This is the same for + # `torchvision.transforms` v1 + collate_fn=lambda batch: tuple(zip(*batch)), +) + +model = models.get_model("ssd300_vgg16", weights=None, weights_backbone=None).train() + +for images, targets in data_loader: + loss_dict = model(images, targets) + print(loss_dict) + # Put your training logic here + break diff --git a/test/test_datapoints.py b/test/test_datapoints.py index 5b875a6ef20..39c05123333 100644 --- a/test/test_datapoints.py +++ b/test/test_datapoints.py @@ -28,5 +28,5 @@ def test_bbox_instance(data, format): assert isinstance(bboxes, torch.Tensor) assert bboxes.ndim == 2 and bboxes.shape[1] == 4 if isinstance(format, str): - format = datapoints.BoundingBoxFormat.from_str(format.upper()) + format = datapoints.BoundingBoxFormat[(format.upper())] assert bboxes.format == format diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index 2e43c86f91d..f5ca976963a 100644 --- a/test/test_transforms_v2.py +++ b/test/test_transforms_v2.py @@ -136,14 +136,14 @@ class TestSmoke: (transforms.RandomCrop([16, 16], pad_if_needed=True), None), (transforms.RandomHorizontalFlip(p=1.0), None), (transforms.RandomPerspective(p=1.0), None), - (transforms.RandomResize(min_size=10, max_size=20), None), - (transforms.RandomResizedCrop([16, 16]), None), + (transforms.RandomResize(min_size=10, max_size=20, antialias=True), None), + (transforms.RandomResizedCrop([16, 16], antialias=True), None), (transforms.RandomRotation(degrees=30), None), - (transforms.RandomShortestSize(min_size=10), None), + (transforms.RandomShortestSize(min_size=10, antialias=True), None), (transforms.RandomVerticalFlip(p=1.0), None), (transforms.RandomZoomOut(p=1.0), None), (transforms.Resize([16, 16], antialias=True), None), - (transforms.ScaleJitter((16, 16), scale_range=(0.8, 1.2)), None), + (transforms.ScaleJitter((16, 16), scale_range=(0.8, 1.2), antialias=True), None), (transforms.ClampBoundingBox(), None), (transforms.ConvertBoundingBoxFormat(datapoints.BoundingBoxFormat.CXCYWH), None), (transforms.ConvertDtype(), None), @@ -275,7 +275,7 @@ def test_common(self, transform, adapter, container_type, image_or_video, device boxes=datapoints.BoundingBox([[0, 0, 0, 0]], format=format, spatial_size=(224, 244)), labels=torch.tensor([3]), ) - assert transforms.SanitizeBoundingBoxes()(sample)["boxes"].shape == (0, 4) + assert transforms.SanitizeBoundingBox()(sample)["boxes"].shape == (0, 4) @parametrize( [ @@ -1359,11 +1359,8 @@ def test_ctor(self, transform_cls, trfms): class TestRandomChoice: def test_assertions(self): - with pytest.warns(UserWarning, match="Argument p is deprecated and will be removed"): - transforms.RandomChoice([transforms.Pad(2), transforms.RandomCrop(28)], p=[1, 2]) - - with pytest.raises(ValueError, match="The number of probabilities doesn't match the number of transforms"): - transforms.RandomChoice([transforms.Pad(2), transforms.RandomCrop(28)], probabilities=[1]) + with pytest.raises(ValueError, match="Length of p doesn't match the number of transforms"): + transforms.RandomChoice([transforms.Pad(2), transforms.RandomCrop(28)], p=[1]) class TestRandomIoUCrop: @@ -1517,7 +1514,7 @@ class TestRandomShortestSize: def test__get_params(self, min_size, max_size, mocker): spatial_size = (3, 10) - transform = transforms.RandomShortestSize(min_size=min_size, max_size=max_size) + transform = transforms.RandomShortestSize(min_size=min_size, max_size=max_size, antialias=True) sample = mocker.MagicMock(spec=datapoints.Image, num_channels=3, spatial_size=spatial_size) params = 
transform._get_params([sample]) @@ -1598,7 +1595,7 @@ def test__get_params(self): min_size = 3 max_size = 6 - transform = transforms.RandomResize(min_size=min_size, max_size=max_size) + transform = transforms.RandomResize(min_size=min_size, max_size=max_size, antialias=True) for _ in range(10): params = transform._get_params([]) @@ -1794,15 +1791,21 @@ def test_classif_preset(image_type, label_type, dataset_return_type, to_tensor): else: sample = image, label + if to_tensor is transforms.ToTensor: + with pytest.warns(UserWarning, match="deprecated and will be removed"): + to_tensor = to_tensor() + else: + to_tensor = to_tensor() + t = transforms.Compose( [ - transforms.RandomResizedCrop((224, 224)), + transforms.RandomResizedCrop((224, 224), antialias=True), transforms.RandomHorizontalFlip(p=1), transforms.RandAugment(), transforms.TrivialAugmentWide(), transforms.AugMix(), transforms.AutoAugment(), - to_tensor(), + to_tensor, # TODO: ConvertImageDtype is a pass-through on PIL images, is that # intended? This results in a failure if we convert to tensor after # it, because the image would still be uint8 which make Normalize @@ -1833,10 +1836,17 @@ def test_classif_preset(image_type, label_type, dataset_return_type, to_tensor): @pytest.mark.parametrize("sanitize", (True, False)) def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): torch.manual_seed(0) + + if to_tensor is transforms.ToTensor: + with pytest.warns(UserWarning, match="deprecated and will be removed"): + to_tensor = to_tensor() + else: + to_tensor = to_tensor() + if data_augmentation == "hflip": t = [ transforms.RandomHorizontalFlip(p=1), - to_tensor(), + to_tensor, transforms.ConvertImageDtype(torch.float), ] elif data_augmentation == "lsj": @@ -1850,7 +1860,7 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): # ), transforms.RandomCrop((1024, 1024), pad_if_needed=True), transforms.RandomHorizontalFlip(p=1), - to_tensor(), + to_tensor, transforms.ConvertImageDtype(torch.float), ] elif data_augmentation == "multiscale": @@ -1859,7 +1869,7 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): min_size=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), max_size=1333, antialias=True ), transforms.RandomHorizontalFlip(p=1), - to_tensor(), + to_tensor, transforms.ConvertImageDtype(torch.float), ] elif data_augmentation == "ssd": @@ -1868,18 +1878,18 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): transforms.RandomZoomOut(fill=defaultdict(lambda: (123.0, 117.0, 104.0), {datapoints.Mask: 0})), transforms.RandomIoUCrop(), transforms.RandomHorizontalFlip(p=1), - to_tensor(), + to_tensor, transforms.ConvertImageDtype(torch.float), ] elif data_augmentation == "ssdlite": t = [ transforms.RandomIoUCrop(), transforms.RandomHorizontalFlip(p=1), - to_tensor(), + to_tensor, transforms.ConvertImageDtype(torch.float), ] if sanitize: - t += [transforms.SanitizeBoundingBoxes()] + t += [transforms.SanitizeBoundingBox()] t = transforms.Compose(t) num_boxes = 5 @@ -1910,7 +1920,7 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): out = t(sample) - if to_tensor is transforms.ToTensor and image_type is not datapoints.Image: + if isinstance(to_tensor, transforms.ToTensor) and image_type is not datapoints.Image: assert is_simple_tensor(out["image"]) else: assert isinstance(out["image"], datapoints.Image) @@ -1920,7 +1930,7 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): 
# ssd and ssdlite contain RandomIoUCrop which may "remove" some bbox. It # doesn't remove them strictly speaking, it just marks some boxes as # degenerate and those boxes will be later removed by - # SanitizeBoundingBoxes(), which we add to the pipelines if the sanitize + # SanitizeBoundingBox(), which we add to the pipelines if the sanitize # param is True. # Note that the values below are probably specific to the random seed # set above (which is fine). @@ -1935,7 +1945,14 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): @pytest.mark.parametrize( "labels_getter", ("default", "labels", lambda inputs: inputs["labels"], None, lambda inputs: None) ) -def test_sanitize_bounding_boxes(min_size, labels_getter): +@pytest.mark.parametrize("sample_type", (tuple, dict)) +def test_sanitize_bounding_boxes(min_size, labels_getter, sample_type): + + if sample_type is tuple and not isinstance(labels_getter, str): + # The "lambda inputs: inputs["labels"]" labels_getter used in this test + # doesn't work if the input is a tuple. + return + H, W = 256, 128 boxes_and_validity = [ @@ -1970,41 +1987,62 @@ def test_sanitize_bounding_boxes(min_size, labels_getter): ) masks = datapoints.Mask(torch.randint(0, 2, size=(boxes.shape[0], H, W))) - + whatever = torch.rand(10) + input_img = torch.randint(0, 256, size=(1, 3, H, W), dtype=torch.uint8) sample = { - "image": torch.randint(0, 256, size=(1, 3, H, W), dtype=torch.uint8), + "image": input_img, "labels": labels, "boxes": boxes, - "whatever": torch.rand(10), + "whatever": whatever, "None": None, "masks": masks, } - out = transforms.SanitizeBoundingBoxes(min_size=min_size, labels_getter=labels_getter)(sample) + if sample_type is tuple: + img = sample.pop("image") + sample = (img, sample) + + out = transforms.SanitizeBoundingBox(min_size=min_size, labels_getter=labels_getter)(sample) + + if sample_type is tuple: + out_image = out[0] + out_labels = out[1]["labels"] + out_boxes = out[1]["boxes"] + out_masks = out[1]["masks"] + out_whatever = out[1]["whatever"] + else: + out_image = out["image"] + out_labels = out["labels"] + out_boxes = out["boxes"] + out_masks = out["masks"] + out_whatever = out["whatever"] - assert out["image"] is sample["image"] - assert out["whatever"] is sample["whatever"] + assert out_image is input_img + assert out_whatever is whatever if labels_getter is None or (callable(labels_getter) and labels_getter({"labels": "blah"}) is None): - assert out["labels"] is sample["labels"] + assert out_labels is labels else: - assert isinstance(out["labels"], torch.Tensor) - assert out["boxes"].shape[0] == out["labels"].shape[0] == out["masks"].shape[0] + assert isinstance(out_labels, torch.Tensor) + assert out_boxes.shape[0] == out_labels.shape[0] == out_masks.shape[0] # This works because we conveniently set labels to arange(num_boxes) - assert out["labels"].tolist() == valid_indices + assert out_labels.tolist() == valid_indices @pytest.mark.parametrize("key", ("labels", "LABELS", "LaBeL", "SOME_WEIRD_KEY_THAT_HAS_LABeL_IN_IT")) -def test_sanitize_bounding_boxes_default_heuristic(key): +@pytest.mark.parametrize("sample_type", (tuple, dict)) +def test_sanitize_bounding_boxes_default_heuristic(key, sample_type): labels = torch.arange(10) - d = {key: labels} - assert transforms.SanitizeBoundingBoxes._find_labels_default_heuristic(d) is labels + sample = {key: labels, "another_key": "whatever"} + if sample_type is tuple: + sample = (None, sample, "whatever_again") + assert 
transforms.SanitizeBoundingBox._find_labels_default_heuristic(sample) is labels if key.lower() != "labels": # If "labels" is in the dict (case-insensitive), # it takes precedence over other keys which would otherwise be a match d = {key: "something_else", "labels": labels} - assert transforms.SanitizeBoundingBoxes._find_labels_default_heuristic(d) is labels + assert transforms.SanitizeBoundingBox._find_labels_default_heuristic(d) is labels def test_sanitize_bounding_boxes_errors(): @@ -2016,25 +2054,25 @@ def test_sanitize_bounding_boxes_errors(): ) with pytest.raises(ValueError, match="min_size must be >= 1"): - transforms.SanitizeBoundingBoxes(min_size=0) + transforms.SanitizeBoundingBox(min_size=0) with pytest.raises(ValueError, match="labels_getter should either be a str"): - transforms.SanitizeBoundingBoxes(labels_getter=12) + transforms.SanitizeBoundingBox(labels_getter=12) with pytest.raises(ValueError, match="Could not infer where the labels are"): bad_labels_key = {"bbox": good_bbox, "BAD_KEY": torch.arange(good_bbox.shape[0])} - transforms.SanitizeBoundingBoxes()(bad_labels_key) + transforms.SanitizeBoundingBox()(bad_labels_key) with pytest.raises(ValueError, match="If labels_getter is a str or 'default'"): not_a_dict = (good_bbox, torch.arange(good_bbox.shape[0])) - transforms.SanitizeBoundingBoxes()(not_a_dict) + transforms.SanitizeBoundingBox()(not_a_dict) with pytest.raises(ValueError, match="must be a tensor"): not_a_tensor = {"bbox": good_bbox, "labels": torch.arange(good_bbox.shape[0]).tolist()} - transforms.SanitizeBoundingBoxes()(not_a_tensor) + transforms.SanitizeBoundingBox()(not_a_tensor) with pytest.raises(ValueError, match="Number of boxes"): different_sizes = {"bbox": good_bbox, "labels": torch.arange(good_bbox.shape[0] + 3)} - transforms.SanitizeBoundingBoxes()(different_sizes) + transforms.SanitizeBoundingBox()(different_sizes) with pytest.raises(ValueError, match="boxes must be of shape"): bad_bbox = datapoints.BoundingBox( # batch with 2 elements @@ -2046,7 +2084,7 @@ def test_sanitize_bounding_boxes_errors(): spatial_size=(20, 20), ) different_sizes = {"bbox": bad_bbox, "labels": torch.arange(bad_bbox.shape[0])} - transforms.SanitizeBoundingBoxes()(different_sizes) + transforms.SanitizeBoundingBox()(different_sizes) @pytest.mark.parametrize( diff --git a/test/test_transforms_v2_consistency.py b/test/test_transforms_v2_consistency.py index 125d7ec7a3f..a8a87cd43dd 100644 --- a/test/test_transforms_v2_consistency.py +++ b/test/test_transforms_v2_consistency.py @@ -540,9 +540,12 @@ def test_signature_consistency(config): f"not. Please add a default value." 
) - legacy_kinds = {name: param.kind for name, param in legacy_params.items()} - prototype_kinds = {name: prototype_params[name].kind for name in legacy_kinds.keys()} - assert prototype_kinds == legacy_kinds + legacy_signature = list(legacy_params.keys()) + # Since we made sure that we don't have any extra parameters without default above, we clamp the prototype signature + # to the same number of parameters as the legacy one + prototype_signature = list(prototype_params.keys())[: len(legacy_signature)] + + assert prototype_signature == legacy_signature def check_call_consistency( @@ -819,7 +822,7 @@ def test_random_choice(self, probabilities): v2_transforms.Resize(256), legacy_transforms.CenterCrop(224), ], - probabilities=probabilities, + p=probabilities, ) legacy_transform = legacy_transforms.RandomChoice( [ @@ -1096,7 +1099,7 @@ def make_label(extra_dims, categories): v2_transforms.Compose( [ v2_transforms.RandomIoUCrop(), - v2_transforms.SanitizeBoundingBoxes(labels_getter=lambda sample: sample[1]["labels"]), + v2_transforms.SanitizeBoundingBox(labels_getter=lambda sample: sample[1]["labels"]), ] ), {"with_mask": False}, diff --git a/torchvision/__init__.py b/torchvision/__init__.py index f29da9cf644..eed24091a52 100644 --- a/torchvision/__init__.py +++ b/torchvision/__init__.py @@ -100,10 +100,11 @@ def _is_tracing(): _WARN_ABOUT_BETA_TRANSFORMS = True _BETA_TRANSFORMS_WARNING = ( "The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. " - "While we will try our best to maintain backward compatibility, " - "some APIs or behaviors might change without a deprecation cycle. " - "To help us improve these new features, please provide your feedback " - "here: https://github.com/pytorch/vision/issues/6753." + "While we do not expect major breaking changes, some APIs may still change " + "according to user feedback. Please submit any feedback you may have in " + "this issue: https://github.com/pytorch/vision/issues/6753, and you can also " + "check out https://github.com/pytorch/vision/issues/7319 to learn more about " + "the APIs that we suspect might involve future changes. " "You can silence this warning by calling torchvision.disable_beta_transform_warning()." ) diff --git a/torchvision/datapoints/_bounding_box.py b/torchvision/datapoints/_bounding_box.py index 1dc46f8f21a..11d42f171e4 100644 --- a/torchvision/datapoints/_bounding_box.py +++ b/torchvision/datapoints/_bounding_box.py @@ -1,21 +1,44 @@ from __future__ import annotations +from enum import Enum from typing import Any, List, Optional, Sequence, Tuple, Union import torch -from torchvision._utils import StrEnum from torchvision.transforms import InterpolationMode # TODO: this needs to be moved out of transforms from ._datapoint import _FillTypeJIT, Datapoint -class BoundingBoxFormat(StrEnum): - XYXY = StrEnum.auto() - XYWH = StrEnum.auto() - CXCYWH = StrEnum.auto() +class BoundingBoxFormat(Enum): + """[BETA] Coordinate format of a bounding box. + + Available formats are + + * ``XYXY`` + * ``XYWH`` + * ``CXCYWH`` + """ + + XYXY = "XYXY" + XYWH = "XYWH" + CXCYWH = "CXCYWH" class BoundingBox(Datapoint): + """[BETA] :class:`torch.Tensor` subclass for bounding boxes. + + Args: + data: Any data that can be turned into a tensor with :func:`torch.as_tensor`. + format (BoundingBoxFormat, str): Format of the bounding box. + spatial_size (two-tuple of ints): Height and width of the corresponding image or video. + dtype (torch.dtype, optional): Desired data type of the bounding box. 
If omitted, will be inferred from + ``data``. + device (torch.device, optional): Desired device of the bounding box. If omitted and ``data`` is a + :class:`torch.Tensor`, the device is taken from it. Otherwise, the bounding box is constructed on the CPU. + requires_grad (bool, optional): Whether autograd should record operations on the bounding box. If omitted and + ``data`` is a :class:`torch.Tensor`, the value is taken from it. Otherwise, defaults to ``False``. + """ + format: BoundingBoxFormat spatial_size: Tuple[int, int] @@ -39,7 +62,7 @@ def __new__( tensor = cls._to_tensor(data, dtype=dtype, device=device, requires_grad=requires_grad) if isinstance(format, str): - format = BoundingBoxFormat.from_str(format.upper()) + format = BoundingBoxFormat[format.upper()] return cls._wrap(tensor, format=format, spatial_size=spatial_size) @@ -52,6 +75,20 @@ def wrap_like( format: Optional[BoundingBoxFormat] = None, spatial_size: Optional[Tuple[int, int]] = None, ) -> BoundingBox: + """Wrap a :class:`torch.Tensor` as :class:`BoundingBox` from a reference. + + Args: + other (BoundingBox): Reference bounding box. + tensor (Tensor): Tensor to be wrapped as :class:`BoundingBox` + format (BoundingBoxFormat, str, optional): Format of the bounding box. If omitted, it is taken from the + reference. + spatial_size (two-tuple of ints, optional): Height and width of the corresponding image or video. If + omitted, it is taken from the reference. + + """ + if isinstance(format, str): + format = BoundingBoxFormat[format.upper()] + return cls._wrap( tensor, format=format if format is not None else other.format, diff --git a/torchvision/datapoints/_image.py b/torchvision/datapoints/_image.py index 21dfe5a5cd6..e47a6c10fc3 100644 --- a/torchvision/datapoints/_image.py +++ b/torchvision/datapoints/_image.py @@ -10,6 +10,19 @@ class Image(Datapoint): + """[BETA] :class:`torch.Tensor` subclass for images. + + Args: + data (tensor-like, PIL.Image.Image): Any data that can be turned into a tensor with :func:`torch.as_tensor` as + well as PIL images. + dtype (torch.dtype, optional): Desired data type of the bounding box. If omitted, will be inferred from + ``data``. + device (torch.device, optional): Desired device of the bounding box. If omitted and ``data`` is a + :class:`torch.Tensor`, the device is taken from it. Otherwise, the bounding box is constructed on the CPU. + requires_grad (bool, optional): Whether autograd should record operations on the bounding box. If omitted and + ``data`` is a :class:`torch.Tensor`, the value is taken from it. Otherwise, defaults to ``False``. + """ + @classmethod def _wrap(cls, tensor: torch.Tensor) -> Image: image = tensor.as_subclass(cls) diff --git a/torchvision/datapoints/_mask.py b/torchvision/datapoints/_mask.py index bb70ec12224..0135d793d32 100644 --- a/torchvision/datapoints/_mask.py +++ b/torchvision/datapoints/_mask.py @@ -10,6 +10,19 @@ class Mask(Datapoint): + """[BETA] :class:`torch.Tensor` subclass for segmentation and detection masks. + + Args: + data (tensor-like, PIL.Image.Image): Any data that can be turned into a tensor with :func:`torch.as_tensor` as + well as PIL images. + dtype (torch.dtype, optional): Desired data type of the bounding box. If omitted, will be inferred from + ``data``. + device (torch.device, optional): Desired device of the bounding box. If omitted and ``data`` is a + :class:`torch.Tensor`, the device is taken from it. Otherwise, the bounding box is constructed on the CPU. 
+ requires_grad (bool, optional): Whether autograd should record operations on the bounding box. If omitted and + ``data`` is a :class:`torch.Tensor`, the value is taken from it. Otherwise, defaults to ``False``. + """ + @classmethod def _wrap(cls, tensor: torch.Tensor) -> Mask: return tensor.as_subclass(cls) diff --git a/torchvision/datapoints/_video.py b/torchvision/datapoints/_video.py index ab51c10233d..a6fbe2bd473 100644 --- a/torchvision/datapoints/_video.py +++ b/torchvision/datapoints/_video.py @@ -9,6 +9,18 @@ class Video(Datapoint): + """[BETA] :class:`torch.Tensor` subclass for videos. + + Args: + data (tensor-like): Any data that can be turned into a tensor with :func:`torch.as_tensor`. + dtype (torch.dtype, optional): Desired data type of the bounding box. If omitted, will be inferred from + ``data``. + device (torch.device, optional): Desired device of the bounding box. If omitted and ``data`` is a + :class:`torch.Tensor`, the device is taken from it. Otherwise, the bounding box is constructed on the CPU. + requires_grad (bool, optional): Whether autograd should record operations on the bounding box. If omitted and + ``data`` is a :class:`torch.Tensor`, the value is taken from it. Otherwise, defaults to ``False``. + """ + @classmethod def _wrap(cls, tensor: torch.Tensor) -> Video: video = tensor.as_subclass(cls) diff --git a/torchvision/transforms/transforms.py b/torchvision/transforms/transforms.py index 90cb0374eee..95eb9199ef3 100644 --- a/torchvision/transforms/transforms.py +++ b/torchvision/transforms/transforms.py @@ -105,7 +105,9 @@ def __repr__(self) -> str: class ToTensor: - """Convert a ``PIL Image`` or ``numpy.ndarray`` to tensor. This transform does not support torchscript. + """Convert a PIL Image or ndarray to tensor and scale the values accordingly. + + This transform does not support torchscript. Converts a PIL Image or numpy.ndarray (H x W x C) in the range [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] @@ -139,7 +141,9 @@ def __repr__(self) -> str: class PILToTensor: - """Convert a ``PIL Image`` to a tensor of the same type. This transform does not support torchscript. + """Convert a PIL Image to a tensor of the same type - this does not scale values. + + This transform does not support torchscript. Converts a PIL Image (H x W x C) to a Tensor of shape (C x H x W). """ @@ -166,7 +170,8 @@ def __repr__(self) -> str: class ConvertImageDtype(torch.nn.Module): - """Convert a tensor image to the given ``dtype`` and scale the values accordingly + """Convert a tensor image to the given ``dtype`` and scale the values accordingly. + This function does not support PIL Image. Args: @@ -194,7 +199,9 @@ def forward(self, image): class ToPILImage: - """Convert a tensor or an ndarray to PIL Image. This transform does not support torchscript. + """Convert a tensor or an ndarray to PIL Image - this does not scale values. + + This transform does not support torchscript. Converts a torch.*Tensor of shape C x H x W or a numpy ndarray of shape H x W x C to a PIL Image while preserving the value range. 
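The docstring updates above distinguish the conversion transforms that rescale values from those that keep the raw range. Below is a minimal illustrative sketch of that distinction, assuming a small uint8 RGB PIL image as input; the variable names are purely for illustration.

import torch
from PIL import Image
from torchvision import transforms

pil_image = Image.new("RGB", (4, 4), color=(255, 0, 0))

kept = transforms.PILToTensor()(pil_image)  # uint8 tensor, values stay in [0, 255]
scaled = transforms.ToTensor()(pil_image)  # float32 tensor, values scaled to [0.0, 1.0]
rescaled = transforms.ConvertImageDtype(torch.float32)(kept)  # uint8 -> float32, also scaled to [0.0, 1.0]
back = transforms.ToPILImage()(kept)  # back to a PIL image, value range preserved

assert kept.dtype == torch.uint8 and int(kept.max()) == 255
assert scaled.dtype == torch.float32 and float(scaled.max()) == 1.0
assert torch.allclose(scaled, rescaled)
assert isinstance(back, Image.Image)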
diff --git a/torchvision/transforms/v2/__init__.py b/torchvision/transforms/v2/__init__.py index 7ad72c00934..6573446a33a 100644 --- a/torchvision/transforms/v2/__init__.py +++ b/torchvision/transforms/v2/__init__.py @@ -40,7 +40,7 @@ TenCrop, ) from ._meta import ClampBoundingBox, ConvertBoundingBoxFormat, ConvertDtype, ConvertImageDtype -from ._misc import GaussianBlur, Identity, Lambda, LinearTransformation, Normalize, SanitizeBoundingBoxes, ToDtype +from ._misc import GaussianBlur, Identity, Lambda, LinearTransformation, Normalize, SanitizeBoundingBox, ToDtype from ._temporal import UniformTemporalSubsample from ._type_conversion import PILToTensor, ToImagePIL, ToImageTensor, ToPILImage diff --git a/torchvision/transforms/v2/_augment.py b/torchvision/transforms/v2/_augment.py index 157605d6f3c..937e3508a87 100644 --- a/torchvision/transforms/v2/_augment.py +++ b/torchvision/transforms/v2/_augment.py @@ -13,6 +13,38 @@ class RandomErasing(_RandomApplyTransform): + """[BETA] Randomly select a rectangle region in the input image or video and erase its pixels. + + .. v2betastatus:: RandomErasing transform + + This transform does not support PIL Image. + 'Random Erasing Data Augmentation' by Zhong et al. See https://arxiv.org/abs/1708.04896 + + Args: + p (float, optional): probability that the random erasing operation will be performed. + scale (tuple of float, optional): range of proportion of erased area against input image. + ratio (tuple of float, optional): range of aspect ratio of erased area. + value (number or tuple of numbers): erasing value. Default is 0. If a single int, it is used to + erase all pixels. If a tuple of length 3, it is used to erase + R, G, B channels respectively. + If a str of 'random', erasing each pixel with random values. + inplace (bool, optional): boolean to make this transform inplace. Default set to False. + + Returns: + Erased input. + + Example: + >>> from torchvision.transforms import v2 as transforms + >>> + >>> transform = transforms.Compose([ + >>> transforms.RandomHorizontalFlip(), + >>> transforms.PILToTensor(), + >>> transforms.ConvertImageDtype(torch.float), + >>> transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + >>> transforms.RandomErasing(), + >>> ]) + """ + _v1_transform_cls = _transforms.RandomErasing def _extract_params_for_v1_transform(self) -> Dict[str, Any]: diff --git a/torchvision/transforms/v2/_auto_augment.py b/torchvision/transforms/v2/_auto_augment.py index b4791755dc5..34c0ced43d2 100644 --- a/torchvision/transforms/v2/_auto_augment.py +++ b/torchvision/transforms/v2/_auto_augment.py @@ -162,6 +162,26 @@ def _apply_image_or_video_transform( class AutoAugment(_AutoAugmentBase): + r"""[BETA] AutoAugment data augmentation method based on + `"AutoAugment: Learning Augmentation Strategies from Data" `_. + + .. v2betastatus:: AutoAugment transform + + This transformation works on images and videos only. + + If the input is :class:`torch.Tensor`, it should be of type ``torch.uint8``, and it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + policy (AutoAugmentPolicy, optional): Desired policy enum defined by + :class:`torchvision.transforms.autoaugment.AutoAugmentPolicy`. Default is ``AutoAugmentPolicy.IMAGENET``. + interpolation (InterpolationMode, optional): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. 
Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + fill (sequence or number, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + """ _v1_transform_cls = _transforms.AutoAugment _AUGMENTATION_SPACE = { @@ -318,6 +338,29 @@ def forward(self, *inputs: Any) -> Any: class RandAugment(_AutoAugmentBase): + r"""[BETA] RandAugment data augmentation method based on + `"RandAugment: Practical automated data augmentation with a reduced search space" + `_. + + .. v2betastatus:: RandAugment transform + + This transformation works on images and videos only. + + If the input is :class:`torch.Tensor`, it should be of type ``torch.uint8``, and it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + num_ops (int, optional): Number of augmentation transformations to apply sequentially. + magnitude (int, optional): Magnitude for all the transformations. + num_magnitude_bins (int, optional): The number of different magnitude values. + interpolation (InterpolationMode, optional): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + fill (sequence or number, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + """ + _v1_transform_cls = _transforms.RandAugment _AUGMENTATION_SPACE = { "Identity": (lambda num_bins, height, width: None, False), @@ -379,6 +422,26 @@ def forward(self, *inputs: Any) -> Any: class TrivialAugmentWide(_AutoAugmentBase): + r"""[BETA] Dataset-independent data-augmentation with TrivialAugment Wide, as described in + `"TrivialAugment: Tuning-free Yet State-of-the-Art Data Augmentation" `_. + + .. v2betastatus:: TrivialAugmentWide transform + + This transformation works on images and videos only. + + If the input is :class:`torch.Tensor`, it should be of type ``torch.uint8``, and it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + num_magnitude_bins (int, optional): The number of different magnitude values. + interpolation (InterpolationMode, optional): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + fill (sequence or number, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + """ + _v1_transform_cls = _transforms.TrivialAugmentWide _AUGMENTATION_SPACE = { "Identity": (lambda num_bins, height, width: None, False), @@ -430,6 +493,31 @@ def forward(self, *inputs: Any) -> Any: class AugMix(_AutoAugmentBase): + r"""[BETA] AugMix data augmentation method based on + `"AugMix: A Simple Data Processing Method to Improve Robustness and Uncertainty" `_. + + .. v2betastatus:: AugMix transform + + This transformation works on images and videos only. 
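As a quick, hedged illustration of the input contract these auto-augmentation docstrings describe (a ``torch.uint8`` tensor of shape ``[..., 1 or 3, H, W]``), a minimal sketch that is not part of the patch:

import torch
from torchvision.transforms import v2

img = torch.randint(0, 256, (3, 224, 224), dtype=torch.uint8)  # uint8, [C, H, W]

augment = v2.RandAugment(num_ops=2, magnitude=9)
out = augment(img)
print(out.shape, out.dtype)  # torch.Size([3, 224, 224]) torch.uint8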
+ + If the input is :class:`torch.Tensor`, it should be of type ``torch.uint8``, and it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + severity (int, optional): The severity of base augmentation operators. Default is ``3``. + mixture_width (int, optional): The number of augmentation chains. Default is ``3``. + chain_depth (int, optional): The depth of augmentation chains. A negative value denotes stochastic depth sampled from the interval [1, 3]. + Default is ``-1``. + alpha (float, optional): The hyperparameter for the probability distributions. Default is ``1.0``. + all_ops (bool, optional): Use all operations (including brightness, contrast, color and sharpness). Default is ``True``. + interpolation (InterpolationMode, optional): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + fill (sequence or number, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + """ + _v1_transform_cls = _transforms.AugMix _PARTIAL_AUGMENTATION_SPACE = { diff --git a/torchvision/transforms/v2/_color.py b/torchvision/transforms/v2/_color.py index 64796e16ca4..4ad534c988b 100644 --- a/torchvision/transforms/v2/_color.py +++ b/torchvision/transforms/v2/_color.py @@ -11,6 +11,17 @@ class Grayscale(Transform): + """[BETA] Convert images or videos to grayscale. + + .. v2betastatus:: Grayscale transform + + If the input is a :class:`torch.Tensor`, it is expected + to have [..., 3 or 1, H, W] shape, where ... means an arbitrary number of leading dimensions + + Args: + num_output_channels (int): (1 or 3) number of channels desired for output image + """ + _v1_transform_cls = _transforms.Grayscale _transformed_types = ( @@ -29,6 +40,19 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomGrayscale(_RandomApplyTransform): + """[BETA] Randomly convert image or videos to grayscale with a probability of p (default 0.1). + + .. v2betastatus:: RandomGrayscale transform + + If the input is a :class:`torch.Tensor`, it is expected to have [..., 3 or 1, H, W] shape, + where ... means an arbitrary number of leading dimensions + + The output has the same number of channels as the input. + + Args: + p (float): probability that image should be converted to grayscale. + """ + _v1_transform_cls = _transforms.RandomGrayscale _transformed_types = ( @@ -50,6 +74,32 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class ColorJitter(Transform): + """[BETA] Randomly change the brightness, contrast, saturation and hue of an image or video. + + .. v2betastatus:: ColorJitter transform + + If the input is a :class:`torch.Tensor`, it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, mode "1", "I", "F" and modes with transparency (alpha channel) are not supported. + + Args: + brightness (float or tuple of float (min, max)): How much to jitter brightness. + brightness_factor is chosen uniformly from [max(0, 1 - brightness), 1 + brightness] + or the given [min, max]. Should be non negative numbers. + contrast (float or tuple of float (min, max)): How much to jitter contrast. 
+ contrast_factor is chosen uniformly from [max(0, 1 - contrast), 1 + contrast] + or the given [min, max]. Should be non-negative numbers. + saturation (float or tuple of float (min, max)): How much to jitter saturation. + saturation_factor is chosen uniformly from [max(0, 1 - saturation), 1 + saturation] + or the given [min, max]. Should be non-negative numbers. + hue (float or tuple of float (min, max)): How much to jitter hue. + hue_factor is chosen uniformly from [-hue, hue] or the given [min, max]. + Should have 0 <= hue <= 0.5 or -0.5 <= min <= max <= 0.5. + To jitter hue, the pixel values of the input image have to be non-negative for conversion to HSV space; + thus it does not work if you normalize your image to an interval with negative values, + or use an interpolation that generates negative values before using this function. + """ + _v1_transform_cls = _transforms.ColorJitter def _extract_params_for_v1_transform(self) -> Dict[str, Any]: @@ -129,6 +179,31 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: # TODO: This class seems to be untested class RandomPhotometricDistort(Transform): + """[BETA] Randomly distorts the image or video as used in `SSD: Single Shot + MultiBox Detector `_. + + .. v2betastatus:: RandomPhotometricDistort transform + + This transform relies on :class:`~torchvision.transforms.v2.ColorJitter` + under the hood to adjust the contrast, saturation, hue, brightness, and also + randomly permutes channels. + + Args: + brightness (tuple of float (min, max), optional): How much to jitter brightness. + brightness_factor is chosen uniformly from [min, max]. Should be non-negative numbers. + contrast (tuple of float (min, max), optional): How much to jitter contrast. + contrast_factor is chosen uniformly from [min, max]. Should be non-negative numbers. + saturation (tuple of float (min, max), optional): How much to jitter saturation. + saturation_factor is chosen uniformly from [min, max]. Should be non-negative numbers. + hue (tuple of float (min, max), optional): How much to jitter hue. + hue_factor is chosen uniformly from [min, max]. Should have -0.5 <= min <= max <= 0.5. + To jitter hue, the pixel values of the input image have to be non-negative for conversion to HSV space; + thus it does not work if you normalize your image to an interval with negative values, + or use an interpolation that generates negative values before using this function. + p (float, optional): probability that each distortion operation (contrast, saturation, ...) is applied. + Default is 0.5. + """ + _transformed_types = ( datapoints.Image, PIL.Image.Image, @@ -138,10 +213,10 @@ class RandomPhotometricDistort(Transform): def __init__( self, + brightness: Tuple[float, float] = (0.875, 1.125), contrast: Tuple[float, float] = (0.5, 1.5), saturation: Tuple[float, float] = (0.5, 1.5), hue: Tuple[float, float] = (-0.05, 0.05), - brightness: Tuple[float, float] = (0.875, 1.125), p: float = 0.5, ): super().__init__() @@ -205,6 +280,18 @@ def _transform( class RandomEqualize(_RandomApplyTransform): + """[BETA] Equalize the histogram of the given image or video with a given probability. + + .. v2betastatus:: RandomEqualize transform + + If the input is a :class:`torch.Tensor`, it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "P", "L" or "RGB". + + Args: + p (float): probability of the image being equalized.
Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomEqualize def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: @@ -212,6 +299,18 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomInvert(_RandomApplyTransform): + """[BETA] Inverts the colors of the given image or video with a given probability. + + .. v2betastatus:: RandomInvert transform + + If img is a Tensor, it is expected to be in [..., 1 or 3, H, W] format, + where ... means it can have an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + p (float): probability of the image being color inverted. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomInvert def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: @@ -219,6 +318,20 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomPosterize(_RandomApplyTransform): + """[BETA] Posterize the image or video with a given probability by reducing the + number of bits for each color channel. + + .. v2betastatus:: RandomPosterize transform + + If the input is a :class:`torch.Tensor`, it should be of type torch.uint8, + and it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + bits (int): number of bits to keep for each channel (0-8) + p (float): probability of the image being posterized. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomPosterize def __init__(self, bits: int, p: float = 0.5) -> None: @@ -230,6 +343,20 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomSolarize(_RandomApplyTransform): + """[BETA] Solarize the image or video with a given probability by inverting all pixel + values above a threshold. + + .. v2betastatus:: RandomSolarize transform + + If img is a Tensor, it is expected to be in [..., 1 or 3, H, W] format, + where ... means it can have an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + threshold (float): all pixels equal or above this value are inverted. + p (float): probability of the image being solarized. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomSolarize def __init__(self, threshold: float, p: float = 0.5) -> None: @@ -241,6 +368,18 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomAutocontrast(_RandomApplyTransform): + """[BETA] Autocontrast the pixels of the given image or video with a given probability. + + .. v2betastatus:: RandomAutocontrast transform + + If the input is a :class:`torch.Tensor`, it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + p (float): probability of the image being autocontrasted. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomAutocontrast def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: @@ -248,6 +387,20 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomAdjustSharpness(_RandomApplyTransform): + """[BETA] Adjust the sharpness of the image or video with a given probability. + + .. v2betastatus:: RandomAdjustSharpness transform + + If the input is a :class:`torch.Tensor`, + it is expected to have [..., 1 or 3, H, W] shape, where ... 
means an arbitrary number of leading dimensions. + + Args: + sharpness_factor (float): How much to adjust the sharpness. Can be + any non-negative number. 0 gives a blurred image, 1 gives the + original image while 2 increases the sharpness by a factor of 2. + p (float): probability of the image being sharpened. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomAdjustSharpness def __init__(self, sharpness_factor: float, p: float = 0.5) -> None: diff --git a/torchvision/transforms/v2/_container.py b/torchvision/transforms/v2/_container.py index 555010fda1e..fffef4157bd 100644 --- a/torchvision/transforms/v2/_container.py +++ b/torchvision/transforms/v2/_container.py @@ -1,4 +1,3 @@ -import warnings from typing import Any, Callable, Dict, List, Optional, Sequence, Union import torch @@ -9,6 +8,37 @@ class Compose(Transform): + """[BETA] Composes several transforms together. + + .. v2betastatus:: Compose transform + + This transform does not support torchscript. + Please, see the note below. + + Args: + transforms (list of ``Transform`` objects): list of transforms to compose. + + Example: + >>> transforms.Compose([ + >>> transforms.CenterCrop(10), + >>> transforms.PILToTensor(), + >>> transforms.ConvertImageDtype(torch.float), + >>> ]) + + .. note:: + In order to script the transformations, please use ``torch.nn.Sequential`` as below. + + >>> transforms = torch.nn.Sequential( + >>> transforms.CenterCrop(10), + >>> transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + >>> ) + >>> scripted_transforms = torch.jit.script(transforms) + + Make sure to use only scriptable transformations, i.e. that work with ``torch.Tensor``, does not require + `lambda` functions or ``PIL.Image``. + + """ + def __init__(self, transforms: Sequence[Callable]) -> None: super().__init__() if not isinstance(transforms, Sequence): @@ -29,6 +59,27 @@ def extra_repr(self) -> str: class RandomApply(Transform): + """[BETA] Apply randomly a list of transformations with a given probability. + + .. v2betastatus:: RandomApply transform + + .. note:: + In order to script the transformation, please use ``torch.nn.ModuleList`` as input instead of list/tuple of + transforms as shown below: + + >>> transforms = transforms.RandomApply(torch.nn.ModuleList([ + >>> transforms.ColorJitter(), + >>> ]), p=0.3) + >>> scripted_transforms = torch.jit.script(transforms) + + Make sure to use only scriptable transformations, i.e. that work with ``torch.Tensor``, does not require + `lambda` functions or ``PIL.Image``. + + Args: + transforms (sequence or torch.nn.Module): list of transformations + p (float): probability of applying the list of transforms + """ + _v1_transform_cls = _transforms.RandomApply def __init__(self, transforms: Union[Sequence[Callable], nn.ModuleList], p: float = 0.5) -> None: @@ -63,42 +114,55 @@ def extra_repr(self) -> str: class RandomChoice(Transform): + """[BETA] Apply single transformation randomly picked from a list. + + .. v2betastatus:: RandomChoice transform + + This transform does not support torchscript. + + Args: + transforms (sequence or torch.nn.Module): list of transformations + p (list of floats or None, optional): probability of each transform being picked. + If ``p`` doesn't sum to 1, it is automatically normalized. If ``None`` + (default), all transforms have the same probability. 
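The normalization behaviour of ``p`` described above (and the rename from ``probabilities`` shown in the next hunk) can be exercised like this; a minimal sketch, not part of the patch:

import torch
from torchvision.transforms import v2

# The weights don't need to sum to 1; RandomChoice normalizes them internally.
choice = v2.RandomChoice(
    [v2.RandomHorizontalFlip(p=1.0), v2.RandomVerticalFlip(p=1.0)],
    p=[3, 1],  # horizontal flip is picked three times as often as vertical flip
)

img = torch.randint(0, 256, (3, 32, 32), dtype=torch.uint8)
out = choice(img)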
+ """ + def __init__( self, transforms: Sequence[Callable], - probabilities: Optional[List[float]] = None, p: Optional[List[float]] = None, ) -> None: if not isinstance(transforms, Sequence): raise TypeError("Argument transforms should be a sequence of callables") - if p is not None: - warnings.warn( - "Argument p is deprecated and will be removed in a future release. " - "Please use probabilities argument instead." - ) - probabilities = p - - if probabilities is None: - probabilities = [1] * len(transforms) - elif len(probabilities) != len(transforms): - raise ValueError( - f"The number of probabilities doesn't match the number of transforms: " - f"{len(probabilities)} != {len(transforms)}" - ) + + if p is None: + p = [1] * len(transforms) + elif len(p) != len(transforms): + raise ValueError(f"Length of p doesn't match the number of transforms: {len(p)} != {len(transforms)}") super().__init__() self.transforms = transforms - total = sum(probabilities) - self.probabilities = [prob / total for prob in probabilities] + total = sum(p) + self.p = [prob / total for prob in p] def forward(self, *inputs: Any) -> Any: - idx = int(torch.multinomial(torch.tensor(self.probabilities), 1)) + idx = int(torch.multinomial(torch.tensor(self.p), 1)) transform = self.transforms[idx] return transform(*inputs) class RandomOrder(Transform): + """[BETA] Apply a list of transformations in a random order. + + .. v2betastatus:: RandomOrder transform + + This transform does not support torchscript. + + Args: + transforms (sequence or torch.nn.Module): list of transformations + """ + def __init__(self, transforms: Sequence[Callable]) -> None: if not isinstance(transforms, Sequence): raise TypeError("Argument transforms should be a sequence of callables") diff --git a/torchvision/transforms/v2/_deprecated.py b/torchvision/transforms/v2/_deprecated.py index bfb0d06239f..e900e853d2b 100644 --- a/torchvision/transforms/v2/_deprecated.py +++ b/torchvision/transforms/v2/_deprecated.py @@ -10,6 +10,31 @@ class ToTensor(Transform): + """[BETA] Convert a PIL Image or ndarray to tensor and scale the values accordingly. + + .. v2betastatus:: ToTensor transform + + .. warning:: + :class:`v2.ToTensor` is deprecated and will be removed in a future release. + Please use instead ``transforms.Compose([transforms.ToImageTensor(), transforms.ConvertImageDtype()])``. + + This transform does not support torchscript. + + + Converts a PIL Image or numpy.ndarray (H x W x C) in the range + [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] + if the PIL Image belongs to one of the modes (L, LA, P, I, F, RGB, YCbCr, RGBA, CMYK, 1) + or if the numpy.ndarray has dtype = np.uint8 + + In the other cases, tensors are returned without scaling. + + .. note:: + Because the input image is scaled to [0.0, 1.0], this transformation should not be used when + transforming target image masks. See the `references`_ for implementing the transforms for image masks. + + .. _references: https://github.com/pytorch/vision/tree/main/references/segmentation + """ + _transformed_types = (PIL.Image.Image, np.ndarray) def __init__(self) -> None: diff --git a/torchvision/transforms/v2/_geometry.py b/torchvision/transforms/v2/_geometry.py index f1eed87b9c0..59791c30b9d 100644 --- a/torchvision/transforms/v2/_geometry.py +++ b/torchvision/transforms/v2/_geometry.py @@ -26,6 +26,19 @@ class RandomHorizontalFlip(_RandomApplyTransform): + """[BETA] Horizontally flip the input with a given probability. + + .. 
v2betastatus:: RandomHorizontalFlip transform + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. + + Args: + p (float, optional): probability of the input being flipped. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomHorizontalFlip def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: @@ -33,6 +46,19 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomVerticalFlip(_RandomApplyTransform): + """[BETA] Vertically flip the input with a given probability. + + .. v2betastatus:: RandomVerticalFlip transform + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. + + Args: + p (float, optional): probability of the input being flipped. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomVerticalFlip def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: @@ -40,6 +66,64 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class Resize(Transform): + """[BETA] Resize the input to the given size. + + .. v2betastatus:: Resize transform + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. + + .. warning:: + The output image might be different depending on its type: when downsampling, the interpolation of PIL images + and tensors is slightly different, because PIL applies antialiasing. This may lead to significant differences + in the performance of a network. Therefore, it is preferable to train and serve a model with the same input + types. See also below the ``antialias`` parameter, which can help making the output of PIL images and tensors + closer. + + Args: + size (sequence or int): Desired output size. If size is a sequence like + (h, w), output size will be matched to this. If size is an int, + smaller edge of the image will be matched to this number. + i.e, if height > width, then image will be rescaled to + (size * height / width, size). + + .. note:: + In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``. + interpolation (InterpolationMode, optional): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``, + ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. 
+ max_size (int, optional): The maximum allowed for the longer edge of + the resized image: if the longer edge of the image is greater + than ``max_size`` after being resized according to ``size``, then + the image is resized again so that the longer edge is equal to + ``max_size``. As a result, ``size`` might be overruled, i.e. the + smaller edge may be shorter than ``size``. This is only supported + if ``size`` is an int (or a sequence of length 1 in torchscript + mode). + antialias (bool, optional): Whether to apply antialiasing. + It only affects **tensors** with bilinear or bicubic modes and it is + ignored otherwise: on PIL images, antialiasing is always applied on + bilinear or bicubic modes; on other modes (for PIL images and + tensors), antialiasing makes no sense and this parameter is ignored. + Possible values are: + + - ``True``: will apply antialiasing for bilinear or bicubic modes. + Other mode aren't affected. This is probably what you want to use. + - ``False``: will not apply antialiasing for tensors on any mode. PIL + images are still antialiased on bilinear or bicubic modes, because + PIL doesn't support no antialias. + - ``None``: equivalent to ``False`` for tensors and ``True`` for + PIL images. This value exists for legacy reasons and you probably + don't want to use it unless you really know what you are doing. + + The current default is ``None`` **but will change to** ``True`` **in + v0.17** for the PIL and Tensor backends to be consistent. + """ + _v1_transform_cls = _transforms.Resize def __init__( @@ -76,6 +160,23 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class CenterCrop(Transform): + """[BETA] Crop the input at the center. + + .. v2betastatus:: CenterCrop transform + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. + + If image size is smaller than output size along any edge, image is padded with 0 and then center cropped. + + Args: + size (sequence or int): Desired output size of the crop. If size is an + int instead of sequence like (h, w), a square crop (size, size) is + made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + """ + _v1_transform_cls = _transforms.CenterCrop def __init__(self, size: Union[int, Sequence[int]]): @@ -87,6 +188,55 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomResizedCrop(Transform): + """[BETA] Crop a random portion of the input and resize it to a given size. + + .. v2betastatus:: RandomResizedCrop transform + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. + + A crop of the original input is made: the crop has a random area (H * W) + and a random aspect ratio. This crop is finally resized to the given + size. This is popularly used to train the Inception networks. + + Args: + size (int or sequence): expected output size of the crop, for each edge. 
If size is an + int instead of sequence like (h, w), a square output size ``(size, size)`` is + made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + + .. note:: + In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``. + scale (tuple of float, optional): Specifies the lower and upper bounds for the random area of the crop, + before resizing. The scale is defined with respect to the area of the original image. + ratio (tuple of float, optional): lower and upper bounds for the random aspect ratio of the crop, before + resizing. + interpolation (InterpolationMode, optional): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``, + ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. + antialias (bool, optional): Whether to apply antialiasing. + It only affects **tensors** with bilinear or bicubic modes and it is + ignored otherwise: on PIL images, antialiasing is always applied on + bilinear or bicubic modes; on other modes (for PIL images and + tensors), antialiasing makes no sense and this parameter is ignored. + Possible values are: + + - ``True``: will apply antialiasing for bilinear or bicubic modes. + Other mode aren't affected. This is probably what you want to use. + - ``False``: will not apply antialiasing for tensors on any mode. PIL + images are still antialiased on bilinear or bicubic modes, because + PIL doesn't support no antialias. + - ``None``: equivalent to ``False`` for tensors and ``True`` for + PIL images. This value exists for legacy reasons and you probably + don't want to use it unless you really know what you are doing. + + The current default is ``None`` **but will change to** ``True`` **in + v0.17** for the PIL and Tensor backends to be consistent. + """ + _v1_transform_cls = _transforms.RandomResizedCrop def __init__( @@ -164,7 +314,24 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class FiveCrop(Transform): - """ + """[BETA] Crop the image or video into four corners and the central crop. + + .. v2betastatus:: FiveCrop transform + + If the input is a :class:`torch.Tensor` or a :class:`~torchvision.datapoints.Image` or a + :class:`~torchvision.datapoints.Video` it can have arbitrary number of leading batch dimensions. + For example, the image can have ``[..., C, H, W]`` shape. + + .. Note:: + This transform returns a tuple of images and there may be a mismatch in the number of + inputs and targets your Dataset returns. See below for an example of how to deal with + this. + + Args: + size (sequence or int): Desired output size of the crop. If size is an ``int`` + instead of sequence like (h, w), a square crop of size (size, size) is made. + If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + Example: >>> class BatchMultiCrop(transforms.Transform): ... def forward(self, sample: Tuple[Tuple[Union[datapoints.Image, datapoints.Video], ...], int]): @@ -209,8 +376,27 @@ def _check_inputs(self, flat_inputs: List[Any]) -> None: class TenCrop(Transform): - """ + """[BETA] Crop the image or video into four corners and the central crop plus the flipped version of + these (horizontal flipping is used by default). + + .. 
v2betastatus:: TenCrop transform + + If the input is a :class:`torch.Tensor` or a :class:`~torchvision.datapoints.Image` or a + :class:`~torchvision.datapoints.Video` it can have arbitrary number of leading batch dimensions. + For example, the image can have ``[..., C, H, W]`` shape. + See :class:`~torchvision.transforms.v2.FiveCrop` for an example. + + .. Note:: + This transform returns a tuple of images and there may be a mismatch in the number of + inputs and targets your Dataset returns. See below for an example of how to deal with + this. + + Args: + size (sequence or int): Desired output size of the crop. If size is an + int instead of sequence like (h, w), a square crop (size, size) is + made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + vertical_flip (bool, optional): Use vertical flipping instead of horizontal """ _v1_transform_cls = _transforms.TenCrop @@ -249,6 +435,45 @@ def _transform( class Pad(Transform): + """[BETA] Pad the input on all sides with the given "pad" value. + + .. v2betastatus:: Pad transform + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. + + Args: + padding (int or sequence): Padding on each border. If a single int is provided this + is used to pad all borders. If sequence of length 2 is provided this is the padding + on left/right and top/bottom respectively. If a sequence of length 4 is provided + this is the padding for the left, top, right and bottom borders respectively. + + .. note:: + In torchscript mode padding as single int is not supported, use a sequence of + length 1: ``[padding, ]``. + fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant. + Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively. + Fill value can be also a dictionary mapping data type to the fill value, e.g. + ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and + ``Mask`` will be filled with 0. + padding_mode (str, optional): Type of padding. Should be: constant, edge, reflect or symmetric. + Default is "constant". + + - constant: pads with a constant value, this value is specified with fill + + - edge: pads with the last value at the edge of the image. + + - reflect: pads with reflection of image without repeating the last value on the edge. + For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode + will result in [3, 2, 1, 2, 3, 4, 3, 2] + + - symmetric: pads with reflection of image repeating the last value on the edge. + For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode + will result in [2, 1, 1, 2, 3, 4, 4, 3] + """ + _v1_transform_cls = _transforms.Pad def _extract_params_for_v1_transform(self) -> Dict[str, Any]: @@ -284,6 +509,37 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomZoomOut(_RandomApplyTransform): + """[BETA] "Zoom out" transformation from + `"SSD: Single Shot MultiBox Detector" `_. + + .. v2betastatus:: RandomZoomOut transform + + This transformation randomly pads images, videos, bounding boxes and masks creating a zoom out effect. 
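The per-type ``fill`` dictionary introduced in the ``Pad`` docstring above can be used as follows; a minimal sketch assuming the 0.15 ``datapoints`` API referenced in this diff, not part of the patch:

import torch
from torchvision import datapoints
from torchvision.transforms import v2

img = datapoints.Image(torch.randint(0, 256, (3, 32, 32), dtype=torch.uint8))
mask = datapoints.Mask(torch.zeros(32, 32, dtype=torch.uint8))

# Pad the image with 127 and the mask with 0, mirroring the docstring example.
pad = v2.Pad(padding=4, fill={datapoints.Image: 127, datapoints.Mask: 0})
padded_img, padded_mask = pad(img, mask)
print(padded_img.shape, padded_mask.shape)  # torch.Size([3, 40, 40]) torch.Size([40, 40])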
+ Output spatial size is randomly sampled from original size up to a maximum size configured + with ``side_range`` parameter: + + .. code-block:: python + + r = uniform_sample(side_range[0], side_range[1]) + output_width = input_width * r + output_height = input_height * r + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. + + Args: + fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant. + Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively. + Fill value can be also a dictionary mapping data type to the fill value, e.g. + ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and + ``Mask`` will be filled with 0. + side_range (sequence of floats, optional): tuple of two floats defines minimum and maximum factors to + scale the input size. + p (float, optional): probability of the input being flipped. Default value is 0.5 + """ + def __init__( self, fill: Union[datapoints._FillType, Dict[Type, datapoints._FillType]] = 0, @@ -323,6 +579,39 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomRotation(Transform): + """[BETA] Rotate the input by angle. + + .. v2betastatus:: RandomRotation transform + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. + + Args: + degrees (sequence or number): Range of degrees to select from. + If degrees is a number instead of sequence like (min, max), the range of degrees + will be (-degrees, +degrees). + interpolation (InterpolationMode, optional): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. + expand (bool, optional): Optional expansion flag. + If true, expands the output to make it large enough to hold the entire rotated image. + If false or omitted, make the output image the same size as the input image. + Note that the expand flag assumes rotation around the center and no translation. + center (sequence, optional): Optional center of rotation, (x, y). Origin is the upper left corner. + Default is the center of the image. + fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant. + Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively. + Fill value can be also a dictionary mapping data type to the fill value, e.g. + ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and + ``Mask`` will be filled with 0. + + .. 
_filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters + + """ + _v1_transform_cls = _transforms.RandomRotation def __init__( @@ -330,8 +619,8 @@ def __init__( degrees: Union[numbers.Number, Sequence], interpolation: Union[InterpolationMode, int] = InterpolationMode.NEAREST, expand: bool = False, - fill: Union[datapoints._FillType, Dict[Type, datapoints._FillType]] = 0, center: Optional[List[float]] = None, + fill: Union[datapoints._FillType, Dict[Type, datapoints._FillType]] = 0, ) -> None: super().__init__() self.degrees = _setup_angle(degrees, name="degrees", req_sizes=(2,)) @@ -363,6 +652,47 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomAffine(Transform): + """[BETA] Random affine transformation the input keeping center invariant. + + .. v2betastatus:: RandomAffine transform + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. + + Args: + degrees (sequence or number): Range of degrees to select from. + If degrees is a number instead of sequence like (min, max), the range of degrees + will be (-degrees, +degrees). Set to 0 to deactivate rotations. + translate (tuple, optional): tuple of maximum absolute fraction for horizontal + and vertical translations. For example translate=(a, b), then horizontal shift + is randomly sampled in the range -img_width * a < dx < img_width * a and vertical shift is + randomly sampled in the range -img_height * b < dy < img_height * b. Will not translate by default. + scale (tuple, optional): scaling factor interval, e.g (a, b), then scale is + randomly sampled from the range a <= scale <= b. Will keep original scale by default. + shear (sequence or number, optional): Range of degrees to select from. + If shear is a number, a shear parallel to the x-axis in the range (-shear, +shear) + will be applied. Else if shear is a sequence of 2 values a shear parallel to the x-axis in the + range (shear[0], shear[1]) will be applied. Else if shear is a sequence of 4 values, + an x-axis shear in (shear[0], shear[1]) and y-axis shear in (shear[2], shear[3]) will be applied. + Will not apply shear by default. + interpolation (InterpolationMode, optional): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. + fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant. + Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively. + Fill value can be also a dictionary mapping data type to the fill value, e.g. + ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and + ``Mask`` will be filled with 0. + center (sequence, optional): Optional center of rotation, (x, y). Origin is the upper left corner. + Default is the center of the image. + + .. 
_filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters + + """ + _v1_transform_cls = _transforms.RandomAffine def __init__( @@ -443,6 +773,52 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomCrop(Transform): + """[BETA] Crop the input at a random location. + + .. v2betastatus:: RandomCrop transform + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. + + Args: + size (sequence or int): Desired output size of the crop. If size is an + int instead of sequence like (h, w), a square crop (size, size) is + made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + padding (int or sequence, optional): Optional padding on each border + of the image. Default is None. If a single int is provided this + is used to pad all borders. If sequence of length 2 is provided this is the padding + on left/right and top/bottom respectively. If a sequence of length 4 is provided + this is the padding for the left, top, right and bottom borders respectively. + + .. note:: + In torchscript mode padding as single int is not supported, use a sequence of + length 1: ``[padding, ]``. + pad_if_needed (boolean, optional): It will pad the image if smaller than the + desired size to avoid raising an exception. Since cropping is done + after padding, the padding seems to be done at a random offset. + fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant. + Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively. + Fill value can be also a dictionary mapping data type to the fill value, e.g. + ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and + ``Mask`` will be filled with 0. + padding_mode (str, optional): Type of padding. Should be: constant, edge, reflect or symmetric. + Default is constant. + + - constant: pads with a constant value, this value is specified with fill + + - edge: pads with the last value at the edge of the image. + + - reflect: pads with reflection of image without repeating the last value on the edge. + For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode + will result in [3, 2, 1, 2, 3, 4, 3, 2] + + - symmetric: pads with reflection of image repeating the last value on the edge. + For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode + will result in [2, 1, 1, 2, 3, 4, 4, 3] + """ + _v1_transform_cls = _transforms.RandomCrop def _extract_params_for_v1_transform(self) -> Dict[str, Any]: @@ -552,14 +928,38 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomPerspective(_RandomApplyTransform): + """[BETA] Perform a random perspective transformation of the input with a given probability. + + .. v2betastatus:: RandomPerspective transform + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. 
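To illustrate the ``pad_if_needed`` behaviour documented for ``RandomCrop`` above, a minimal sketch (not part of the patch):

import torch
from torchvision.transforms import v2

img = torch.randint(0, 256, (3, 48, 64), dtype=torch.uint8)

# The input height (48) is smaller than the crop size (56), so the image is
# padded first instead of raising an exception.
crop = v2.RandomCrop(size=(56, 56), pad_if_needed=True, fill=0, padding_mode="constant")
out = crop(img)
print(out.shape)  # torch.Size([3, 56, 56])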
+ + Args: + distortion_scale (float, optional): argument to control the degree of distortion and ranges from 0 to 1. + Default is 0.5. + p (float, optional): probability of the input being transformed. Default is 0.5. + interpolation (InterpolationMode, optional): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. + fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant. + Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively. + Fill value can be also a dictionary mapping data type to the fill value, e.g. + ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and + ``Mask`` will be filled with 0. + """ + _v1_transform_cls = _transforms.RandomPerspective def __init__( self, distortion_scale: float = 0.5, - fill: Union[datapoints._FillType, Dict[Type, datapoints._FillType]] = 0, - interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR, p: float = 0.5, + interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR, + fill: Union[datapoints._FillType, Dict[Type, datapoints._FillType]] = 0, ) -> None: super().__init__(p=p) @@ -614,14 +1014,54 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class ElasticTransform(Transform): + """[BETA] Transform the input with elastic transformations. + + .. v2betastatus:: ElasticTransform transform + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. + + Given alpha and sigma, it will generate displacement + vectors for all pixels based on random offsets. Alpha controls the strength + and sigma controls the smoothness of the displacements. + The displacements are added to an identity grid and the resulting grid is + used to transform the input. + + .. note:: + The implementation used to transform bounding boxes is approximate (not exact). + We construct an approximation of the inverse grid as ``inverse_grid = identity - displacement``. + This is not an exact inverse of the grid used to transform images, i.e. ``grid = identity + displacement``. + Our assumption is that ``displacement * displacement`` is small and can be ignored. + Large displacements would lead to large errors in the approximation. + + Applications: + Randomly transforms the morphology of objects in images and produces a + see-through-water-like effect. + + Args: + alpha (float or sequence of floats, optional): Magnitude of displacements. Default is 50.0. + sigma (float or sequence of floats, optional): Smoothness of displacements. Default is 5.0. + interpolation (InterpolationMode, optional): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
+ fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant. + Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively. + Fill value can be also a dictionary mapping data type to the fill value, e.g. + ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and + ``Mask`` will be filled with 0. + """ + _v1_transform_cls = _transforms.ElasticTransform def __init__( self, alpha: Union[float, Sequence[float]] = 50.0, sigma: Union[float, Sequence[float]] = 5.0, - fill: Union[datapoints._FillType, Dict[Type, datapoints._FillType]] = 0, interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR, + fill: Union[datapoints._FillType, Dict[Type, datapoints._FillType]] = 0, ) -> None: super().__init__() self.alpha = _setup_float_or_seq(alpha, "alpha", 2) @@ -665,6 +1105,34 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomIoUCrop(Transform): + """[BETA] Random IoU crop transformation from + `"SSD: Single Shot MultiBox Detector" `_. + + .. v2betastatus:: RandomIoUCrop transform + + This transformation requires an image or video data and ``datapoints.BoundingBox`` in the input. + + .. warning:: + In order to properly remove the bounding boxes below the IoU threshold, `RandomIoUCrop` + must be followed by :class:`~torchvision.transforms.v2.SanitizeBoundingBox`, either immediately + after or later in the transforms pipeline. + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. + + Args: + min_scale (float, optional): Minimum factors to scale the input size. + max_scale (float, optional): Maximum factors to scale the input size. + min_aspect_ratio (float, optional): Minimum aspect ratio for the cropped image or video. + max_aspect_ratio (float, optional): Maximum aspect ratio for the cropped image or video. + sampler_options (list of float, optional): List of minimal IoU (Jaccard) overlap between all the boxes and + a cropped image or video. Default, ``None`` which corresponds to ``[0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]`` + trials (int, optional): Number of trials to find a crop for a given value of minimal IoU (Jaccard) overlap. + Default, 40. + """ + def __init__( self, min_scale: float = 0.3, @@ -754,13 +1222,52 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: if isinstance(output, datapoints.BoundingBox): # We "mark" the invalid boxes as degenreate, and they can be - # removed by a later call to SanitizeBoundingBoxes() + # removed by a later call to SanitizeBoundingBox() output[~params["is_within_crop_area"]] = 0 return output class ScaleJitter(Transform): + """[BETA] Perform Large Scale Jitter on the input according to + `"Simple Copy-Paste is a Strong Data Augmentation Method for Instance Segmentation" `_. + + .. v2betastatus:: ScaleJitter transform + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. 
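The warning in the ``RandomIoUCrop`` docstring above can be illustrated with a small detection-style pipeline; a hedged sketch, not part of the patch, that assumes the 0.15 ``datapoints`` API and that the default ``labels_getter`` of ``SanitizeBoundingBox`` finds the ``"labels"`` key in the target dict:

import torch
from torchvision import datapoints
from torchvision.transforms import v2

img = datapoints.Image(torch.randint(0, 256, (3, 100, 100), dtype=torch.uint8))
boxes = datapoints.BoundingBox(
    [[10.0, 10.0, 50.0, 50.0], [60.0, 60.0, 90.0, 90.0]],
    format=datapoints.BoundingBoxFormat.XYXY,
    spatial_size=(100, 100),
)
target = {"boxes": boxes, "labels": torch.tensor([1, 2])}

# Boxes zeroed out by RandomIoUCrop are dropped by the follow-up SanitizeBoundingBox,
# together with their labels, as the warning above recommends.
pipeline = v2.Compose([v2.RandomIoUCrop(), v2.SanitizeBoundingBox()])
out_img, out_target = pipeline(img, target)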
+ + Args: + target_size (tuple of int): Target size. This parameter defines base scale for jittering, + e.g. ``min(target_size[0] / width, target_size[1] / height)``. + scale_range (tuple of float, optional): Minimum and maximum of the scale range. Default, ``(0.1, 2.0)``. + interpolation (InterpolationMode, optional): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``, + ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. + antialias (bool, optional): Whether to apply antialiasing. + It only affects **tensors** with bilinear or bicubic modes and it is + ignored otherwise: on PIL images, antialiasing is always applied on + bilinear or bicubic modes; on other modes (for PIL images and + tensors), antialiasing makes no sense and this parameter is ignored. + Possible values are: + + - ``True``: will apply antialiasing for bilinear or bicubic modes. + Other mode aren't affected. This is probably what you want to use. + - ``False``: will not apply antialiasing for tensors on any mode. PIL + images are still antialiased on bilinear or bicubic modes, because + PIL doesn't support no antialias. + - ``None``: equivalent to ``False`` for tensors and ``True`` for + PIL images. This value exists for legacy reasons and you probably + don't want to use it unless you really know what you are doing. + + The current default is ``None`` **but will change to** ``True`` **in + v0.17** for the PIL and Tensor backends to be consistent. + """ + def __init__( self, target_size: Tuple[int, int], @@ -789,6 +1296,43 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomShortestSize(Transform): + """[BETA] Randomly resize the input. + + .. v2betastatus:: RandomShortestSize transform + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. + + Args: + min_size (int or sequence of int): Minimum spatial size. Single integer value or a sequence of integer values. + max_size (int, optional): Maximum spatial size. Default, None. + interpolation (InterpolationMode, optional): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``, + ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. + antialias (bool, optional): Whether to apply antialiasing. + It only affects **tensors** with bilinear or bicubic modes and it is + ignored otherwise: on PIL images, antialiasing is always applied on + bilinear or bicubic modes; on other modes (for PIL images and + tensors), antialiasing makes no sense and this parameter is ignored. + Possible values are: + + - ``True``: will apply antialiasing for bilinear or bicubic modes. + Other mode aren't affected. This is probably what you want to use. 
+ - ``False``: will not apply antialiasing for tensors on any mode. PIL + images are still antialiased on bilinear or bicubic modes, because + PIL doesn't support no antialias. + - ``None``: equivalent to ``False`` for tensors and ``True`` for + PIL images. This value exists for legacy reasons and you probably + don't want to use it unless you really know what you are doing. + + The current default is ``None`` **but will change to** ``True`` **in + v0.17** for the PIL and Tensor backends to be consistent. + """ + def __init__( self, min_size: Union[List[int], Tuple[int], int], @@ -820,6 +1364,54 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomResize(Transform): + """[BETA] Randomly resize the input. + + .. v2betastatus:: RandomResize transform + + This transformation can be used together with ``RandomCrop`` as data augmentations to train + models on image segmentation task. + + Output spatial size is randomly sampled from the interval ``[min_size, max_size]``: + + .. code-block:: python + + size = uniform_sample(min_size, max_size) + output_width = size + output_height = size + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. + + Args: + min_size (int): Minimum output size for random sampling + max_size (int): Maximum output size for random sampling + interpolation (InterpolationMode, optional): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``, + ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. + antialias (bool, optional): Whether to apply antialiasing. + It only affects **tensors** with bilinear or bicubic modes and it is + ignored otherwise: on PIL images, antialiasing is always applied on + bilinear or bicubic modes; on other modes (for PIL images and + tensors), antialiasing makes no sense and this parameter is ignored. + Possible values are: + + - ``True``: will apply antialiasing for bilinear or bicubic modes. + Other mode aren't affected. This is probably what you want to use. + - ``False``: will not apply antialiasing for tensors on any mode. PIL + images are still antialiased on bilinear or bicubic modes, because + PIL doesn't support no antialias. + - ``None``: equivalent to ``False`` for tensors and ``True`` for + PIL images. This value exists for legacy reasons and you probably + don't want to use it unless you really know what you are doing. + + The current default is ``None`` **but will change to** ``True`` **in + v0.17** for the PIL and Tensor backends to be consistent. + """ + def __init__( self, min_size: int, diff --git a/torchvision/transforms/v2/_meta.py b/torchvision/transforms/v2/_meta.py index 0d1544094ca..b7e2a42259f 100644 --- a/torchvision/transforms/v2/_meta.py +++ b/torchvision/transforms/v2/_meta.py @@ -9,6 +9,16 @@ class ConvertBoundingBoxFormat(Transform): + """[BETA] Convert bounding box coordinates to the given ``format``, eg from "CXCYWH" to "XYXY". + + .. 
+
+    Args:
+        format (str or datapoints.BoundingBoxFormat): Output bounding box format.
+            Possible values are defined by :class:`~torchvision.datapoints.BoundingBoxFormat` and
+            string values match the enums, e.g. "XYXY" or "XYWH" etc.
+    """
+
     _transformed_types = (datapoints.BoundingBox,)

     def __init__(self, format: Union[str, datapoints.BoundingBoxFormat]) -> None:
@@ -22,6 +32,27 @@ def _transform(self, inpt: datapoints.BoundingBox, params: Dict[str, Any]) -> da


 class ConvertDtype(Transform):
+    """[BETA] Convert input image or video to the given ``dtype`` and scale the values accordingly.
+
+    .. v2betastatus:: ConvertDtype transform
+
+    This function does not support PIL Image.
+
+    Args:
+        dtype (torch.dtype): Desired data type of the output.
+
+    .. note::
+
+        When converting from a smaller to a larger integer ``dtype``, the maximum values are **not** mapped exactly.
+        If converted back and forth, this mismatch has no effect.
+
+    Raises:
+        RuntimeError: When trying to cast :class:`torch.float32` to :class:`torch.int32` or :class:`torch.int64` as
+            well as for trying to cast :class:`torch.float64` to :class:`torch.int64`. These conversions might lead to
+            overflow errors since the floating point ``dtype`` cannot store consecutive integers over the whole range
+            of the integer ``dtype``.
+    """
+
     _v1_transform_cls = _transforms.ConvertImageDtype

     _transformed_types = (is_simple_tensor, datapoints.Image, datapoints.Video)
@@ -42,6 +73,14 @@ def _transform(


 class ClampBoundingBox(Transform):
+    """[BETA] Clamp bounding boxes to their corresponding image dimensions.
+
+    The clamping is done according to the bounding boxes' ``spatial_size`` meta-data.
+
+    .. v2betastatus:: ClampBoundingBox transform
+
+    """
+
     _transformed_types = (datapoints.BoundingBox,)

     def _transform(self, inpt: datapoints.BoundingBox, params: Dict[str, Any]) -> datapoints.BoundingBox:
diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py
index 6dd0755cfbb..c9b9025ebd9 100644
--- a/torchvision/transforms/v2/_misc.py
+++ b/torchvision/transforms/v2/_misc.py
@@ -1,7 +1,7 @@
 import collections
 import warnings
 from contextlib import suppress
-from typing import Any, Callable, cast, Dict, List, Optional, Sequence, Type, Union
+from typing import Any, Callable, cast, Dict, List, Mapping, Optional, Sequence, Type, Union

 import PIL.Image

@@ -15,12 +15,23 @@
 from .utils import has_any, is_simple_tensor, query_bounding_box


+# TODO: do we want/need to expose this?
 class Identity(Transform):
     def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
         return inpt


 class Lambda(Transform):
+    """[BETA] Apply a user-defined function as a transform.
+
+    .. v2betastatus:: Lambda transform
+
+    This transform does not support torchscript.
+
+    Args:
+        lambd (function): Lambda/function to be used for transform.
+    """
+
     def __init__(self, lambd: Callable[[Any], Any], *types: Type):
         super().__init__()
         self.lambd = lambd
@@ -42,6 +53,26 @@ def extra_repr(self) -> str:


 class LinearTransformation(Transform):
+    """[BETA] Transform a tensor image or video with a square transformation matrix and a mean_vector computed offline.
+
+    .. v2betastatus:: LinearTransformation transform
+
+    This transform does not support PIL Image.
+    Given transformation_matrix and mean_vector, will flatten the torch.*Tensor and
+    subtract mean_vector from it, which is then followed by computing the dot
+    product with the transformation matrix and then reshaping the tensor to its
+    original shape.
+
+    Applications:
+        whitening transformation: Suppose X is a column vector of zero-centered data.
+        Then compute the data covariance matrix [D x D] with torch.mm(X.t(), X),
+        perform SVD on this matrix and pass it as transformation_matrix.
+
+    Args:
+        transformation_matrix (Tensor): tensor [D x D], D = C x H x W
+        mean_vector (Tensor): tensor [D], D = C x H x W
+    """
+
     _v1_transform_cls = _transforms.LinearTransformation

     _transformed_types = (is_simple_tensor, datapoints.Image, datapoints.Video)
@@ -105,6 +136,26 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:


 class Normalize(Transform):
+    """[BETA] Normalize a tensor image or video with mean and standard deviation.
+
+    .. v2betastatus:: Normalize transform
+
+    This transform does not support PIL Image.
+    Given mean: ``(mean[1],...,mean[n])`` and std: ``(std[1],..,std[n])`` for ``n``
+    channels, this transform will normalize each channel of the input
+    ``torch.*Tensor`` i.e.,
+    ``output[channel] = (input[channel] - mean[channel]) / std[channel]``
+
+    .. note::
+        This transform acts out of place, i.e., it does not mutate the input tensor.
+
+    Args:
+        mean (sequence): Sequence of means for each channel.
+        std (sequence): Sequence of standard deviations for each channel.
+        inplace (bool, optional): Bool to make this operation in-place.
+
+    """
+
     _v1_transform_cls = _transforms.Normalize

     _transformed_types = (datapoints.Image, is_simple_tensor, datapoints.Video)
@@ -125,6 +176,21 @@ def _transform(


 class GaussianBlur(Transform):
+    """[BETA] Blurs the image with a randomly chosen Gaussian blur.
+
+    .. v2betastatus:: GaussianBlur transform
+
+    If the input is a Tensor, it is expected
+    to have [..., C, H, W] shape, where ... means an arbitrary number of leading dimensions.
+
+    Args:
+        kernel_size (int or sequence): Size of the Gaussian kernel.
+        sigma (float or tuple of float (min, max)): Standard deviation to be used for
+            creating the kernel to perform blurring. If float, sigma is fixed. If it is a tuple
+            of float (min, max), sigma is chosen uniformly at random to lie in the
+            given range.
+    """
+
     _v1_transform_cls = _transforms.GaussianBlur

     def __init__(
@@ -157,6 +223,17 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:


 class ToDtype(Transform):
+    """[BETA] Converts the input to a specific dtype - this does not scale values.
+
+    .. v2betastatus:: ToDtype transform
+
+    Args:
+        dtype (``torch.dtype`` or dict of ``Datapoint`` -> ``torch.dtype``): The dtype to convert to.
+            A dict can be passed to specify per-datapoint conversions, e.g.
+            ``dtype={datapoints.Image: torch.float32, datapoints.Video:
+            torch.float64}``.
+    """
+
     _transformed_types = (torch.Tensor,)

     def __init__(self, dtype: Union[torch.dtype, Dict[Type, Optional[torch.dtype]]]) -> None:
@@ -178,10 +255,34 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
         return inpt.to(dtype=dtype)


-class SanitizeBoundingBoxes(Transform):
-    # This removes boxes and their corresponding labels:
-    # - small or degenerate bboxes based on min_size (this includes those where X2 <= X1 or Y2 <= Y1)
-    # - boxes with any coordinate outside the range of the image (negative, or > spatial_size)
+class SanitizeBoundingBox(Transform):
+    """[BETA] Remove degenerate/invalid bounding boxes and their corresponding labels and masks.
+
+    .. v2betastatus:: SanitizeBoundingBox transform
+
+    This transform removes bounding boxes and their associated labels/masks that:
+
+    - are below a given ``min_size``: by default this also removes degenerate boxes that have e.g. X2 <= X1.
+    - have any coordinate outside of their corresponding image. You may want to
+      call :class:`~torchvision.transforms.v2.ClampBoundingBox` first to avoid undesired removals.
+
+    It is recommended to call it at the end of a pipeline, before passing the
+    input to the models. It is critical to call this transform if
+    :class:`~torchvision.transforms.v2.RandomIoUCrop` was called.
+    If you want to be extra careful, you may call it after all transforms that
+    may modify bounding boxes, but once at the end should be enough in most
+    cases.
+
+    Args:
+        min_size (float, optional): The size below which bounding boxes are removed. Default is 1.
+        labels_getter (callable or str or None, optional): Indicates how to identify the labels in the input.
+            It can be a str in which case the input is expected to be a dict, and ``labels_getter`` then specifies
+            the key whose value corresponds to the labels. It can also be a callable that takes the same input
+            as the transform, and returns the labels.
+            By default, this will try to find a "labels" key in the input, if
+            the input is a dict or a tuple whose second element is a dict.
+            This heuristic should work well with a lot of datasets, including the built-in torchvision datasets.
+    """

     def __init__(
         self,
@@ -201,7 +302,9 @@ def __init__(
         elif callable(labels_getter):
             self._labels_getter = labels_getter
         elif isinstance(labels_getter, str):
-            self._labels_getter = lambda inputs: inputs[labels_getter]
+            self._labels_getter = lambda inputs: SanitizeBoundingBox._get_dict_or_second_tuple_entry(inputs)[
+                labels_getter  # type: ignore[index]
+            ]
         elif labels_getter is None:
             self._labels_getter = None
         else:
@@ -210,10 +313,27 @@ def __init__(
                 f"Got {labels_getter} of type {type(labels_getter)}."
             )

+    @staticmethod
+    def _get_dict_or_second_tuple_entry(inputs: Any) -> Mapping[str, Any]:
+        # datasets outputs may be plain dicts like {"img": ..., "labels": ..., "bbox": ...}
+        # or tuples like (img, {"labels":..., "bbox": ...})
+        # This hacky helper accounts for both structures.
+        if isinstance(inputs, tuple):
+            inputs = inputs[1]
+
+        if not isinstance(inputs, collections.abc.Mapping):
+            raise ValueError(
+                f"If labels_getter is a str or 'default', "
+                f"then the input to forward() must be a dict or a tuple whose second element is a dict."
+                f" Got {type(inputs)} instead."
+            )
+        return inputs
+
     @staticmethod
     def _find_labels_default_heuristic(inputs: Dict[str, Any]) -> Optional[torch.Tensor]:
-        # Tries to find a "label" key, otherwise tries for the first key that contains "label" - case insensitive
+        # Tries to find a "labels" key, otherwise tries for the first key that contains "label" - case insensitive
         # Returns None if nothing is found
+        inputs = SanitizeBoundingBox._get_dict_or_second_tuple_entry(inputs)
         candidate_key = None
         with suppress(StopIteration):
             candidate_key = next(key for key in inputs.keys() if key.lower() == "labels")
@@ -230,12 +350,6 @@ def _find_labels_default_heuristic(inputs: Dict[str, Any]) -> Optional[torch.Ten
     def forward(self, *inputs: Any) -> Any:
         inputs = inputs if len(inputs) > 1 else inputs[0]

-        if isinstance(self.labels_getter, str) and not isinstance(inputs, collections.abc.Mapping):
-            raise ValueError(
-                f"If labels_getter is a str or 'default' (got {self.labels_getter}), "
-                f"then the input to forward() must be a dict. Got {type(inputs)} instead."
-            )
-
         if self._labels_getter is None:
             labels = None
         else:
diff --git a/torchvision/transforms/v2/_temporal.py b/torchvision/transforms/v2/_temporal.py
index b26d6b0450f..df4ad66643a 100644
--- a/torchvision/transforms/v2/_temporal.py
+++ b/torchvision/transforms/v2/_temporal.py
@@ -7,6 +7,19 @@


 class UniformTemporalSubsample(Transform):
+    """[BETA] Uniformly subsample ``num_samples`` indices from the temporal dimension of the video.
+
+    .. v2betastatus:: UniformTemporalSubsample transform
+
+    Videos are expected to be of shape ``[..., T, C, H, W]``, where ``T`` denotes the temporal dimension.
+
+    When ``num_samples`` is larger than the size of the temporal dimension of the video, it
+    will sample frames based on nearest neighbor interpolation.
+
+    Args:
+        num_samples (int): The number of equispaced samples to be selected.
+    """
+
     _transformed_types = (is_simple_tensor, datapoints.Video)

     def __init__(self, num_samples: int):
diff --git a/torchvision/transforms/v2/_type_conversion.py b/torchvision/transforms/v2/_type_conversion.py
index 984d5ba50c0..60f44c5d3db 100644
--- a/torchvision/transforms/v2/_type_conversion.py
+++ b/torchvision/transforms/v2/_type_conversion.py
@@ -11,6 +11,15 @@


 class PILToTensor(Transform):
+    """[BETA] Convert a PIL Image to a tensor of the same type - this does not scale values.
+
+    .. v2betastatus:: PILToTensor transform
+
+    This transform does not support torchscript.
+
+    Converts a PIL Image (H x W x C) to a Tensor of shape (C x H x W).
+    """
+
     _transformed_types = (PIL.Image.Image,)

     def _transform(self, inpt: PIL.Image.Image, params: Dict[str, Any]) -> torch.Tensor:
@@ -18,6 +27,14 @@ def _transform(self, inpt: PIL.Image.Image, params: Dict[str, Any]) -> torch.Ten


 class ToImageTensor(Transform):
+    """[BETA] Convert a tensor, ndarray, or PIL Image to :class:`~torchvision.datapoints.Image`;
+    this does not scale values.
+
+    .. v2betastatus:: ToImageTensor transform
+
+    This transform does not support torchscript.
+    """
+
     _transformed_types = (is_simple_tensor, PIL.Image.Image, np.ndarray)

     def _transform(
@@ -27,6 +44,27 @@ def _transform(


 class ToImagePIL(Transform):
+    """[BETA] Convert a tensor or an ndarray to PIL Image - this does not scale values.
+
+    .. v2betastatus:: ToImagePIL transform
+
+    This transform does not support torchscript.
+
+    Converts a torch.*Tensor of shape C x H x W or a numpy ndarray of shape
+    H x W x C to a PIL Image while preserving the value range.
+
+    Args:
+        mode (`PIL.Image mode`_): Color space and pixel depth of input data (optional).
+            If ``mode`` is ``None`` (default), there are some assumptions made about the input data:
+            - If the input has 4 channels, the ``mode`` is assumed to be ``RGBA``.
+            - If the input has 3 channels, the ``mode`` is assumed to be ``RGB``.
+            - If the input has 2 channels, the ``mode`` is assumed to be ``LA``.
+            - If the input has 1 channel, the ``mode`` is determined by the data type (i.e. ``int``, ``float``,
+              ``short``).
+
+    .. _PIL.Image mode: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#concept-modes
+    """
+
     _transformed_types = (is_simple_tensor, datapoints.Image, np.ndarray)

     def __init__(self, mode: Optional[str] = None) -> None:
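
The docstrings added above describe each v2 transform in isolation; the sketch below is not part of the patch, but it illustrates how a few of the documented transforms compose on a detection-style sample and how the ``labels_getter`` / degenerate-box behaviour of ``SanitizeBoundingBox`` plays out. It assumes the 0.15-era datapoints constructors (in particular ``datapoints.BoundingBox(..., format=..., spatial_size=...)``) and a sample layout (a dict carrying a "labels" entry) matching the default heuristic described in the docstring; treat it as an illustration rather than part of the API surface.

    # Minimal usage sketch, assuming torchvision 0.15 with the v2 transforms and datapoints API.
    import torch
    from torchvision import datapoints
    from torchvision.transforms import v2

    H, W = 256, 256
    sample = {
        "img": datapoints.Image(torch.randint(0, 256, (3, H, W), dtype=torch.uint8)),
        "boxes": datapoints.BoundingBox(
            # The second box is degenerate (X2 <= X1), so SanitizeBoundingBox should drop it
            # together with its label.
            torch.tensor([[10, 10, 100, 100], [50, 50, 40, 60]]),
            format=datapoints.BoundingBoxFormat.XYXY,
            spatial_size=(H, W),  # assumed 0.15 constructor keyword
        ),
        "labels": torch.tensor([1, 2]),
    }

    pipeline = v2.Compose(
        [
            v2.ConvertDtype(torch.float32),  # uint8 [0, 255] -> float32 [0, 1]; leaves boxes untouched
            v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
            v2.ClampBoundingBox(),  # clamp to the boxes' spatial_size first, as the docstring suggests
            v2.SanitizeBoundingBox(labels_getter="labels"),  # drop degenerate boxes and their labels
        ]
    )

    out = pipeline(sample)
    print(out["boxes"].shape, out["labels"])  # only the valid box and its label remain

Placing ``SanitizeBoundingBox`` last follows the recommendation in its docstring; the ``labels_getter="labels"`` argument is only needed here for explicitness, since the dict layout already matches the default heuristic.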