More rotated bboxes transforms #9095

Merged · 8 commits · Jun 6, 2025
23 changes: 15 additions & 8 deletions test/common_utils.py
@@ -417,6 +417,13 @@ def sample_position(values, max_value):
format = tv_tensors.BoundingBoxFormat[format]

dtype = dtype or torch.float32
int_dtype = dtype in (
torch.uint8,
torch.int8,
torch.int16,
torch.int32,
torch.int64,
)

h, w = (torch.randint(1, s, (num_boxes,)) for s in canvas_size)
y = sample_position(h, canvas_size[0])
Expand All @@ -443,17 +450,17 @@ def sample_position(values, max_value):
elif format is tv_tensors.BoundingBoxFormat.XYXYXYXY:
r_rad = r * torch.pi / 180.0
cos, sin = torch.cos(r_rad), torch.sin(r_rad)
x1, y1 = x, y
x2 = x1 + w * cos
y2 = y1 - w * sin
x3 = x2 + h * sin
y3 = y2 + h * cos
x4 = x1 + h * sin
y4 = y1 + h * cos
x1 = torch.round(x) if int_dtype else x
y1 = torch.round(y) if int_dtype else y
x2 = torch.round(x1 + w * cos) if int_dtype else x1 + w * cos
y2 = torch.round(y1 - w * sin) if int_dtype else y1 - w * sin
x3 = torch.round(x2 + h * sin) if int_dtype else x2 + h * sin
y3 = torch.round(y2 + h * cos) if int_dtype else y2 + h * cos
x4 = torch.round(x1 + h * sin) if int_dtype else x1 + h * sin
y4 = torch.round(y1 + h * cos) if int_dtype else y1 + h * cos
parts = (x1, y1, x2, y2, x3, y3, x4, y4)
else:
raise ValueError(f"Format {format} is not supported")

return tv_tensors.BoundingBoxes(
torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, canvas_size=canvas_size
)
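
The rounding above keeps integer-dtype rotated boxes self-consistent: each corner is rounded before the final cast, so casting to e.g. int64 never truncates an almost-integral coordinate. A minimal usage sketch (the canvas size and dtype are illustrative; the import assumes torchvision's test suite is on the path):

import torch
from torchvision import tv_tensors
from common_utils import make_bounding_boxes  # torchvision test helper

# With an integer dtype, the XYXYXYXY corners are rounded step by step
# before the final `.to(dtype=...)` cast, so the cast cannot truncate a
# coordinate like 3.9999 down to 3.
boxes = make_bounding_boxes(
    (32, 32),  # canvas_size
    format=tv_tensors.BoundingBoxFormat.XYXYXYXY,
    dtype=torch.int64,
)
print(boxes.dtype)  # torch.int64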
152 changes: 114 additions & 38 deletions test/test_transforms_v2.py
@@ -49,7 +49,7 @@
from torchvision.transforms.functional import pil_modes_mapping, to_pil_image
from torchvision.transforms.v2 import functional as F
from torchvision.transforms.v2._utils import check_type, is_pure_tensor
from torchvision.transforms.v2.functional._geometry import _get_perspective_coeffs
from torchvision.transforms.v2.functional._geometry import _get_perspective_coeffs, _parallelogram_to_bounding_boxes
from torchvision.transforms.v2.functional._utils import _get_kernel, _register_kernel_internal


@@ -560,7 +560,9 @@ def affine_bounding_boxes(bounding_boxes):
)


def reference_affine_rotated_bounding_boxes_helper(bounding_boxes, *, affine_matrix, new_canvas_size=None, clamp=True):
def reference_affine_rotated_bounding_boxes_helper(
bounding_boxes, *, affine_matrix, new_canvas_size=None, clamp=True, flip=False
NicolasHug (Member) commented:
Can you help me understand why we need a flip parameter in reference_affine_rotated_bounding_boxes_helper, but not in reference_affine_bounding_boxes_helper?

It seems that in reference_affine_bounding_boxes_helper, the flip is done through the affine_matrix? Is this something we can do for the rotated case as well? If not, it might be worth adding a comment to explain why.

The PR author (Member) replied:
@NicolasHug, this is a good question. As illustrated in the image below, when we apply the flip operation we actually transform each point (x1, y1), (x2, y2), (x3, y3), (x4, y4) of the bounding box. However, the flip typically changes the order of the points when they are traversed clockwise. Given the definitions of "XYXYXYXY", "CXCYWHR", and "XYWHR", we do want to traverse (x1, y1), (x2, y2), (x3, y3), (x4, y4) in clockwise order, which is why we switch the points (x2, y2) and (x4, y4). This typically does not happen for non-rotated boxes, as we re-assign the points with min and max operations here. I hope that makes sense.

[image: diagram of a rotated box's corner order before and after a flip]
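
To make the reordering concrete, here is a minimal standalone sketch (the canvas width of 5 and the box coordinates are illustrative, not taken from the PR):

import torch

# Corners of an axis-aligned box, clockwise in image coordinates (y down):
# 1=(0,0), 2=(4,0), 3=(4,2), 4=(0,2)
box = torch.tensor([0.0, 0.0, 4.0, 0.0, 4.0, 2.0, 0.0, 2.0])

# Horizontal flip on a canvas of width 5: x -> 5 - x
flipped = box.clone()
flipped[0::2] = 5 - flipped[0::2]
# flipped = [5, 0, 1, 0, 1, 2, 5, 2]: traversing 1 -> 2 -> 3 -> 4 is now
# counter-clockwise, violating the XYXYXYXY convention.

# The index permutation used in the helper restores a clockwise traversal:
restored = flipped[[2, 3, 0, 1, 6, 7, 4, 5]]
print(restored)  # tensor([1., 0., 5., 0., 5., 2., 1., 2.])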

):
format = bounding_boxes.format
canvas_size = new_canvas_size or bounding_boxes.canvas_size

@@ -588,21 +590,34 @@ def affine_rotated_bounding_boxes(bounding_boxes):
transformed_points = np.matmul(points, affine_matrix.astype(points.dtype).T)
output = torch.tensor(
[
float(transformed_points[1, 0]),
float(transformed_points[1, 1]),
float(transformed_points[0, 0]),
float(transformed_points[0, 1]),
float(transformed_points[3, 0]),
float(transformed_points[3, 1]),
float(transformed_points[1, 0]),
float(transformed_points[1, 1]),
float(transformed_points[2, 0]),
float(transformed_points[2, 1]),
float(transformed_points[3, 0]),
float(transformed_points[3, 1]),
]
)

output = output[[2, 3, 0, 1, 6, 7, 4, 5]] if flip else output
output = _parallelogram_to_bounding_boxes(output)

output = F.convert_bounding_box_format(
output, old_format=tv_tensors.BoundingBoxFormat.XYXYXYXY, new_format=format
)

if torch.is_floating_point(output) and dtype in (
torch.uint8,
torch.int8,
torch.int16,
torch.int32,
torch.int64,
):
# it is better to round before cast
output = torch.round(output)

if clamp:
# It is important to clamp before casting, especially for CXCYWHR format, dtype=int64
output = F.clamp_bounding_boxes(
@@ -707,7 +722,7 @@ def test_kernel_image(self, size, interpolation, use_max_size, antialias, dtype,
check_scripted_vs_eager=not isinstance(size, int),
)

@pytest.mark.parametrize("format", SUPPORTED_BOX_FORMATS)
@pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
@pytest.mark.parametrize("size", OUTPUT_SIZES)
@pytest.mark.parametrize("use_max_size", [True, False])
@pytest.mark.parametrize("dtype", [torch.float32, torch.int64])
@@ -725,6 +740,7 @@ def test_kernel_bounding_boxes(self, format, size, use_max_size, dtype, device):
check_kernel(
F.resize_bounding_boxes,
bounding_boxes,
format=format,
canvas_size=bounding_boxes.canvas_size,
size=size,
**max_size_kwarg,
@@ -816,7 +832,7 @@ def test_image_correctness(self, size, interpolation, use_max_size, fn):
self._check_output_size(image, actual, size=size, **max_size_kwarg)
torch.testing.assert_close(actual, expected, atol=1, rtol=0)

def _reference_resize_bounding_boxes(self, bounding_boxes, *, size, max_size=None):
def _reference_resize_bounding_boxes(self, bounding_boxes, format, *, size, max_size=None):
old_height, old_width = bounding_boxes.canvas_size
new_height, new_width = self._compute_output_size(
input_size=bounding_boxes.canvas_size, size=size, max_size=max_size
@@ -832,13 +848,19 @@ def _reference_resize_bounding_boxes(self, bounding_boxes, *, size, max_size=None):
],
)

return reference_affine_bounding_boxes_helper(
helper = (
reference_affine_rotated_bounding_boxes_helper
if tv_tensors.is_rotated_bounding_format(bounding_boxes.format)
else reference_affine_bounding_boxes_helper
)

return helper(
bounding_boxes,
affine_matrix=affine_matrix,
new_canvas_size=(new_height, new_width),
)

@pytest.mark.parametrize("format", SUPPORTED_BOX_FORMATS)
@pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
@pytest.mark.parametrize("size", OUTPUT_SIZES)
@pytest.mark.parametrize("use_max_size", [True, False])
@pytest.mark.parametrize("fn", [F.resize, transform_cls_to_functional(transforms.Resize)])
@@ -849,7 +871,7 @@ def test_bounding_boxes_correctness(self, format, size, use_max_size, fn):
bounding_boxes = make_bounding_boxes(format=format, canvas_size=self.INPUT_SIZE)

actual = fn(bounding_boxes, size=size, **max_size_kwarg)
expected = self._reference_resize_bounding_boxes(bounding_boxes, size=size, **max_size_kwarg)
expected = self._reference_resize_bounding_boxes(bounding_boxes, format=format, size=size, **max_size_kwarg)

self._check_output_size(bounding_boxes, actual, size=size, **max_size_kwarg)
torch.testing.assert_close(actual, expected)
@@ -1152,7 +1174,7 @@ def _reference_horizontal_flip_bounding_boxes(self, bounding_boxes: tv_tensors.BoundingBoxes):
)

helper = (
reference_affine_rotated_bounding_boxes_helper
functools.partial(reference_affine_rotated_bounding_boxes_helper, flip=True)
if tv_tensors.is_rotated_bounding_format(bounding_boxes.format)
else reference_affine_bounding_boxes_helper
)
@@ -1257,7 +1279,7 @@ def test_kernel_image(self, param, value, dtype, device):
shear=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["shear"],
center=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["center"],
)
@pytest.mark.parametrize("format", SUPPORTED_BOX_FORMATS)
@pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
@pytest.mark.parametrize("dtype", [torch.float32, torch.int64])
@pytest.mark.parametrize("device", cpu_and_cuda())
def test_kernel_bounding_boxes(self, param, value, format, dtype, device):
@@ -1399,14 +1421,22 @@ def _reference_affine_bounding_boxes(self, bounding_boxes, *, angle, translate,
if center is None:
center = [s * 0.5 for s in bounding_boxes.canvas_size[::-1]]

return reference_affine_bounding_boxes_helper(
affine_matrix = self._compute_affine_matrix(
angle=angle, translate=translate, scale=scale, shear=shear, center=center
)

helper = (
reference_affine_rotated_bounding_boxes_helper
if tv_tensors.is_rotated_bounding_format(bounding_boxes.format)
else reference_affine_bounding_boxes_helper
)

return helper(
bounding_boxes,
affine_matrix=self._compute_affine_matrix(
angle=angle, translate=translate, scale=scale, shear=shear, center=center
),
affine_matrix=affine_matrix,
)

@pytest.mark.parametrize("format", SUPPORTED_BOX_FORMATS)
@pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
@pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"])
@pytest.mark.parametrize("translate", _CORRECTNESS_AFFINE_KWARGS["translate"])
@pytest.mark.parametrize("scale", _CORRECTNESS_AFFINE_KWARGS["scale"])
@@ -1607,7 +1637,7 @@ def _reference_vertical_flip_bounding_boxes(self, bounding_boxes: tv_tensors.BoundingBoxes):
)

helper = (
reference_affine_rotated_bounding_boxes_helper
functools.partial(reference_affine_rotated_bounding_boxes_helper, flip=True)
if tv_tensors.is_rotated_bounding_format(bounding_boxes.format)
else reference_affine_bounding_boxes_helper
)
@@ -2914,7 +2944,7 @@ def test_kernel_image(self, kwargs, dtype, device):
check_kernel(F.crop_image, make_image(self.INPUT_SIZE, dtype=dtype, device=device), **kwargs)

@pytest.mark.parametrize("kwargs", CORRECTNESS_CROP_KWARGS)
@pytest.mark.parametrize("format", SUPPORTED_BOX_FORMATS)
@pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
@pytest.mark.parametrize("dtype", [torch.float32, torch.int64])
@pytest.mark.parametrize("device", cpu_and_cuda())
def test_kernel_bounding_box(self, kwargs, format, dtype, device):
@@ -3059,12 +3089,15 @@ def _reference_crop_bounding_boxes(self, bounding_boxes, *, top, left, height, width):
[0, 1, -top],
],
)
return reference_affine_bounding_boxes_helper(
bounding_boxes, affine_matrix=affine_matrix, new_canvas_size=(height, width)
helper = (
reference_affine_rotated_bounding_boxes_helper
if tv_tensors.is_rotated_bounding_format(bounding_boxes.format)
else reference_affine_bounding_boxes_helper
)
return helper(bounding_boxes, affine_matrix=affine_matrix, new_canvas_size=(height, width))

@pytest.mark.parametrize("kwargs", CORRECTNESS_CROP_KWARGS)
@pytest.mark.parametrize("format", SUPPORTED_BOX_FORMATS)
@pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
@pytest.mark.parametrize("dtype", [torch.float32, torch.int64])
@pytest.mark.parametrize("device", cpu_and_cuda())
def test_functional_bounding_box_correctness(self, kwargs, format, dtype, device):
@@ -3077,7 +3110,7 @@ def test_functional_bounding_box_correctness(self, kwargs, format, dtype, device):
assert_equal(F.get_size(actual), F.get_size(expected))

@pytest.mark.parametrize("output_size", [(17, 11), (11, 17), (11, 11)])
@pytest.mark.parametrize("format", SUPPORTED_BOX_FORMATS)
@pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
@pytest.mark.parametrize("dtype", [torch.float32, torch.int64])
@pytest.mark.parametrize("device", cpu_and_cuda())
@pytest.mark.parametrize("seed", list(range(5)))
@@ -3099,7 +3132,7 @@ def test_transform_bounding_boxes_correctness(self, output_size, format, dtype,

expected = self._reference_crop_bounding_boxes(bounding_boxes, **params)

assert_equal(actual, expected)
torch.testing.assert_close(actual, expected)
assert_equal(F.get_size(actual), F.get_size(expected))

def test_errors(self):
@@ -3834,13 +3867,19 @@ def _reference_resized_crop_bounding_boxes(self, bounding_boxes, *, top, left, h
)
affine_matrix = (resize_affine_matrix @ crop_affine_matrix)[:2, :]

return reference_affine_bounding_boxes_helper(
helper = (
reference_affine_rotated_bounding_boxes_helper
if tv_tensors.is_rotated_bounding_format(bounding_boxes.format)
else reference_affine_bounding_boxes_helper
)

return helper(
bounding_boxes,
affine_matrix=affine_matrix,
new_canvas_size=size,
)

@pytest.mark.parametrize("format", SUPPORTED_BOX_FORMATS)
@pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
def test_functional_bounding_boxes_correctness(self, format):
bounding_boxes = make_bounding_boxes(self.INPUT_SIZE, format=format)

@@ -3849,7 +3888,7 @@ def test_functional_bounding_boxes_correctness(self, format):
bounding_boxes, **self.CROP_KWARGS, size=self.OUTPUT_SIZE
)

assert_equal(actual, expected)
torch.testing.assert_close(actual, expected)
assert_equal(F.get_size(actual), F.get_size(expected))

def test_transform_errors_warnings(self):
@@ -3914,7 +3953,7 @@ def test_kernel_image(self, param, value, dtype, device):
),
)

@pytest.mark.parametrize("format", SUPPORTED_BOX_FORMATS)
@pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
def test_kernel_bounding_boxes(self, format):
bounding_boxes = make_bounding_boxes(format=format)
check_kernel(
@@ -4034,12 +4073,15 @@ def _reference_pad_bounding_boxes(self, bounding_boxes, *, padding):
height = bounding_boxes.canvas_size[0] + top + bottom
width = bounding_boxes.canvas_size[1] + left + right

return reference_affine_bounding_boxes_helper(
bounding_boxes, affine_matrix=affine_matrix, new_canvas_size=(height, width)
helper = (
reference_affine_rotated_bounding_boxes_helper
if tv_tensors.is_rotated_bounding_format(bounding_boxes.format)
else reference_affine_bounding_boxes_helper
)
return helper(bounding_boxes, affine_matrix=affine_matrix, new_canvas_size=(height, width))

@pytest.mark.parametrize("padding", CORRECTNESS_PADDINGS)
@pytest.mark.parametrize("format", SUPPORTED_BOX_FORMATS)
@pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
@pytest.mark.parametrize("dtype", [torch.int64, torch.float32])
@pytest.mark.parametrize("device", cpu_and_cuda())
@pytest.mark.parametrize("fn", [F.pad, transform_cls_to_functional(transforms.Pad)])
Expand All @@ -4049,7 +4091,7 @@ def test_bounding_boxes_correctness(self, padding, format, dtype, device, fn):
actual = fn(bounding_boxes, padding=padding)
expected = self._reference_pad_bounding_boxes(bounding_boxes, padding=padding)

assert_equal(actual, expected)
torch.testing.assert_close(actual, expected)


class TestCenterCrop:
@@ -4068,7 +4110,7 @@ def test_kernel_image(self, output_size, dtype, device):
)

@pytest.mark.parametrize("output_size", OUTPUT_SIZES)
@pytest.mark.parametrize("format", SUPPORTED_BOX_FORMATS)
@pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
def test_kernel_bounding_boxes(self, output_size, format):
bounding_boxes = make_bounding_boxes(self.INPUT_SIZE, format=format)
check_kernel(
@@ -4142,12 +4184,15 @@ def _reference_center_crop_bounding_boxes(self, bounding_boxes, output_size):
[0, 1, -top],
],
)
return reference_affine_bounding_boxes_helper(
bounding_boxes, affine_matrix=affine_matrix, new_canvas_size=output_size
helper = (
reference_affine_rotated_bounding_boxes_helper
if tv_tensors.is_rotated_bounding_format(bounding_boxes.format)
else reference_affine_bounding_boxes_helper
)
return helper(bounding_boxes, affine_matrix=affine_matrix, new_canvas_size=output_size)

@pytest.mark.parametrize("output_size", OUTPUT_SIZES)
@pytest.mark.parametrize("format", SUPPORTED_BOX_FORMATS)
@pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
@pytest.mark.parametrize("dtype", [torch.int64, torch.float32])
@pytest.mark.parametrize("device", cpu_and_cuda())
@pytest.mark.parametrize("fn", [F.center_crop, transform_cls_to_functional(transforms.CenterCrop)])
@@ -4157,7 +4202,7 @@ def test_bounding_boxes_correctness(self, output_size, format, dtype, device, fn):
actual = fn(bounding_boxes, output_size)
expected = self._reference_center_crop_bounding_boxes(bounding_boxes, output_size)

assert_equal(actual, expected)
torch.testing.assert_close(actual, expected)


class TestPerspective:
@@ -5894,6 +5939,37 @@ def test_classification_preset(image_type, label_type, dataset_return_type, to_tensor):
assert out_label == label


@pytest.mark.parametrize("input_size", [(17, 11), (11, 17), (11, 11)])
@pytest.mark.parametrize("dtype", [torch.float32, torch.int64])
@pytest.mark.parametrize("device", cpu_and_cuda())
def test_parallelogram_to_bounding_boxes(input_size, dtype, device):
# Assert that applying `_parallelogram_to_bounding_boxes` to rotated boxes
# does not modify the input.
bounding_boxes = make_bounding_boxes(
input_size, format=tv_tensors.BoundingBoxFormat.XYXYXYXY, dtype=dtype, device=device
)
actual = _parallelogram_to_bounding_boxes(bounding_boxes)
torch.testing.assert_close(actual, bounding_boxes, rtol=0, atol=1)

# Test the transformation of two simple parallelograms.
#  1---2      1----2
# /   /  ->   |    |
# 4---3       4----3

# 1---2       1----2
#  \   \  ->  |    |
#   4---3     4----3
parallelogram = torch.tensor([[1, 0, 4, 0, 3, 2, 0, 2], [0, 0, 3, 0, 4, 2, 1, 2]])
expected = torch.tensor(
[
[0, 0, 4, 0, 4, 2, 0, 2],
[0, 0, 4, 0, 4, 2, 0, 2],
]
)
actual = _parallelogram_to_bounding_boxes(parallelogram)
assert_equal(actual, expected)


@pytest.mark.parametrize("image_type", (PIL.Image, torch.Tensor, tv_tensors.Image))
@pytest.mark.parametrize("data_augmentation", ("hflip", "lsj", "multiscale", "ssd", "ssdlite"))
@pytest.mark.parametrize("to_tensor", (transforms.ToTensor, transforms.ToImage))