Skip to content

Commit 602e8ca

Browse files
pmeier and vfdev-5 authored
clamp bounding boxes in some geometry kernels (#7215)
Co-authored-by: vfdev-5 <[email protected]>
1 parent 6af6bf4 commit 602e8ca

File tree

8 files changed

+189
-89
lines changed

8 files changed

+189
-89
lines changed

test/prototype_transforms_kernel_infos.py

Lines changed: 93 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -108,7 +108,7 @@ def float32_vs_uint8_pixel_difference(atol=1, mae=False):
108108
}
109109

110110

111-
def scripted_vs_eager_double_pixel_difference(device, atol=1e-6, rtol=1e-6):
111+
def scripted_vs_eager_float64_tolerances(device, atol=1e-6, rtol=1e-6):
112112
return {
113113
(("TestKernels", "test_scripted_vs_eager"), torch.float64, device): {"atol": atol, "rtol": rtol, "mae": False},
114114
}
@@ -211,10 +211,12 @@ def reference_horizontal_flip_bounding_box(bounding_box, *, format, spatial_size
211211
[-1, 0, spatial_size[1]],
212212
[0, 1, 0],
213213
],
214-
dtype="float32",
214+
dtype="float64" if bounding_box.dtype == torch.float64 else "float32",
215215
)
216216

217-
expected_bboxes = reference_affine_bounding_box_helper(bounding_box, format=format, affine_matrix=affine_matrix)
217+
expected_bboxes = reference_affine_bounding_box_helper(
218+
bounding_box, format=format, spatial_size=spatial_size, affine_matrix=affine_matrix
219+
)
218220

219221
return expected_bboxes
220222

@@ -322,7 +324,7 @@ def reference_inputs_resize_image_tensor():
322324
def sample_inputs_resize_bounding_box():
323325
for bounding_box_loader in make_bounding_box_loaders():
324326
for size in _get_resize_sizes(bounding_box_loader.spatial_size):
325-
yield ArgsKwargs(bounding_box_loader, size=size, spatial_size=bounding_box_loader.spatial_size)
327+
yield ArgsKwargs(bounding_box_loader, spatial_size=bounding_box_loader.spatial_size, size=size)
326328

327329

328330
def sample_inputs_resize_mask():
@@ -344,19 +346,20 @@ def reference_resize_bounding_box(bounding_box, *, spatial_size, size, max_size=
344346
[new_width / old_width, 0, 0],
345347
[0, new_height / old_height, 0],
346348
],
347-
dtype="float32",
349+
dtype="float64" if bounding_box.dtype == torch.float64 else "float32",
348350
)
349351

350352
expected_bboxes = reference_affine_bounding_box_helper(
351-
bounding_box, format=datapoints.BoundingBoxFormat.XYXY, affine_matrix=affine_matrix
353+
bounding_box,
354+
format=bounding_box.format,
355+
spatial_size=(new_height, new_width),
356+
affine_matrix=affine_matrix,
352357
)
353358
return expected_bboxes, (new_height, new_width)
354359

355360

356361
def reference_inputs_resize_bounding_box():
357-
for bounding_box_loader in make_bounding_box_loaders(
358-
formats=[datapoints.BoundingBoxFormat.XYXY], extra_dims=((), (4,))
359-
):
362+
for bounding_box_loader in make_bounding_box_loaders(extra_dims=((), (4,))):
360363
for size in _get_resize_sizes(bounding_box_loader.spatial_size):
361364
yield ArgsKwargs(bounding_box_loader, size=size, spatial_size=bounding_box_loader.spatial_size)
362365

@@ -543,14 +546,17 @@ def _compute_affine_matrix(angle, translate, scale, shear, center):
543546
return true_matrix
544547

545548

546-
def reference_affine_bounding_box_helper(bounding_box, *, format, affine_matrix):
547-
def transform(bbox, affine_matrix_, format_):
549+
def reference_affine_bounding_box_helper(bounding_box, *, format, spatial_size, affine_matrix):
550+
def transform(bbox, affine_matrix_, format_, spatial_size_):
548551
# Go to float before converting to prevent precision loss in case of CXCYWH -> XYXY and W or H is 1
549552
in_dtype = bbox.dtype
550553
if not torch.is_floating_point(bbox):
551554
bbox = bbox.float()
552555
bbox_xyxy = F.convert_format_bounding_box(
553-
bbox, old_format=format_, new_format=datapoints.BoundingBoxFormat.XYXY, inplace=True
556+
bbox.as_subclass(torch.Tensor),
557+
old_format=format_,
558+
new_format=datapoints.BoundingBoxFormat.XYXY,
559+
inplace=True,
554560
)
555561
points = np.array(
556562
[
@@ -573,12 +579,15 @@ def transform(bbox, affine_matrix_, format_):
573579
out_bbox = F.convert_format_bounding_box(
574580
out_bbox, old_format=datapoints.BoundingBoxFormat.XYXY, new_format=format_, inplace=True
575581
)
576-
return out_bbox.to(dtype=in_dtype)
582+
# It is important to clamp before casting, especially for CXCYWH format, dtype=int64
583+
out_bbox = F.clamp_bounding_box(out_bbox, format=format_, spatial_size=spatial_size_)
584+
out_bbox = out_bbox.to(dtype=in_dtype)
585+
return out_bbox
577586

578587
if bounding_box.ndim < 2:
579588
bounding_box = [bounding_box]
580589

581-
expected_bboxes = [transform(bbox, affine_matrix, format) for bbox in bounding_box]
590+
expected_bboxes = [transform(bbox, affine_matrix, format, spatial_size) for bbox in bounding_box]
582591
if len(expected_bboxes) > 1:
583592
expected_bboxes = torch.stack(expected_bboxes)
584593
else:
@@ -594,7 +603,9 @@ def reference_affine_bounding_box(bounding_box, *, format, spatial_size, angle,
594603
affine_matrix = _compute_affine_matrix(angle, translate, scale, shear, center)
595604
affine_matrix = affine_matrix[:2, :]
596605

597-
expected_bboxes = reference_affine_bounding_box_helper(bounding_box, format=format, affine_matrix=affine_matrix)
606+
expected_bboxes = reference_affine_bounding_box_helper(
607+
bounding_box, format=format, spatial_size=spatial_size, affine_matrix=affine_matrix
608+
)
598609

599610
return expected_bboxes
600611

@@ -643,9 +654,6 @@ def sample_inputs_affine_video():
643654
sample_inputs_fn=sample_inputs_affine_bounding_box,
644655
reference_fn=reference_affine_bounding_box,
645656
reference_inputs_fn=reference_inputs_affine_bounding_box,
646-
closeness_kwargs={
647-
(("TestKernels", "test_against_reference"), torch.int64, "cpu"): dict(atol=1, rtol=0),
648-
},
649657
test_marks=[
650658
xfail_jit_python_scalar_arg("shear"),
651659
],
@@ -729,10 +737,12 @@ def reference_vertical_flip_bounding_box(bounding_box, *, format, spatial_size):
729737
[1, 0, 0],
730738
[0, -1, spatial_size[0]],
731739
],
732-
dtype="float32",
740+
dtype="float64" if bounding_box.dtype == torch.float64 else "float32",
733741
)
734742

735-
expected_bboxes = reference_affine_bounding_box_helper(bounding_box, format=format, affine_matrix=affine_matrix)
743+
expected_bboxes = reference_affine_bounding_box_helper(
744+
bounding_box, format=format, spatial_size=spatial_size, affine_matrix=affine_matrix
745+
)
736746

737747
return expected_bboxes
738748

@@ -806,6 +816,43 @@ def sample_inputs_rotate_bounding_box():
806816
)
807817

808818

819+
def reference_inputs_rotate_bounding_box():
820+
for bounding_box_loader, angle in itertools.product(
821+
make_bounding_box_loaders(extra_dims=((), (4,))), _ROTATE_ANGLES
822+
):
823+
yield ArgsKwargs(
824+
bounding_box_loader,
825+
format=bounding_box_loader.format,
826+
spatial_size=bounding_box_loader.spatial_size,
827+
angle=angle,
828+
)
829+
830+
# TODO: add samples with expand=True and center
831+
832+
833+
def reference_rotate_bounding_box(bounding_box, *, format, spatial_size, angle, expand=False, center=None):
834+
835+
if center is None:
836+
center = [spatial_size[1] * 0.5, spatial_size[0] * 0.5]
837+
838+
a = np.cos(angle * np.pi / 180.0)
839+
b = np.sin(angle * np.pi / 180.0)
840+
cx = center[0]
841+
cy = center[1]
842+
affine_matrix = np.array(
843+
[
844+
[a, b, cx - cx * a - b * cy],
845+
[-b, a, cy + cx * b - a * cy],
846+
],
847+
dtype="float64" if bounding_box.dtype == torch.float64 else "float32",
848+
)
849+
850+
expected_bboxes = reference_affine_bounding_box_helper(
851+
bounding_box, format=format, spatial_size=spatial_size, affine_matrix=affine_matrix
852+
)
853+
return expected_bboxes, spatial_size
854+
855+
809856
def sample_inputs_rotate_mask():
810857
for mask_loader in make_mask_loaders(sizes=["random"], num_categories=["random"], num_objects=["random"]):
811858
yield ArgsKwargs(mask_loader, angle=15.0)
@@ -834,9 +881,11 @@ def sample_inputs_rotate_video():
834881
KernelInfo(
835882
F.rotate_bounding_box,
836883
sample_inputs_fn=sample_inputs_rotate_bounding_box,
884+
reference_fn=reference_rotate_bounding_box,
885+
reference_inputs_fn=reference_inputs_rotate_bounding_box,
837886
closeness_kwargs={
838-
**scripted_vs_eager_double_pixel_difference("cpu", atol=1e-5, rtol=1e-5),
839-
**scripted_vs_eager_double_pixel_difference("cuda", atol=1e-5, rtol=1e-5),
887+
**scripted_vs_eager_float64_tolerances("cpu", atol=1e-6, rtol=1e-6),
888+
**scripted_vs_eager_float64_tolerances("cuda", atol=1e-5, rtol=1e-5),
840889
},
841890
),
842891
KernelInfo(
@@ -897,17 +946,19 @@ def sample_inputs_crop_video():
897946

898947

899948
def reference_crop_bounding_box(bounding_box, *, format, top, left, height, width):
900-
901949
affine_matrix = np.array(
902950
[
903951
[1, 0, -left],
904952
[0, 1, -top],
905953
],
906-
dtype="float32",
954+
dtype="float64" if bounding_box.dtype == torch.float64 else "float32",
907955
)
908956

909-
expected_bboxes = reference_affine_bounding_box_helper(bounding_box, format=format, affine_matrix=affine_matrix)
910-
return expected_bboxes, (height, width)
957+
spatial_size = (height, width)
958+
expected_bboxes = reference_affine_bounding_box_helper(
959+
bounding_box, format=format, spatial_size=spatial_size, affine_matrix=affine_matrix
960+
)
961+
return expected_bboxes, spatial_size
911962

912963

913964
def reference_inputs_crop_bounding_box():
@@ -1119,13 +1170,15 @@ def reference_pad_bounding_box(bounding_box, *, format, spatial_size, padding, p
11191170
[1, 0, left],
11201171
[0, 1, top],
11211172
],
1122-
dtype="float32",
1173+
dtype="float64" if bounding_box.dtype == torch.float64 else "float32",
11231174
)
11241175

11251176
height = spatial_size[0] + top + bottom
11261177
width = spatial_size[1] + left + right
11271178

1128-
expected_bboxes = reference_affine_bounding_box_helper(bounding_box, format=format, affine_matrix=affine_matrix)
1179+
expected_bboxes = reference_affine_bounding_box_helper(
1180+
bounding_box, format=format, spatial_size=(height, width), affine_matrix=affine_matrix
1181+
)
11291182
return expected_bboxes, (height, width)
11301183

11311184

@@ -1225,14 +1278,16 @@ def sample_inputs_perspective_bounding_box():
12251278
yield ArgsKwargs(
12261279
bounding_box_loader,
12271280
format=bounding_box_loader.format,
1281+
spatial_size=bounding_box_loader.spatial_size,
12281282
startpoints=None,
12291283
endpoints=None,
12301284
coefficients=_PERSPECTIVE_COEFFS[0],
12311285
)
12321286

12331287
format = datapoints.BoundingBoxFormat.XYXY
1288+
loader = make_bounding_box_loader(format=format)
12341289
yield ArgsKwargs(
1235-
make_bounding_box_loader(format=format), format=format, startpoints=_STARTPOINTS, endpoints=_ENDPOINTS
1290+
loader, format=format, spatial_size=loader.spatial_size, startpoints=_STARTPOINTS, endpoints=_ENDPOINTS
12361291
)
12371292

12381293

@@ -1269,13 +1324,17 @@ def sample_inputs_perspective_video():
12691324
**pil_reference_pixel_difference(2, mae=True),
12701325
**cuda_vs_cpu_pixel_difference(),
12711326
**float32_vs_uint8_pixel_difference(),
1272-
**scripted_vs_eager_double_pixel_difference("cpu", atol=1e-5, rtol=1e-5),
1273-
**scripted_vs_eager_double_pixel_difference("cuda", atol=1e-5, rtol=1e-5),
1327+
**scripted_vs_eager_float64_tolerances("cpu", atol=1e-5, rtol=1e-5),
1328+
**scripted_vs_eager_float64_tolerances("cuda", atol=1e-5, rtol=1e-5),
12741329
},
12751330
),
12761331
KernelInfo(
12771332
F.perspective_bounding_box,
12781333
sample_inputs_fn=sample_inputs_perspective_bounding_box,
1334+
closeness_kwargs={
1335+
**scripted_vs_eager_float64_tolerances("cpu", atol=1e-6, rtol=1e-6),
1336+
**scripted_vs_eager_float64_tolerances("cuda", atol=1e-6, rtol=1e-6),
1337+
},
12791338
),
12801339
KernelInfo(
12811340
F.perspective_mask,
@@ -1292,8 +1351,8 @@ def sample_inputs_perspective_video():
12921351
sample_inputs_fn=sample_inputs_perspective_video,
12931352
closeness_kwargs={
12941353
**cuda_vs_cpu_pixel_difference(),
1295-
**scripted_vs_eager_double_pixel_difference("cpu", atol=1e-5, rtol=1e-5),
1296-
**scripted_vs_eager_double_pixel_difference("cuda", atol=1e-5, rtol=1e-5),
1354+
**scripted_vs_eager_float64_tolerances("cpu", atol=1e-5, rtol=1e-5),
1355+
**scripted_vs_eager_float64_tolerances("cuda", atol=1e-5, rtol=1e-5),
12971356
},
12981357
),
12991358
]
@@ -1331,6 +1390,7 @@ def sample_inputs_elastic_bounding_box():
13311390
yield ArgsKwargs(
13321391
bounding_box_loader,
13331392
format=bounding_box_loader.format,
1393+
spatial_size=bounding_box_loader.spatial_size,
13341394
displacement=displacement,
13351395
)
13361396

test/test_prototype_transforms.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -146,7 +146,7 @@ class TestSmoke:
146146
(transforms.RandomZoomOut(p=1.0), None),
147147
(transforms.Resize([16, 16], antialias=True), None),
148148
(transforms.ScaleJitter((16, 16), scale_range=(0.8, 1.2)), None),
149-
(transforms.ClampBoundingBoxes(), None),
149+
(transforms.ClampBoundingBox(), None),
150150
(transforms.ConvertBoundingBoxFormat(datapoints.BoundingBoxFormat.CXCYWH), None),
151151
(transforms.ConvertDtype(), None),
152152
(transforms.GaussianBlur(kernel_size=3), None),

0 commit comments

Comments
 (0)