Resize relies on interpolate's native uint8 handling

vfdev-5 · vfdev-5 · commit 1744362b9eb2 · 2023-05-09T13:08:17.000+02:00
Description:
- Now that pytorch/pytorch#90771 is merged, let Resize() rely on interpolate()'s native uint8 handling instead of converting to and from float.
- uint8 input is no longer cast to f32 for nearest mode, nor for bilinear mode on CPU when the CPU supports AVX2.

Context: pytorch#7217

Benchmarks:
```
[----------- Resize cpu torch.uint8 InterpolationMode.NEAREST -----------]
                         |  resize v2  |  resize stable  |  resize nightly
1 threads: ---------------------------------------------------------------
      (3, 400, 400)      |     457     |       461       |        480
      (16, 3, 400, 400)  |    6870     |      6850       |      10100

Times are in microseconds (us).

[---------- Resize cpu torch.uint8 InterpolationMode.BILINEAR -----------]
                         |  resize v2  |  resize stable  |  resize nightly
1 threads: ---------------------------------------------------------------
      (3, 400, 400)      |     326     |       329       |        844
      (16, 3, 400, 400)  |    4380     |      4390       |      14800

Times are in microseconds (us).
```
[Source](https://gist.github.com/vfdev-5/a2e30ed50b5996807c9b09d5d33d8bc2)
diff --git a/torchvision/transforms/_functional_tensor.py b/torchvision/transforms/_functional_tensor.py
@@ -459,7 +459,17 @@ def resize(
         # now we don't as True is the default.
         antialias = False
 
-    img, need_cast, need_squeeze, out_dtype = _cast_squeeze_in(img, [torch.float32, torch.float64])
+    acceptable_dtypes = [torch.float32, torch.float64]
+    if interpolation in ["nearest", "nearest-exact"]:
+        # uint8 dtype can be included for cpu and cuda input if nearest mode
+        acceptable_dtypes.append(torch.uint8)
+    elif interpolation == "bilinear" and img.device.type == "cpu":
+        # uint8 dtype support for bilinear mode is limited to cpu and
+        # according to our benchmarks non-AVX CPUs should prefer u8->f32->interpolate->u8 path
+        if "AVX2" in torch.backends.cpu.get_cpu_capability():
+            acceptable_dtypes.append(torch.uint8)
+
+    img, need_cast, need_squeeze, out_dtype = _cast_squeeze_in(img, acceptable_dtypes)
 
     # Define align_corners to avoid warnings
     align_corners = False if interpolation in ["bilinear", "bicubic"] else None
diff --git a/torchvision/transforms/v2/functional/_geometry.py b/torchvision/transforms/v2/functional/_geometry.py
@@ -185,7 +185,17 @@ def resize_image_tensor(
         image = image.reshape(-1, num_channels, old_height, old_width)
 
         dtype = image.dtype
-        need_cast = dtype not in (torch.float32, torch.float64)
+        acceptable_dtypes = [torch.float32, torch.float64]
+        if interpolation in [InterpolationMode.NEAREST, InterpolationMode.NEAREST_EXACT]:
+            # uint8 dtype can be included for cpu and cuda input if nearest mode
+            acceptable_dtypes.append(torch.uint8)
+        elif interpolation == InterpolationMode.BILINEAR and image.device.type == "cpu":
+            # uint8 dtype support for bilinear mode is limited to cpu and
+            # according to our benchmarks non-AVX CPUs should prefer u8->f32->interpolate->u8 path
+            if "AVX2" in torch.backends.cpu.get_cpu_capability():
+                acceptable_dtypes.append(torch.uint8)
+
+        need_cast = dtype not in acceptable_dtypes
         if need_cast:
             image = image.to(dtype=torch.float32)