Unified inputs for grayscale op and transforms

vfdev-5 · vfdev-5 · commit 50da7be70743 · 2020-08-14T10:04:42.000+02:00
- deprecated F.to_grayscale in favor of F.rgb_to_grayscale
diff --git a/test/test_functional_tensor.py b/test/test_functional_tensor.py
@@ -1,5 +1,4 @@
 import unittest
-import random
 import colorsys
 import math
 
@@ -23,7 +22,10 @@ def _create_data(self, height=3, width=3, channels=3):
         return tensor, pil_img
 
     def compareTensorToPIL(self, tensor, pil_image, msg=None):
-        pil_tensor = torch.as_tensor(np.array(pil_image).transpose((2, 0, 1)))
+        np_pil_image = np.array(pil_image)
+        if np_pil_image.ndim == 2:
+            np_pil_image = np_pil_image[:, :, None]
+        pil_tensor = torch.as_tensor(np_pil_image.transpose((2, 0, 1)))
         if msg is None:
             msg = "tensor:\n{} \ndid not equal PIL tensor:\n{}".format(tensor, pil_tensor)
         self.assertTrue(tensor.equal(pil_tensor), msg)
@@ -187,17 +189,21 @@ def test_adjustments(self):
         scripted_fn(img)
 
     def test_rgb_to_grayscale(self):
-        script_rgb_to_grayscale = torch.jit.script(F_t.rgb_to_grayscale)
-        img_tensor = torch.randint(0, 255, (3, 16, 16), dtype=torch.uint8)
-        img_tensor_clone = img_tensor.clone()
-        grayscale_tensor = F_t.rgb_to_grayscale(img_tensor).to(int)
-        grayscale_pil_img = torch.tensor(np.array(F.to_grayscale(F.to_pil_image(img_tensor)))).to(int)
-        max_diff = (grayscale_tensor - grayscale_pil_img).abs().max()
-        self.assertLess(max_diff, 1.0001)
-        self.assertTrue(torch.equal(img_tensor, img_tensor_clone))
-        # scriptable function test
-        grayscale_script = script_rgb_to_grayscale(img_tensor).to(int)
-        self.assertTrue(torch.equal(grayscale_script, grayscale_tensor))
+        script_rgb_to_grayscale = torch.jit.script(F.rgb_to_grayscale)
+
+        img_tensor, pil_img = self._create_data(32, 34)
+
+        for num_output_channels in (3, 1):
+            gray_pil_image = F.rgb_to_grayscale(pil_img, num_output_channels=num_output_channels)
+            gray_tensor = F.rgb_to_grayscale(img_tensor, num_output_channels=num_output_channels)
+
+            # The channel dimension must be kept, even for single-channel output.
+            self.assertEqual(gray_tensor.shape[-3], num_output_channels)
+            # The tensor result has to match the PIL result exactly.
+            self.compareTensorToPIL(gray_tensor, gray_pil_image)
+
+            s_gray_tensor = script_rgb_to_grayscale(img_tensor, num_output_channels=num_output_channels)
+            self.assertTrue(s_gray_tensor.equal(gray_tensor))
 
     def test_center_crop(self):
         script_center_crop = torch.jit.script(F.center_crop)
diff --git a/test/test_transforms_tensor.py b/test/test_transforms_tensor.py
@@ -324,10 +324,10 @@ def test_random_perspective(self):
     def test_to_grayscale(self):
 
         fn_kwargs = meth_kwargs = {"num_output_channels": 1}
-        self._test_op("to_grayscale", "Grayscale", fn_kwargs=fn_kwargs, meth_kwargs=meth_kwargs)
+        self._test_op("rgb_to_grayscale", "Grayscale", fn_kwargs=fn_kwargs, meth_kwargs=meth_kwargs)
 
         fn_kwargs = meth_kwargs = {"num_output_channels": 3}
-        self._test_op("to_grayscale", "Grayscale", fn_kwargs=fn_kwargs, meth_kwargs=meth_kwargs)
+        self._test_op("rgb_to_grayscale", "Grayscale", fn_kwargs=fn_kwargs, meth_kwargs=meth_kwargs)
 
         meth_kwargs = {}
         self._test_class_op("RandomGrayscale", meth_kwargs=meth_kwargs)
diff --git a/torchvision/transforms/functional.py b/torchvision/transforms/functional.py
@@ -959,12 +959,39 @@ def affine(
 
 
 def to_grayscale(img: Tensor, num_output_channels: int = 1) -> Tensor:
-    """Convert image to grayscale version of image.
+    """DEPRECATED. Convert RGB image to grayscale version of image.
+    The image can be a PIL Image or a Tensor, in which case it is expected
+    to have [..., 3, H, W] shape, where ... means an arbitrary number of leading dimensions
+
+    .. warning::
+
+        This method is deprecated and will be removed in future releases.
+        Please, use ``F.rgb_to_grayscale`` instead.
+
+
+    Args:
+        img (PIL Image or Tensor): RGB Image to be converted to grayscale.
+        num_output_channels (int): number of channels of the output image. Value can be 1 or 3. Default, 1.
+
+    Returns:
+        PIL Image or Tensor: Grayscale version of the image.
+            if num_output_channels = 1 : returned image is single channel
+
+            if num_output_channels = 3 : returned image is 3 channel with r = g = b
+    """
+    warnings.warn("The use of the F.to_grayscale transform is deprecated, " +
+                  "please use F.rgb_to_grayscale instead.")
+
+    return rgb_to_grayscale(img, num_output_channels)
+
+
+def rgb_to_grayscale(img: Tensor, num_output_channels: int = 1) -> Tensor:
+    """Convert RGB image to grayscale version of image.
     The image can be a PIL Image or a Tensor, in which case it is expected
-    to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions
+    to have [..., 3, H, W] shape, where ... means an arbitrary number of leading dimensions
 
     Args:
-        img (PIL Image or Tensor): Image to be converted to grayscale.
+        img (PIL Image or Tensor): RGB Image to be converted to grayscale.
         num_output_channels (int): number of channels of the output image. Value can be 1 or 3. Default, 1.
 
     Returns:
@@ -974,9 +1001,9 @@ def to_grayscale(img: Tensor, num_output_channels: int = 1) -> Tensor:
             if num_output_channels = 3 : returned image is 3 channel with r = g = b
     """
     if not isinstance(img, torch.Tensor):
-        return F_pil.to_grayscale(img, num_output_channels)
+        return F_pil.rgb_to_grayscale(img, num_output_channels)
 
-    return F_t.to_grayscale(img, num_output_channels)
+    return F_t.rgb_to_grayscale(img, num_output_channels)
 
 
 def erase(img: Tensor, i: int, j: int, h: int, w: int, v: Tensor, inplace: bool = False) -> Tensor:
diff --git a/torchvision/transforms/functional_pil.py b/torchvision/transforms/functional_pil.py
@@ -1,4 +1,5 @@
 import numbers
+import warnings
 from typing import Any, List, Sequence
 
 import numpy as np
@@ -491,12 +492,31 @@ def perspective(img, perspective_coeffs, interpolation=Image.BICUBIC, fill=None)
 
 @torch.jit.unused
 def to_grayscale(img, num_output_channels):
-    """Convert image to grayscale version of image.
+    """DEPRECATED, use ``rgb_to_grayscale`` instead. Convert RGB image to grayscale version of image.
 
     Args:
         img (PIL Image): Image to be converted to grayscale.
         num_output_channels (int): number of channels of the output image. Value can be 1 or 3. Default, 1.
 
+    Returns:
+        PIL Image: Grayscale version of the image.
+            if num_output_channels = 1 : returned image is single channel
+
+            if num_output_channels = 3 : returned image is 3 channel with r = g = b
+    """
+    warnings.warn("The use of the F_pil.to_grayscale transform is deprecated, " +
+                  "please use F.rgb_to_grayscale instead.")
+    return rgb_to_grayscale(img, num_output_channels)
+
+
+@torch.jit.unused
+def rgb_to_grayscale(img, num_output_channels):
+    """Convert RGB image to grayscale version of image.
+
+    Args:
+        img (PIL Image): RGB Image to be converted to grayscale.
+        num_output_channels (int): number of channels of the output image. Value can be 1 or 3. Default, 1.
+
     Returns:
         PIL Image: Grayscale version of the image.
             if num_output_channels = 1 : returned image is single channel
diff --git a/torchvision/transforms/functional_tensor.py b/torchvision/transforms/functional_tensor.py
@@ -76,22 +76,47 @@ def crop(img: Tensor, top: int, left: int, height: int, width: int) -> Tensor:
     return img[..., top:top + height, left:left + width]
 
 
-def rgb_to_grayscale(img: Tensor) -> Tensor:
+def rgb_to_grayscale(img: Tensor, num_output_channels: int = 1) -> Tensor:
     """Convert the given RGB Image Tensor to Grayscale.
     For RGB to Grayscale conversion, ITU-R 601-2 luma transform is performed which
     is L = R * 0.2989 + G * 0.5870 + B * 0.1140
 
     Args:
-        img (Tensor): Image to be converted to Grayscale in the form [C, H, W].
+        img (Tensor): Image to be converted to Grayscale in the form [..., 3, H, W].
+        num_output_channels (int): number of channels of the output image. Value can be 1 or 3. Default, 1.
 
     Returns:
-        Tensor: Grayscale image.
+        Tensor: Grayscale version of the image.
+            if num_output_channels = 1 : returned image is single channel
+
+            if num_output_channels = 3 : returned image is 3 channel with r = g = b
 
     """
-    if img.shape[0] != 3:
-        raise TypeError('Input Image does not contain 3 Channels')
+    if img.ndim < 3:
+        raise TypeError("Input image tensor should have at least 3 dimensions, but found {}".format(img.ndim))
+    c = img.shape[-3]
+    if c != 3:
+        raise TypeError("Input image tensor should have 3 channels, but found {}".format(c))
 
-    return (0.2989 * img[0] + 0.5870 * img[1] + 0.1140 * img[2]).to(img.dtype)
+    if num_output_channels not in (1, 3):
+        raise ValueError('num_output_channels should be either 1 or 3')
+
+    r = img[..., 0, :, :].float()
+    g = img[..., 1, :, :].float()
+    b = img[..., 2, :, :].float()
+    # PIL docs give L = R * 299/1000 + G * 587/1000 + B * 114/1000, but the C code uses fixed point:
+    # ((rgb)[0]*19595 + (rgb)[1]*38470 + (rgb)[2]*7471 + 0x8000) >> 16
+    # https://github.com/python-pillow/Pillow/blob/4634eafe3c695a014267eefdce830b4a825beed7/
+    # src/libImaging/Convert.c#L47
+    # NOTE(review): floor() rounding presumes integer-valued pixels (e.g. uint8); confirm float [0, 1] inputs.
+    l_img = torch.floor((19595 * r + 38470 * g + 7471 * b + 2 ** 15) / 2 ** 16).to(img.dtype)
+
+    if num_output_channels == 3:
+        l_img = torch.stack([l_img, l_img, l_img], dim=-3)
+    else:
+        l_img = l_img.unsqueeze(dim=-3)
+
+    return l_img
 
 
 def adjust_brightness(img: Tensor, brightness_factor: float) -> Tensor:
@@ -893,39 +918,3 @@ def perspective(
     mode = _interpolation_modes[interpolation]
 
     return _apply_grid_transform(img, grid, mode)
-
-
-def to_grayscale(img: Tensor, num_output_channels: int = 1) -> Tensor:
-    """Convert image to grayscale version of image.
-
-    Args:
-        img (Tensor): Image to be converted to grayscale. We assume (..., 3, H, W) layout.
-        num_output_channels (int): number of channels of the output image. Value can be 1 or 3. Default, 1.
-
-    Returns:
-        Tensor: Grayscale version of the image.
-            if num_output_channels = 1 : returned image is single channel
-
-            if num_output_channels = 3 : returned image is 3 channel with r = g = b
-    """
-    if img.ndim < 3:
-        raise TypeError("Input image tensor should have at least 3 dimensions, but found {}".format(img.ndim))
-    c = img.shape[-3]
-    if c != 3:
-        raise TypeError("Input image tensor should 3 channels, but found {}".format(c))
-
-    if num_output_channels not in (1, 3):
-        raise ValueError('num_output_channels should be either 1 or 3')
-
-    # PIL grayscale L mode is L = R * 299/1000 + G * 587/1000 + B * 114/1000
-    r = img[..., 0, :, :]
-    g = img[..., 1, :, :]
-    b = img[..., 2, :, :]
-    l_img = (0.299 * r + 0.587 * g + 0.114 * b + 0.5).to(img.dtype)
-
-    if num_output_channels == 3:
-        l_img = torch.stack([l_img, l_img, l_img], dim=-3)
-    else:
-        l_img = l_img.unsqueeze(dim=-3)
-
-    return l_img
diff --git a/torchvision/transforms/transforms.py b/torchvision/transforms/transforms.py
@@ -1382,7 +1382,7 @@ def forward(self, img: Tensor) -> Tensor:
         Returns:
             PIL Image or Tensor: Grayscaled image.
         """
-        return F.to_grayscale(img, num_output_channels=self.num_output_channels)
+        return F.rgb_to_grayscale(img, num_output_channels=self.num_output_channels)
 
     def __repr__(self):
         return self.__class__.__name__ + '(num_output_channels={0})'.format(self.num_output_channels)
@@ -1419,7 +1419,7 @@ def forward(self, img: Tensor) -> Tensor:
         """
         num_output_channels = F._get_image_num_channels(img)
         if torch.rand(1) < self.p:
-            return F.to_grayscale(img, num_output_channels=num_output_channels)
+            return F.rgb_to_grayscale(img, num_output_channels=num_output_channels)
         return img
 
     def __repr__(self):