
Commit 205051e

Unified input for F.perspective
- added tests
- updated docs
1 parent 2a4cd36 commit 205051e

3 files changed: +116 -49 lines changed

test/test_functional_tensor.py

Lines changed: 30 additions & 26 deletions
@@ -546,33 +546,37 @@ def test_rotate(self):
         )
 
     def test_perspective(self):
-        tensor, pil_img = self._create_data(26, 34)
-
-        scripted_tranform = torch.jit.script(F.perspective)
-
-        test_configs = [
-            ([(0, 0), (33, 0), (33, 25), (0, 25)], [(3, 2), (32, 3), (30, 24), (2, 25)]),
-            ([(3, 2), (32, 3), (30, 24), (2, 25)], [(0, 0), (33, 0), (33, 25), (0, 25)]),
-            ([(3, 2), (32, 3), (30, 24), (2, 25)], [(5, 5), (30, 3), (33, 19), (4, 25)]),
-        ]
-        for r in [0, ]:
-            for spoints, epoints in test_configs:
-                out_pil_img = F.perspective(pil_img, startpoints=spoints, endpoints=epoints, interpolation=r)
-                out_pil_tensor = torch.from_numpy(np.array(out_pil_img).transpose((2, 0, 1)))
-
-                for fn in [F.perspective, scripted_tranform]:
-                    out_tensor = fn(tensor, startpoints=spoints, endpoints=epoints, interpolation=r)
-
-                    num_diff_pixels = (out_tensor != out_pil_tensor).sum().item() / 3.0
-                    ratio_diff_pixels = num_diff_pixels / out_tensor.shape[-1] / out_tensor.shape[-2]
-                    # Tolerance : less than 5% of different pixels
-                    self.assertLess(
-                        ratio_diff_pixels,
-                        0.05,
-                        msg="{}: {}\n{} vs \n{}".format(
-                            (r, spoints, epoints), ratio_diff_pixels, out_tensor[0, :7, :7], out_pil_tensor[0, :7, :7]
+
+        for tensor, pil_img in [self._create_data(26, 34), self._create_data(26, 26)]:
+
+            scripted_tranform = torch.jit.script(F.perspective)
+
+            test_configs = [
+                [[[0, 0], [33, 0], [33, 25], [0, 25]], [[3, 2], [32, 3], [30, 24], [2, 25]]],
+                [[[3, 2], [32, 3], [30, 24], [2, 25]], [[0, 0], [33, 0], [33, 25], [0, 25]]],
+                [[[3, 2], [32, 3], [30, 24], [2, 25]], [[5, 5], [30, 3], [33, 19], [4, 25]]],
+            ]
+            for r in [0, ]:
+                for spoints, epoints in test_configs:
+                    out_pil_img = F.perspective(pil_img, startpoints=spoints, endpoints=epoints, interpolation=r)
+                    out_pil_tensor = torch.from_numpy(np.array(out_pil_img).transpose((2, 0, 1)))
+
+                    for fn in [F.perspective, scripted_tranform]:
+                        out_tensor = fn(tensor, startpoints=spoints, endpoints=epoints, interpolation=r)
+
+                        num_diff_pixels = (out_tensor != out_pil_tensor).sum().item() / 3.0
+                        ratio_diff_pixels = num_diff_pixels / out_tensor.shape[-1] / out_tensor.shape[-2]
+                        # Tolerance : less than 3% of different pixels
+                        self.assertLess(
+                            ratio_diff_pixels,
+                            0.03,
+                            msg="{}: {}\n{} vs \n{}".format(
+                                (r, spoints, epoints),
+                                ratio_diff_pixels,
+                                out_tensor[0, :7, :7],
+                                out_pil_tensor[0, :7, :7]
+                            )
                         )
-                    )
 
 
 if __name__ == '__main__':
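
For context, the test above checks the tensor path against PIL on the same inputs. A minimal standalone sketch of the unified call pattern it exercises (assumes a torchvision build containing this commit; sizes and corner values are taken from the test):

import numpy as np
import torch
from PIL import Image
import torchvision.transforms.functional as F

# Four [x, y] corners, ordered [top-left, top-right, bottom-right, bottom-left];
# the same list-of-lists format now works for PIL images and tensors alike.
startpoints = [[0, 0], [33, 0], [33, 25], [0, 25]]
endpoints = [[3, 2], [32, 3], [30, 24], [2, 25]]

pil_img = Image.fromarray(np.random.randint(0, 256, size=(26, 34, 3), dtype=np.uint8))
tensor = torch.from_numpy(np.array(pil_img).transpose((2, 0, 1)))

out_pil = F.perspective(pil_img, startpoints=startpoints, endpoints=endpoints, interpolation=0)
out_tensor = F.perspective(tensor, startpoints=startpoints, endpoints=endpoints, interpolation=0)

# Fraction of differing pixel values, as in the test; expected below the 3% tolerance.
out_pil_tensor = torch.from_numpy(np.array(out_pil).transpose((2, 0, 1)))
num_diff = (out_tensor != out_pil_tensor).sum().item() / 3.0
print(num_diff / (out_tensor.shape[-1] * out_tensor.shape[-2]))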

torchvision/transforms/functional.py

Lines changed: 23 additions & 17 deletions
@@ -492,38 +492,39 @@ def hflip(img: Tensor) -> Tensor:
 
 
 def _get_perspective_coeffs(
-        startpoints: List[Tuple[int, int]], endpoints: List[Tuple[int, int]]
+        startpoints: List[List[int]], endpoints: List[List[int]]
 ) -> List[float]:
     """Helper function to get the coefficients (a, b, c, d, e, f, g, h) for the perspective transforms.
 
     In Perspective Transform each pixel (x, y) in the original image gets transformed as,
         (x, y) -> ( (ax + by + c) / (gx + hy + 1), (dx + ey + f) / (gx + hy + 1) )
 
     Args:
-        startpoints (list of tuples): List containing four tuples of two integers corresponding to four corners
+        startpoints (list of list of ints): List containing four lists of two integers corresponding to four corners
             ``[top-left, top-right, bottom-right, bottom-left]`` of the original image.
-        endpoints (list of tuples): List containing four tuples of two integers corresponding to four corners
+        endpoints (list of list of ints): List containing four lists of two integers corresponding to four corners
             ``[top-left, top-right, bottom-right, bottom-left]`` of the transformed image.
 
     Returns:
         octuple (a, b, c, d, e, f, g, h) for transforming each pixel.
     """
-    matrix = []
+    a_matrix = torch.zeros(2 * len(startpoints), 8, dtype=torch.float)
 
-    for p1, p2 in zip(endpoints, startpoints):
-        matrix.append([p1[0], p1[1], 1, 0, 0, 0, -p2[0] * p1[0], -p2[0] * p1[1]])
-        matrix.append([0, 0, 0, p1[0], p1[1], 1, -p2[1] * p1[0], -p2[1] * p1[1]])
+    for i, (p1, p2) in enumerate(zip(endpoints, startpoints)):
+        a_matrix[2 * i, :] = torch.tensor([p1[0], p1[1], 1, 0, 0, 0, -p2[0] * p1[0], -p2[0] * p1[1]])
+        a_matrix[2 * i + 1, :] = torch.tensor([0, 0, 0, p1[0], p1[1], 1, -p2[1] * p1[0], -p2[1] * p1[1]])
 
-    A = torch.tensor(matrix, dtype=torch.float)
-    B = torch.tensor(startpoints, dtype=torch.float).view(8)
-    res = torch.lstsq(B, A)[0]
-    return res.squeeze_(1).tolist()
+    b_matrix = torch.tensor(startpoints, dtype=torch.float).view(8)
+    res = torch.lstsq(b_matrix, a_matrix)[0]
+    # We have to explicitly produce the list of floats, otherwise torch.jit.script does not recognize the output type:
+    # RuntimeError: Expected type hint for result of tolist()
+    return [float(i.item()) for i in res[:, 0]]
 
 
 def perspective(
         img: Tensor,
-        startpoints: List[Tuple[int, int]],
-        endpoints: List[Tuple[int, int]],
+        startpoints: List[List[int]],
+        endpoints: List[List[int]],
         interpolation: int = 3,
         fill: Optional[int] = None
 ) -> Tensor:
@@ -533,9 +534,9 @@ def perspective(
 
     Args:
         img (PIL Image or Tensor): Image to be transformed.
-        startpoints (list of tuples): List containing four tuples of two integers corresponding to four corners
+        startpoints (list of list of ints): List containing four lists of two integers corresponding to four corners
             ``[top-left, top-right, bottom-right, bottom-left]`` of the original image.
-        endpoints (list of tuples): List containing four tuples of two integers corresponding to four corners
+        endpoints (list of list of ints): List containing four lists of two integers corresponding to four corners
             ``[top-left, top-right, bottom-right, bottom-left]`` of the transformed image.
         interpolation (int): Interpolation type. If input is Tensor, only ``PIL.Image.NEAREST`` and
             ``PIL.Image.BILINEAR`` are supported. Default, ``PIL.Image.BICUBIC`` for PIL images and
@@ -546,15 +547,20 @@ def perspective(
             input. Fill value for the area outside the transform in the output image is always 0.
 
     Returns:
-        PIL Image or Tensor: Perspectively transformed Image.
+        PIL Image or Tensor: transformed Image.
     """
 
     coeffs = _get_perspective_coeffs(startpoints, endpoints)
 
     if not isinstance(img, torch.Tensor):
         return F_pil.perspective(img, coeffs, interpolation=interpolation, fill=fill)
 
-    return F_t.perspective()
+    if interpolation == Image.BICUBIC:
+        # bicubic is not supported by pytorch
+        # set to bilinear interpolation
+        interpolation = 2
+
+    return F_t.perspective(img, coeffs, interpolation=interpolation, fill=fill)
 
 
 def vflip(img: Tensor) -> Tensor:
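
The coefficient solve above follows PIL's convention: the eight values map each pixel of the output image back to a source location in the input, which is why the rows are built from endpoints while the right-hand side comes from startpoints. A re-derivation of the same 8x8 system in NumPy, for illustration only (uses np.linalg.lstsq in place of the torch.lstsq call in the diff):

import numpy as np

startpoints = [[0, 0], [33, 0], [33, 25], [0, 25]]  # corners in the original image
endpoints = [[3, 2], [32, 3], [30, 24], [2, 25]]    # corners in the transformed image

# Each corner pair contributes two rows of A @ coeffs = b, obtained by clearing
# the denominator in (a*x + b*y + c) / (g*x + h*y + 1) = x_src (same for y_src).
a = np.zeros((8, 8))
for i, ((xs, ys), (xe, ye)) in enumerate(zip(startpoints, endpoints)):
    a[2 * i] = [xe, ye, 1, 0, 0, 0, -xs * xe, -xs * ye]
    a[2 * i + 1] = [0, 0, 0, xe, ye, 1, -ys * xe, -ys * ye]
b = np.array(startpoints, dtype=float).reshape(8)

coeffs, *_ = np.linalg.lstsq(a, b, rcond=None)

# Sanity check: the coefficients send each endpoint back to its startpoint.
for (xs, ys), (xe, ye) in zip(startpoints, endpoints):
    denom = coeffs[6] * xe + coeffs[7] * ye + 1.0
    assert abs((coeffs[0] * xe + coeffs[1] * ye + coeffs[2]) / denom - xs) < 1e-4
    assert abs((coeffs[3] * xe + coeffs[4] * ye + coeffs[5]) / denom - ys) < 1e-4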

torchvision/transforms/functional_tensor.py

Lines changed: 63 additions & 6 deletions
@@ -620,17 +620,25 @@ def resize(img: Tensor, size: List[int], interpolation: int = 2) -> Tensor:
 
 
 def _assert_grid_transform_inputs(
-        img: Tensor, matrix: List[float], resample: int, fillcolor: Optional[int], _interpolation_modes: Dict[int, str]
+        img: Tensor,
+        matrix: Optional[List[float]],
+        resample: int,
+        fillcolor: Optional[int],
+        _interpolation_modes: Dict[int, str],
+        coeffs: Optional[List[float]] = None,
 ):
     if not (isinstance(img, torch.Tensor) and _is_tensor_a_torch_image(img)):
         raise TypeError("img should be Tensor Image. Got {}".format(type(img)))
 
-    if not isinstance(matrix, list):
+    if matrix is not None and not isinstance(matrix, list):
         raise TypeError("Argument matrix should be a list. Got {}".format(type(matrix)))
 
-    if len(matrix) != 6:
+    if matrix is not None and len(matrix) != 6:
         raise ValueError("Argument matrix should have 6 float values")
 
+    if coeffs is not None and len(coeffs) != 8:
+        raise ValueError("Argument coeffs should have 8 float values")
+
     if fillcolor is not None:
         warnings.warn("Argument fill/fillcolor is not supported for Tensor input. Fill value is zero")
 
@@ -775,6 +783,37 @@ def rotate(
     return _apply_grid_transform(img, grid, mode)
 
 
+def _perspective_grid(coeffs: List[float], ow: int, oh: int):
+    # https://github.com/python-pillow/Pillow/blob/4634eafe3c695a014267eefdce830b4a825beed7/
+    # src/libImaging/Geometry.c#L394
+
+    #
+    # x_out = (coeffs[0] * x + coeffs[1] * y + coeffs[2]) / (coeffs[6] * x + coeffs[7] * y + 1)
+    # y_out = (coeffs[3] * x + coeffs[4] * y + coeffs[5]) / (coeffs[6] * x + coeffs[7] * y + 1)
+    #
+
+    theta1 = torch.tensor([[
+        [coeffs[0], coeffs[1], coeffs[2]],
+        [coeffs[3], coeffs[4], coeffs[5]]
+    ]])
+    theta2 = torch.tensor([[
+        [coeffs[6], coeffs[7], 1.0],
+        [coeffs[6], coeffs[7], 1.0]
+    ]])
+
+    d = 0.5
+    base_grid = torch.empty(1, oh, ow, 3)
+    base_grid[..., 0].copy_(torch.linspace(d, ow * 1.0 + d - 1.0, steps=ow))
+    base_grid[..., 1].copy_(torch.linspace(d, oh * 1.0 + d - 1.0, steps=oh).unsqueeze_(-1))
+    base_grid[..., 2].fill_(1)
+
+    output_grid1 = base_grid.view(1, oh * ow, 3).bmm(theta1.transpose(1, 2) / torch.tensor([0.5 * ow, 0.5 * oh]))
+    output_grid2 = base_grid.view(1, oh * ow, 3).bmm(theta2.transpose(1, 2))
+
+    output_grid = output_grid1 / output_grid2 - 1.0
+    return output_grid.view(1, oh, ow, 2)
+
+
 def perspective(
     img: Tensor, perspective_coeffs: List[float], interpolation: int = 2, fill: Optional[int] = None
 ) -> Tensor:
@@ -783,14 +822,32 @@ def perspective(
     Args:
         img (Tensor): Image to be transformed.
         perspective_coeffs (list of float): perspective transformation coefficients.
-        interpolation (int): Interpolation type. Default, ``Image.BICUBIC``.
+        interpolation (int): Interpolation type. Default, ``PIL.Image.BILINEAR``.
         fill (n-tuple or int or float): this option is not supported for Tensor input. Fill value for the area
             outside the transform in the output image is always 0.
 
     Returns:
-        Tensor: Perspectively transformed Image.
+        Tensor: transformed image.
     """
     if not (isinstance(img, torch.Tensor) and _is_tensor_a_torch_image(img)):
         raise TypeError('img should be Tensor Image. Got {}'.format(type(img)))
 
-    return None
+    _interpolation_modes = {
+        0: "nearest",
+        2: "bilinear",
+    }
+
+    _assert_grid_transform_inputs(
+        img,
+        matrix=None,
+        resample=interpolation,
+        fillcolor=fill,
+        _interpolation_modes=_interpolation_modes,
+        coeffs=perspective_coeffs
+    )
+
+    ow, oh = img.shape[-1], img.shape[-2]
+    grid = _perspective_grid(perspective_coeffs, ow=ow, oh=oh)
+    mode = _interpolation_modes[interpolation]
+
+    return _apply_grid_transform(img, grid, mode)
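
Here _perspective_grid evaluates the PIL mapping at every output pixel as two batched matrix products and normalizes the result to [-1, 1], the coordinate range torch.nn.functional.grid_sample expects; _apply_grid_transform then samples the input image through that grid. A rough standalone sketch of the same idea (illustrative only; perspective_sample is a hypothetical name, and it skips the half-pixel-center offset d = 0.5 used in the diff, so its output can differ slightly):

import torch

def perspective_sample(img: torch.Tensor, coeffs, mode: str = "bilinear") -> torch.Tensor:
    # img: (C, H, W) float tensor; coeffs: the 8 PIL-style coefficients.
    oh, ow = img.shape[-2], img.shape[-1]
    ys, xs = torch.meshgrid(torch.arange(oh, dtype=torch.float32),
                            torch.arange(ow, dtype=torch.float32))
    # PIL convention: map each output pixel (x, y) back to a source location.
    denom = coeffs[6] * xs + coeffs[7] * ys + 1.0
    x_src = (coeffs[0] * xs + coeffs[1] * ys + coeffs[2]) / denom
    y_src = (coeffs[3] * xs + coeffs[4] * ys + coeffs[5]) / denom
    # Normalize source coordinates to [-1, 1] as grid_sample expects.
    grid = torch.stack([x_src / (0.5 * ow) - 1.0, y_src / (0.5 * oh) - 1.0], dim=-1)
    return torch.nn.functional.grid_sample(
        img.unsqueeze(0), grid.unsqueeze(0),
        mode=mode, padding_mode="zeros", align_corners=False,
    ).squeeze(0)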
