
Commit 72f1ef4

alanhdu authored and markc-614 committed
Add minimal nn.functional.log_softmax support for NestedTensor (pytorch#159662)
This only works for the jagged layout and for the non-batch and non-jagged dimensions. I did this mostly by copy-pasting from the existing softmax implementation, but it seems fairly straightforward and I think it should work.

Pull Request resolved: pytorch#159662
Approved by: https://github.com/jbschlosser
1 parent 28397ee commit 72f1ef4
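For context, here is a minimal usage sketch of the support this commit adds, assuming a 3-D jagged-layout NestedTensor whose ragged dimension is dim 1 (the component shapes below are purely illustrative):

import torch

# Two variable-length rows packed into a jagged-layout NestedTensor:
# logical shape (B=2, j1, 4), where dim 1 is the ragged dimension.
nt = torch.nested.nested_tensor(
    [torch.randn(2, 4), torch.randn(3, 4)], layout=torch.jagged
)

# Reducing over a regular (non-batch, non-ragged) dimension is what this commit enables.
out = torch.nn.functional.log_softmax(nt, dim=2)
assert out.is_nested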

2 files changed: +58 -9 lines

test/test_nestedtensor.py

Lines changed: 18 additions & 9 deletions
@@ -4444,12 +4444,18 @@ def test_jagged_op_different_output_shape_dim(
     @dtypes(torch.float32)
     @parametrize("requires_grad", [False, True])
     @parametrize("components_require_grad", [False, True])
+    @parametrize(
+        "func",
+        [torch.nn.functional.softmax, torch.nn.functional.log_softmax],
+        name_fn=lambda func: func.__name__,
+    )
     def test_softmax_dim(
         self,
         device,
         dtype,
         requires_grad,
         components_require_grad,
+        func,
     ):
         """
         Softmax passes when reducing on valid reduction dimensions.
@@ -4468,7 +4474,7 @@ def test_softmax_dim(
 
         for reduce_dim, _ in reduce_dims:
             nt = torch.nested.as_nested_tensor(ts, layout=torch.jagged)
-            out_actual = torch.nn.functional.softmax(nt, dim=reduce_dim)
+            out_actual = func(nt, dim=reduce_dim)
             torch._dynamo.disable(self.assertEqual)(
                 len(out_actual.shape), len(output_shape)
             )  # disable if running on dynamo
@@ -4498,12 +4504,10 @@ def test_softmax_dim(
             reduce_dim, reduce_dim_expected = reduce_dim_tuple
 
             if nt.dim() > reduce_dim:
-                out_actual = torch.nn.functional.softmax(
-                    nt, dim=reduce_dim
-                )  # nested tensor
-                out_expected = torch.nn.functional.softmax(
-                    nt.values(), dim=reduce_dim_expected
-                )  # dense tensor of dimensions 1 less than out_actual
+                # nested tensor
+                out_actual = func(nt, dim=reduce_dim)
+                # dense tensor of dimensions 1 less than out_actual
+                out_expected = func(nt.values(), dim=reduce_dim_expected)
                 self.assertTrue(
                     torch.allclose(out_actual.values().view(-1), out_expected.view(-1))
                 )
@@ -4601,8 +4605,13 @@ def test_softmax_dim_reduce_ragged_idx_1(
     @dtypes(torch.float32)
     @parametrize("requires_grad", [False, True])
     @parametrize("components_require_grad", [False, True])
+    @parametrize(
+        "func",
+        [torch.nn.functional.softmax, torch.nn.functional.log_softmax],
+        name_fn=lambda func: func.__name__,
+    )
     def test_softmax_reduce_batch_dim(
-        self, device, dtype, requires_grad, components_require_grad
+        self, device, dtype, requires_grad, components_require_grad, func
     ):
         """
         Softmax on NestedTensor fails when trying to reduce across batch dimension.
@@ -4627,7 +4636,7 @@ def test_softmax_reduce_batch_dim(
                 RuntimeError,
                 "not supported when reducing across the batch dimension for NestedTensor",
             ):
-                out = torch.nn.functional.softmax(nt, dim=reduce_dim)
+                out = func(nt, dim=reduce_dim)
 
     @dtypes(torch.float32)
     @parametrize("requires_grad", [False, True])

torch/nested/_internal/ops.py

Lines changed: 40 additions & 0 deletions
@@ -841,6 +841,46 @@ def _softmax_default(func, *args, **kwargs):
     return NestedTensor(func(inp._values, **new_kwargs), **extract_kwargs(inp))
 
 
+@register_jagged_func(
+    torch.ops.aten._log_softmax.default, "self: jt_all, dim: any, half_to_float: any"
+)
+def _log_softmax_default(func, *args, **kwargs):
+    _, new_kwargs = normalize_function(  # type: ignore[misc]
+        func, args=args, kwargs=kwargs, normalize_to_only_use_kwargs=True
+    )
+
+    if isinstance(new_kwargs["dim"], tuple):
+        raise RuntimeError(
+            "log_softmax(): not supported for dimensions of type 'tuple' for NestedTensor"
+        )
+
+    inp = new_kwargs.pop("input")
+
+    (
+        new_kwargs["dim"],
+        reduce_on_batch,
+        reduce_on_ragged,
+        _reduce_on_non_batch,
+    ) = _wrap_jagged_dims(
+        inp.dim(), (new_kwargs["dim"],), "log_softmax", inp._ragged_idx
+    )
+
+    if reduce_on_batch:
+        raise RuntimeError(
+            "log_softmax(): not supported when reducing across the batch dimension for NestedTensor"
+        )
+
+    if reduce_on_ragged:
+        raise RuntimeError(
+            "log_softmax(): not supported when reducing along the ragged dimension for NestedTensor"
+        )
+
+    # torch.log_softmax takes in the reduction dimension as an integer
+    new_kwargs["dim"] = new_kwargs["dim"][0]
+
+    return NestedTensor(func(inp._values, **new_kwargs), **extract_kwargs(inp))
+
+
 @register_jagged_func(
     torch.ops.aten._softmax_backward_data.default,
     "grad_output: jt, output: jt, dim: any, input_dtype: any",
