Fix ruff, mypy, and clang-format errors (also drops the explicit PyTorchStreamWrapper streams from the tests)

Willy-Chan · Willy-Chan · commit 6c4a9b9bc19b · 2025-08-06T16:53:10.000-07:00
diff --git a/csrc/bindings/all_to_all_ops.cpp b/csrc/bindings/all_to_all_ops.cpp
@@ -74,7 +74,7 @@ fptr_t create_internode(
       hiddenDimScaleBytes
   );
 
-  // Needed to use host-side initialization information in device APIs. 
+  // Needed to use host-side initialization information in device APIs.
   nvshmem_init();
 
   return (fptr_t)ptr;
diff --git a/csrc/bindings/bindings.cpp b/csrc/bindings/bindings.cpp
@@ -5,7 +5,8 @@
 
 using namespace pplx;
 
-TORCH_LIBRARY(pplx_kernels, m) {
+TORCH_LIBRARY(pplx_kernels, m)
+{
   register_all_to_all_ops(m);
 }
 
diff --git a/src/pplx_kernels/__init__.py b/src/pplx_kernels/__init__.py
@@ -1,8 +1,6 @@
 from . import ops as ops
-from .all_to_all import (
-    AllToAll as AllToAll,
-)
+from .all_to_all import AllToAll as AllToAll
 from .nvshmem import (
-    nvshmem_init as nvshmem_init,
     PyTorchStreamWrapper as PyTorchStreamWrapper,
+    nvshmem_init as nvshmem_init,
 )
diff --git a/src/pplx_kernels/nvshmem.py b/src/pplx_kernels/nvshmem.py
@@ -1,14 +1,13 @@
 # pyright: reportCallIssue=false
 
-from collections.abc import Sequence
+from typing import Any, Optional
 
-import torch
+import nvshmem.core as nvshmem  # type: ignore[import]
 import torch.distributed as dist
 
-import nvshmem.core as nvshmem
 
 ###### NVSHMEM ######
-def nvshmem_init(global_rank: int, local_rank: int, world_size: int, device, uid=None) -> None:
+def nvshmem_init(global_rank: int, local_rank: int, world_size: int, device: Any, uid: Optional[Any] = None) -> None:
     uniqueid = nvshmem.get_unique_id(empty=True)
     if local_rank == 0:
         uniqueid = nvshmem.get_unique_id()
@@ -20,16 +19,16 @@ def nvshmem_init(global_rank: int, local_rank: int, world_size: int, device, uid
     dist.barrier()
 
     nvshmem.init(device=device, uid=broadcast_objects[0], rank=global_rank, nranks=world_size, initializer_method="uid")
-    
+
 
 # This stream wrapper returns the format required by CUDA Python. This workaround will be removed when nvshmem4py supports Torch stream interoperability.
 # For more information see: https://nvidia.github.io/cuda-python/cuda-core/latest/interoperability.html#cuda-stream-protocol
 class PyTorchStreamWrapper:
-    def __init__(self, pt_stream):
+    def __init__(self, pt_stream: Any) -> None:
         self.pt_stream = pt_stream
         self.handle = pt_stream.cuda_stream
 
-    def __cuda_stream__(self):
+    def __cuda_stream__(self) -> tuple[int, int]:
         stream_id = self.pt_stream.cuda_stream
         return (0, stream_id)
 
diff --git a/src/pplx_kernels/ops.py b/src/pplx_kernels/ops.py
@@ -5,6 +5,8 @@
 
 import torch
 
+logger = logging.getLogger(__name__)
+
 try:
     _lib_path = os.path.join(os.path.dirname(__file__), "libpplx_kernels.so")
     torch.ops.load_library(_lib_path)
@@ -13,4 +15,4 @@
     from types import SimpleNamespace
 
     _ops = SimpleNamespace()
-    logging.exception("Error loading pplx-kernels")
+    logger.exception("Error loading pplx-kernels")
diff --git a/tests/bench_all_to_all.py b/tests/bench_all_to_all.py
@@ -6,14 +6,13 @@
 from datetime import datetime
 from pathlib import Path
 
+import nvshmem.core as nvshmem  # type: ignore[import]
 import torch
-import torch.distributed as dist
-from cuda.core.experimental import Device
-import nvshmem.core as nvshmem
-from nvshmem.core import Teams
+from cuda.core.experimental import Device  # type: ignore[import]
+from nvshmem.core import Teams  # type: ignore[import]
 
+from pplx_kernels import PyTorchStreamWrapper, nvshmem_init
 from pplx_kernels.all_to_all import AllToAll
-from pplx_kernels import nvshmem_init, PyTorchStreamWrapper
 
 from .all_to_all_utils import MoEConfig, RankTestData
 from .distributed_utils import (
@@ -225,7 +224,7 @@ def run() -> tuple[float, ...]:
 
     # Cleanup
     ata.destroy()
-    
+
     nvshmem.free_tensor(nvshmem_in)
     nvshmem.free_tensor(nvshmem_out)
 
diff --git a/tests/test_all_to_all.py b/tests/test_all_to_all.py
@@ -1,13 +1,14 @@
 import dataclasses
 import logging
 
+import nvshmem.core as nvshmem  # type: ignore[import]
 import pytest
 import torch
 import torch.distributed as dist
-from cuda.core.experimental import Device
-import nvshmem.core as nvshmem
+from cuda.core.experimental import Device  # type: ignore[import]
+
+from pplx_kernels import nvshmem_init
 from pplx_kernels.all_to_all import AllToAll
-from pplx_kernels import nvshmem_init, PyTorchStreamWrapper
 
 from .all_to_all_utils import MoEConfig, RankTestData
 from .distributed_utils import (
@@ -299,7 +300,7 @@ def _worker_test_all_to_all(
     dev = Device(local_rank)
     dev.set_current()
 
-    stream = PyTorchStreamWrapper(torch.cuda.current_stream())
+
 
     nvshmem_init(global_rank=global_rank, local_rank=local_rank, world_size=num_ranks, device=dev)
 
@@ -316,7 +317,7 @@ def _worker_test_all_to_all(
             test_script_init_status, global_rank, local_rank
         )
 
-    _do_test_all_to_all(pgi, dp_size, moe_config, internode, stream)
+    _do_test_all_to_all(pgi, dp_size, moe_config, internode, use_compile)
 
     nvshmem.finalize()
 
diff --git a/tests/test_nvshmem.py b/tests/test_nvshmem.py
@@ -1,5 +1,13 @@
+import logging
+
+import nvshmem.core as nvshmem  # type: ignore[import]
 import pytest
 import torch
+import torch.distributed as dist
+from cuda.core.experimental import Device  # type: ignore[import]
+from nvshmem.core import Teams  # type: ignore[import]
+
+from pplx_kernels import nvshmem_init
 
 from .distributed_utils import (
     ProcessGroupInfo,
@@ -8,19 +16,14 @@
     require_multi_node,
 )
 
-from cuda.core.experimental import Device
-import nvshmem.core as nvshmem
-import torch.distributed as dist
-from nvshmem.core import Teams
-from pplx_kernels import nvshmem_init, PyTorchStreamWrapper
+logger = logging.getLogger(__name__)
 
 def test_nvshmem_1_gpu() -> None:
 
     local_rank = 0
-    world_size = 1
+    rank_id = 0  # Define rank_id for single GPU test
 
     torch.cuda.set_device(local_rank)
-    device = torch.device("cuda", local_rank)
     dev = Device(local_rank)
     dev.set_current()
 
@@ -39,17 +42,15 @@ def test_nvshmem_1_gpu() -> None:
     assert nvshmem.n_pes() == 1
 
     nvshmem.finalize()
-    
 
 
 def _worker_test_nvshmem_4_gpu(pgi: ProcessGroupInfo) -> None:
     local_rank = dist.get_rank()
-    world_size = dist.get_world_size()
 
     dev = Device(local_rank)
     dev.set_current()
 
-    nvshmem_init(global_rank=pgi.rank, local_rank=local_rank, world_size=world_size, device=dev)
+    nvshmem_init(global_rank=pgi.rank, local_rank=local_rank, world_size=pgi.world_size, device=dev)
 
     # Check host initialization status
     test_script_init_status = nvshmem.direct.init_status()
@@ -72,12 +73,10 @@ def test_nvshmem_4_gpu() -> None:
 
 def _worker_test_all_to_all(pgi: ProcessGroupInfo) -> None:
     local_rank = dist.get_rank()
-    world_size = dist.get_world_size()
 
     dev = Device(local_rank)
     dev.set_current()
-    stream = PyTorchStreamWrapper(torch.cuda.current_stream())
-    
+
     num_ranks = dist.get_world_size()
     rank_id = dist.get_rank()
 
@@ -98,9 +97,9 @@ def _worker_test_all_to_all(pgi: ProcessGroupInfo) -> None:
         t_out = nvshmem.tensor( (pgi.world_size,), dtype=torch.int32 )
 
         team = Teams.TEAM_WORLD
-        nvshmem.collective.alltoall(team, t_out, t_in, stream=stream)
+        nvshmem.collective.alltoall(team, t_out, t_in)
 
-        nvshmem.collective.barrier(team, stream=stream)
+        nvshmem.collective.barrier(team)
         torch.cuda.synchronize()
 
         assert t_out.tolist() == list(range(pgi.world_size))

Original file line number	Diff line number	Diff line change
`@@ -5,7 +5,8 @@`
`5`	`5`
`6`	`6`	`using namespace pplx;`
`7`	`7`
`8`		`-TORCH_LIBRARY(pplx_kernels, m) {`
	`8`	`+TORCH_LIBRARY(pplx_kernels, m)`
	`9`	`+{`
`9`	`10`	`register_all_to_all_ops(m);`
`10`	`11`	`}`
`11`	`12`