Skip to content

Commit 608be98

Browse files
committed
RDMA no longer needs torch
Pull Request resolved: #2045 This replaces the internal C++ APIs to torch with an interface for injecting the scanned segments. We then use a pyo3 function that indirectly calls the python torch.cuda.memory._snapshot function to get the same information. So we no longer need rdma to directly link torch. ghstack-source-id: 327462736 @exported-using-ghexport Differential Revision: [D88338646](https://our.internmc.facebook.com/intern/diff/D88338646/)
1 parent 43604d5 commit 608be98

File tree

21 files changed

+283
-488
lines changed

21 files changed

+283
-488
lines changed

.github/workflows/test-gpu-rust.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ jobs:
6868
--exclude monarch_tensor_worker \
6969
--exclude torch-sys \
7070
--exclude torch-sys-cuda
71+
--exclude monarch_rdma
7172
# Copy the test results to the expected location
7273
# TODO: error in pytest-results-action, TypeError: results.testsuites.testsuite.testcase is not iterable
7374
# Don't try to parse these results for now.

Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
resolver = "2"
33
members = [
44
"build_utils",
5-
"cuda-sys",
65
"erased_lifetime",
76
"hyper",
87
"hyperactor",

cuda-sys/Cargo.toml

Lines changed: 0 additions & 16 deletions
This file was deleted.

cuda-sys/build.rs

Lines changed: 0 additions & 101 deletions
This file was deleted.

cuda-sys/src/lib.rs

Lines changed: 0 additions & 36 deletions
This file was deleted.

cuda-sys/src/wrapper.h

Lines changed: 0 additions & 11 deletions
This file was deleted.

monarch_rdma/Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ edition = "2024"
1313
[dependencies]
1414
anyhow = "1.0.98"
1515
async-trait = "0.1.86"
16-
cuda-sys = { path = "../cuda-sys" }
1716
futures = { version = "0.3.31", features = ["async-await", "compat"] }
1817
hyperactor = { version = "0.0.0", path = "../hyperactor" }
1918
rand = { version = "0.8", features = ["small_rng"] }

monarch_rdma/build.rs

Lines changed: 0 additions & 149 deletions
This file was deleted.

monarch_rdma/examples/cuda_ping_pong/src/cuda_ping_pong.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -407,7 +407,7 @@ impl Handler<InitializeBuffer> for CudaRdmaActor {
407407
self.device_id as i32
408408
));
409409
cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxSetCurrent(context));
410-
cuda_sys::cudaDeviceSynchronize();
410+
rdmaxcel_sys::rdmaxcel_cuCtxSynchronize();
411411
cu_check!(rdmaxcel_sys::rdmaxcel_cuMemcpyHtoD_v2(
412412
self.cu_ptr as u64,
413413
self.cpu_buffer.as_ptr() as *const std::ffi::c_void,
@@ -554,7 +554,7 @@ impl Handler<VerifyBuffer> for CudaRdmaActor {
554554
self.device_id as i32
555555
));
556556
cu_check!(rdmaxcel_sys::rdmaxcel_cuCtxSetCurrent(context));
557-
cuda_sys::cudaDeviceSynchronize();
557+
rdmaxcel_sys::rdmaxcel_cuCtxSynchronize();
558558
cu_check!(rdmaxcel_sys::rdmaxcel_cuMemcpyDtoH_v2(
559559
self.cpu_buffer.as_mut_ptr() as *mut std::ffi::c_void,
560560
self.cu_ptr as rdmaxcel_sys::CUdeviceptr,

0 commit comments

Comments
 (0)