@@ -36,36 +36,36 @@ AllToAllInterNode::AllToAllInterNode(
36
36
numTokensPerDP = mallocZeroBuffer<uint32_t >(numLocalExperts * numDPGroups);
37
37
38
38
numTokensBuffer = (uint64_t *)nvshmem_malloc (sizeof (uint64_t ) * numLocalExperts * numDPGroups);
39
- ROSE_ASSERT (numTokensBuffer != nullptr , " failed to allocate numTokensBuffer" );
39
+ PPLX_ASSERT (numTokensBuffer != nullptr , " failed to allocate numTokensBuffer" );
40
40
cudaMemset (numTokensBuffer, 0 , sizeof (uint64_t ) * numLocalExperts * numDPGroups);
41
41
42
42
numDispatchRecvBuffer =
43
43
(uint64_t *)nvshmem_malloc (sizeof (uint64_t ) * numLocalExperts * numDPGroups);
44
- ROSE_ASSERT (numDispatchRecvBuffer != nullptr , " failed to allocate numDispatchRecvBuffer" );
44
+ PPLX_ASSERT (numDispatchRecvBuffer != nullptr , " failed to allocate numDispatchRecvBuffer" );
45
45
cudaMemset (numDispatchRecvBuffer, 0 , sizeof (uint64_t ) * numLocalExperts * numDPGroups);
46
46
47
47
combineSignalBuffer = (uint64_t *)nvshmem_malloc (sizeof (uint64_t ) * maxNumTokens);
48
- ROSE_ASSERT (combineSignalBuffer != nullptr , " failed to allocate combineSignalBuffer" );
48
+ PPLX_ASSERT (combineSignalBuffer != nullptr , " failed to allocate combineSignalBuffer" );
49
49
cudaMemset (combineSignalBuffer, 0 , sizeof (uint64_t ) * maxNumTokens);
50
50
51
51
combineSyncBuffer = (uint64_t *)nvshmem_malloc (sizeof (uint64_t ) * worldSize);
52
- ROSE_ASSERT (combineSyncBuffer != nullptr , " failed to allocate combineSyncBuffer" );
52
+ PPLX_ASSERT (combineSyncBuffer != nullptr , " failed to allocate combineSyncBuffer" );
53
53
cudaMemset (combineSyncBuffer, 0 , sizeof (uint64_t ) * worldSize);
54
54
55
55
// Buffers for dispatch.
56
56
const size_t perTokenBytes =
57
57
round_up<size_t >(hiddenDimBytes + hiddenDimScaleBytes + sizeof (uint32_t ), 16 );
58
58
xDispatchIn = (std::byte *)nvshmem_malloc (maxNumTokens * perTokenBytes);
59
- ROSE_ASSERT (xDispatchIn != nullptr , " failed to allocate xDispatchIn" );
59
+ PPLX_ASSERT (xDispatchIn != nullptr , " failed to allocate xDispatchIn" );
60
60
xDispatchOut = (std::byte *)nvshmem_malloc (maxBatchTokens * perTokenBytes);
61
- ROSE_ASSERT (xDispatchOut != nullptr , " failed to allocate xDispatchOut" );
61
+ PPLX_ASSERT (xDispatchOut != nullptr , " failed to allocate xDispatchOut" );
62
62
63
63
// Buffers for combine. The allocations are a bit wider to accommodate all
64
64
// possible data types (primarily float for testing and bfloat16 for prod).
65
65
xCombineIn = (std::byte *)nvshmem_malloc (maxBatchTokens * hiddenDim * sizeof (float ));
66
- ROSE_ASSERT (xCombineIn != nullptr , " failed to allocate xCombineIn" );
66
+ PPLX_ASSERT (xCombineIn != nullptr , " failed to allocate xCombineIn" );
67
67
xCombineOut = (std::byte *)nvshmem_malloc (maxNumTokens * numExperts * hiddenDim * sizeof (float ));
68
- ROSE_ASSERT (xCombineOut != nullptr , " failed to allocate xCombineOut" );
68
+ PPLX_ASSERT (xCombineOut != nullptr , " failed to allocate xCombineOut" );
69
69
70
70
// Buffers for token tracking.
71
71
sourceIndex = mallocZeroBuffer<uint32_t >(maxBatchTokens);
0 commit comments