-
Notifications
You must be signed in to change notification settings - Fork 734
Support xdc shuffle for RDMA #29400
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Support xdc shuffle for RDMA #29400
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -78,7 +78,6 @@ namespace NActors { | |||||
| Metrics->UpdateIcQueueTimeHistogram(duration.MicroSeconds()); | ||||||
| } | ||||||
| event.Span && event.Span.Event("FeedBuf:INITIAL"); | ||||||
| SendViaRdma.reset(); | ||||||
| if (event.Buffer) { | ||||||
| State = EState::BODY; | ||||||
| Iter = event.Buffer->GetBeginIter(); | ||||||
|
|
@@ -105,22 +104,15 @@ namespace NActors { | |||||
| } else if (Params.UseExternalDataChannel && !SerializationInfo->Sections.empty()) { | ||||||
| State = EState::SECTIONS; | ||||||
| SectionIndex = 0; | ||||||
| XXH3_64bits_reset(&RdmaChecksumState); | ||||||
|
|
||||||
| size_t totalSize = 0; | ||||||
| // It is possible to have event without payload. Such events has only one section. | ||||||
| // We do not send such events via rdma. | ||||||
| bool sendViaRdma = Params.UseRdma && RdmaMemPool && SerializationInfo->Sections.size() > 2; | ||||||
| // Check each section can be send via rdma | ||||||
| bool sendViaRdma = false; | ||||||
| // Check if any section can be send via rdma | ||||||
| for (const auto& section : SerializationInfo->Sections) { | ||||||
| sendViaRdma &= section.IsRdmaCapable; | ||||||
| totalSize += section.Size; | ||||||
| sendViaRdma |= section.IsRdmaCapable; | ||||||
| } | ||||||
| if (sendViaRdma) { | ||||||
| Y_ABORT_UNLESS(totalSize, "got empty sz, sections: %d type: %d ", SerializationInfo->Sections.size(), event.Event->Type()); | ||||||
| NActorsInterconnect::TRdmaCreds rdmaCreds; | ||||||
| ui32 checkSum = 0; | ||||||
| if (SerializeEventRdma(event, rdmaCreds, task.Params.ChecksumRdmaEvent ? &checkSum : nullptr, rdmaDeviceIndex)) { | ||||||
| SendViaRdma.emplace(TRdmaSerializationArtifacts{std::move(rdmaCreds), checkSum}); | ||||||
| if (sendViaRdma && Params.UseRdma && RdmaMemPool) { | ||||||
| if (SerializeEventRdma(event)) { | ||||||
| Chunker.DiscardEvent(); | ||||||
| } | ||||||
| } | ||||||
|
|
@@ -168,8 +160,7 @@ namespace NActors { | |||||
| p += NInterconnect::NDetail::SerializeNumber(section.Alignment, p); | ||||||
| if (section.IsInline && Params.UseXdcShuffle) { | ||||||
| type = static_cast<ui8>(EXdcCommand::DECLARE_SECTION_INLINE); | ||||||
| } | ||||||
| if (SendViaRdma) { | ||||||
| } else if (section.IsRdmaCapable) { | ||||||
| type = static_cast<ui8>(EXdcCommand::DECLARE_SECTION_RDMA); | ||||||
| } | ||||||
| Y_ABORT_UNLESS(p <= std::end(sectionInfo)); | ||||||
|
|
@@ -268,16 +259,18 @@ namespace NActors { | |||||
| if (!Params.UseExternalDataChannel || sections.empty()) { | ||||||
| // all data goes inline | ||||||
| IsPartInline = true; | ||||||
| IsPartRdma = false; | ||||||
| PartLenRemain = Max<size_t>(); | ||||||
| } else if (!Params.UseXdcShuffle || SendViaRdma) { | ||||||
| } else if (!Params.UseXdcShuffle) { | ||||||
| // when UseXdcShuffle feature is not supported by the remote side, we transfer whole event over XDC | ||||||
| // also when we use RDMA, we transfer whole over RDMA | ||||||
| IsPartInline = false; | ||||||
| IsPartRdma = false; | ||||||
| PartLenRemain = Max<size_t>(); | ||||||
| } else { | ||||||
| Y_ABORT_UNLESS(SectionIndex < sections.size()); | ||||||
| IsPartInline = sections[SectionIndex].IsInline; | ||||||
| while (SectionIndex < sections.size() && IsPartInline == sections[SectionIndex].IsInline) { | ||||||
| IsPartRdma = sections[SectionIndex].IsRdmaCapable; | ||||||
| while (SectionIndex < sections.size() && IsPartInline == sections[SectionIndex].IsInline && IsPartRdma == sections[SectionIndex].IsRdmaCapable) { | ||||||
| PartLenRemain += sections[SectionIndex].Size; | ||||||
| ++SectionIndex; | ||||||
| } | ||||||
|
|
@@ -288,8 +281,8 @@ namespace NActors { | |||||
| std::optional<bool> complete = false; | ||||||
| if (IsPartInline) { | ||||||
| complete = FeedInlinePayload(task, event); | ||||||
| } else if (SendViaRdma) { | ||||||
| complete = FeedRdmaPayload(task, event, rdmaDeviceIndex); | ||||||
| } else if (IsPartRdma) { | ||||||
| complete = FeedRdmaPayload(task, event, rdmaDeviceIndex, task.Params.ChecksumRdmaEvent); | ||||||
| } else { | ||||||
| complete = FeedExternalPayload(task, event); | ||||||
| } | ||||||
|
|
@@ -325,56 +318,54 @@ namespace NActors { | |||||
| return complete; | ||||||
| } | ||||||
|
|
||||||
| bool TEventOutputChannel::SerializeEventRdma(TEventHolder& event, NActorsInterconnect::TRdmaCreds& rdmaCreds, | ||||||
| ui32* checksum, ssize_t rdmaDeviceIndex) | ||||||
| { | ||||||
| bool TEventOutputChannel::SerializeEventRdma(TEventHolder& event) { | ||||||
| if (!event.Buffer && event.Event) { | ||||||
| std::optional<TRope> rope = event.Event->SerializeToRope(RdmaMemPool.get()); | ||||||
| std::optional<TRope> rope = event.Event->SerializeToRope(GetDefaultRcBufAllocator()); | ||||||
| if (!rope) { | ||||||
| return false; // serialization failed | ||||||
| } | ||||||
| event.Buffer = MakeIntrusive<TEventSerializedData>( | ||||||
| std::move(*rope), event.Event->CreateSerializationInfo() | ||||||
| ); | ||||||
| event.Event = nullptr; | ||||||
| Iter = event.Buffer->GetBeginIter(); | ||||||
| } | ||||||
|
|
||||||
| XXH3_state_t state; | ||||||
| if (checksum) { | ||||||
| XXH3_64bits_reset(&state); | ||||||
| } | ||||||
| return true; | ||||||
| } | ||||||
|
|
||||||
| std::optional<bool> TEventOutputChannel::FeedRdmaPayload(TTcpPacketOutTask& task, TEventHolder& event, ssize_t rdmaDeviceIndex, bool checksumming) { | ||||||
| Y_ABORT_UNLESS(rdmaDeviceIndex >= 0); | ||||||
|
|
||||||
| if (event.Buffer) { | ||||||
| for (; Iter.Valid(); ++Iter) { | ||||||
| Y_ABORT_UNLESS(event.Buffer); | ||||||
| if (RdmaCredsBuffer.CredsSize() == 0) { | ||||||
| auto prevIter = Iter; | ||||||
| size_t prevPartLenRemain = PartLenRemain; | ||||||
| for (; Iter.Valid() && PartLenRemain; ++Iter) { | ||||||
| TRcBuf buf = Iter.GetChunk(); | ||||||
| auto memReg = NInterconnect::NRdma::TryExtractFromRcBuf(buf); | ||||||
| if (memReg.Empty()) { | ||||||
| // TODO: may be copy to RDMA buffer ????? | ||||||
| Iter = event.Buffer->GetBeginIter(); | ||||||
| Iter = prevIter; | ||||||
| IsPartRdma = false; | ||||||
| RdmaCredsBuffer.Clear(); | ||||||
| PartLenRemain = prevPartLenRemain; | ||||||
| return false; | ||||||
| } | ||||||
| if (checksum) { | ||||||
| XXH3_64bits_update(&state, buf.GetData(), buf.GetSize()); | ||||||
| if (checksumming) { | ||||||
| XXH3_64bits_update(&RdmaChecksumState, buf.GetData(), buf.GetSize()); | ||||||
| } | ||||||
| auto cred = rdmaCreds.AddCreds(); | ||||||
| auto cred = RdmaCredsBuffer.AddCreds(); | ||||||
| cred->SetAddress(reinterpret_cast<ui64>(memReg.GetAddr())); | ||||||
| cred->SetSize(memReg.GetSize()); | ||||||
| cred->SetRkey(memReg.GetRKey(rdmaDeviceIndex)); | ||||||
|
|
||||||
| event.EventActuallySerialized += buf.GetSize(); | ||||||
| PartLenRemain -= buf.GetSize(); | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| if (checksum) { | ||||||
| *checksum = XXH3_64bits_digest(&state); | ||||||
| } | ||||||
| return true; | ||||||
| } | ||||||
|
|
||||||
| std::optional<bool> TEventOutputChannel::FeedRdmaPayload(TTcpPacketOutTask& task, TEventHolder& event, ssize_t rdmaDeviceIndex) { | ||||||
| Y_ABORT_UNLESS(rdmaDeviceIndex >= 0); | ||||||
| const NActorsInterconnect::TRdmaCreds& rdmaCreds = SendViaRdma->RdmaCreds; | ||||||
| ui32 checkSum = SendViaRdma->CheckSum; | ||||||
|
|
||||||
| ui16 credsSerializedSize = rdmaCreds.ByteSizeLong(); | ||||||
| Y_ABORT_UNLESS(PartLenRemain == 0); | ||||||
|
|
||||||
| ui16 credsSerializedSize = RdmaCredsBuffer.ByteSizeLong(); | ||||||
| // Part = | TChannelPart | EXdcCommand::RDMA_READ | rdmaCreds.Size | rdmaCreds | checkSum | | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Let's rename this to something like CumulativeChecksum to show what it is. |
||||||
| size_t partSize = sizeof(TChannelPart) + sizeof(ui8) + sizeof(ui16) + credsSerializedSize + sizeof(ui32); | ||||||
| Y_ABORT_UNLESS(partSize < 4096); | ||||||
|
|
@@ -396,20 +387,22 @@ namespace NActors { | |||||
| ptr += sizeof(ui16); | ||||||
|
|
||||||
| ui32 payloadSz = 0; | ||||||
| for (const auto& rdmaCred : rdmaCreds.GetCreds()) { | ||||||
| for (const auto& rdmaCred : RdmaCredsBuffer.GetCreds()) { | ||||||
| payloadSz += rdmaCred.GetSize(); | ||||||
| } | ||||||
|
|
||||||
| Y_ABORT_UNLESS(rdmaCreds.SerializePartialToArray(ptr, credsSerializedSize)); | ||||||
| Y_ABORT_UNLESS(RdmaCredsBuffer.SerializePartialToArray(ptr, credsSerializedSize)); | ||||||
| ptr += credsSerializedSize; | ||||||
| WriteUnaligned<ui32>(ptr, checkSum); | ||||||
| OutputQueueSize -= event.EventSerializedSize; | ||||||
| WriteUnaligned<ui32>(ptr, checksumming ? XXH3_64bits_digest(&RdmaChecksumState) : 0); | ||||||
| OutputQueueSize -= payloadSz; | ||||||
|
|
||||||
| task.Write<false>(buffer, partSize); | ||||||
|
|
||||||
| task.AttachRdmaPayloadSize(payloadSz); | ||||||
|
|
||||||
| return true; | ||||||
| RdmaCredsBuffer.Clear(); | ||||||
|
|
||||||
| return !Iter.Valid(); | ||||||
|
||||||
| return !Iter.Valid(); | |
| return !Iter.Valid() || PartLenRemain == 0; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
[nitpick] The trailing whitespace on line 368 should be removed to maintain code cleanliness and consistency.