diff --git a/ydb/core/mind/hive/balancer.cpp b/ydb/core/mind/hive/balancer.cpp index 620f502eead6..6ef346268326 100644 --- a/ydb/core/mind/hive/balancer.cpp +++ b/ydb/core/mind/hive/balancer.cpp @@ -325,9 +325,9 @@ class THiveBalancer : public NActors::TActorBootstrapped, public tablet->ActorsToNotifyOnRestart.emplace_back(SelfId()); // volatile settings, will not persist upon restart ++KickInFlight; ++Movements; - BLOG_D("Balancer moving tablet " << tablet->ToString() << " " << tablet->GetResourceValues() - << " from node " << tablet->Node->Id << " " << tablet->Node->ResourceValues - << " to node " << node->Id << " " << node->ResourceValues); + BLOG_D("Balancer moving tablet " << tablet->ToString() + << " from node " << tablet->Node->Id + << " to node " << node->Id); Hive->RecordTabletMove(THive::TTabletMoveInfo(now, *tablet, tablet->Node->Id, node->Id)); Hive->Execute(Hive->CreateRestartTablet(tablet->GetFullTabletId(), node->Id)); UpdateProgress(); diff --git a/ydb/core/mind/hive/drain.cpp b/ydb/core/mind/hive/drain.cpp index 6480096fdd0a..3e1d3d00e3a2 100644 --- a/ydb/core/mind/hive/drain.cpp +++ b/ydb/core/mind/hive/drain.cpp @@ -85,17 +85,17 @@ class THiveDrain : public NActors::TActorBootstrapped, public ISubAc ++KickInFlight; ++Movements; BLOG_D("Drain " << SelfId() << " moving tablet " - << tablet->ToString() << " " << tablet->GetResourceValues() - << " from node " << tablet->Node->Id << " " << tablet->Node->ResourceValues - << " to node " << node->Id << " " << node->ResourceValues); + << tablet->ToString() + << " from node " << tablet->Node->Id + << " to node " << node->Id); Hive->TabletCounters->Cumulative()[NHive::COUNTER_DRAIN_EXECUTED].Increment(1); Hive->RecordTabletMove(THive::TTabletMoveInfo(TInstant::Now(), *tablet, tablet->Node->Id, node->Id)); Hive->Execute(Hive->CreateRestartTablet(tabletId, node->Id)); } else { if (std::holds_alternative(result) || std::holds_alternative(result)) { Hive->TabletCounters->Cumulative()[NHive::COUNTER_DRAIN_FAILED].Increment(1); - BLOG_D("Drain " << SelfId() << " could not move tablet " << tablet->ToString() << " " << tablet->GetResourceValues() - << " from node " << tablet->Node->Id << " " << tablet->Node->ResourceValues); + BLOG_D("Drain " << SelfId() << " could not move tablet " << tablet->ToString() + << " from node " << tablet->Node->Id); } else if (std::holds_alternative(result)){ BLOG_D("Drain " << SelfId() << " could not move tablet " << tablet->ToString() << " and will try again later"); Hive->WaitToMoveTablets(SelfId()); diff --git a/ydb/core/mind/hive/fill.cpp b/ydb/core/mind/hive/fill.cpp index a4fd3750d5b6..dfa6a8f8b2f8 100644 --- a/ydb/core/mind/hive/fill.cpp +++ b/ydb/core/mind/hive/fill.cpp @@ -61,9 +61,9 @@ class THiveFill : public NActors::TActorBootstrapped, public ISubActo tablet->ActorsToNotifyOnRestart.emplace_back(SelfId()); // volatile settings, will not persist upon restart ++KickInFlight; ++Movements; - BLOG_D("Fill " << SelfId() << " moving tablet " << tablet->ToString() << " " << tablet->GetResourceValues() - << " from node " << tablet->Node->Id << " " << tablet->Node->ResourceValues - << " to node " << node->Id << " " << node->ResourceValues); + BLOG_D("Fill " << SelfId() << " moving tablet " << tablet->ToString() + << " from node " << tablet->Node->Id + << " to node " << node->Id); Hive->TabletCounters->Cumulative()[NHive::COUNTER_FILL_EXECUTED].Increment(1); Hive->RecordTabletMove(THive::TTabletMoveInfo(TInstant::Now(), *tablet, tablet->Node->Id, node->Id)); Hive->Execute(Hive->CreateRestartTablet(tablet->GetFullTabletId(), node->Id), ctx); diff --git a/ydb/core/mind/hive/hive.cpp b/ydb/core/mind/hive/hive.cpp index ac5ee9e6d3e2..ca0d336f5c3f 100644 --- a/ydb/core/mind/hive/hive.cpp +++ b/ydb/core/mind/hive/hive.cpp @@ -119,6 +119,53 @@ bool TNodeFilter::IsAllowedPile(TBridgePileId pile) const { } } +TMetrics& TMetrics::operator+=(const TMetrics& other) { + CPU += other.CPU; + Memory += other.Memory; + Network += other.Network; + Counter += other.Counter; + Storage += other.Storage; + ReadThroughput += other.ReadThroughput; + WriteThroughput += other.WriteThroughput; + ReadIops += other.ReadIops; + WriteIops += other.WriteIops; + return *this; +} + +void TMetrics::ToProto(NKikimrTabletBase::TMetrics* proto) const { + if (CPU) { + proto->SetCPU(CPU); + } + if (Memory) { + proto->SetMemory(Memory); + } + if (Network) { + proto->SetNetwork(Network); + } + if (Counter) { + proto->SetCounter(Counter); + } + if (Storage) { + proto->SetStorage(Storage); + } + if (ReadThroughput) { + proto->SetReadThroughput(ReadThroughput); + } + if (WriteThroughput) { + proto->SetWriteThroughput(WriteThroughput); + } + if (ReadIops) { + proto->SetReadIops(ReadIops); + } + if (WriteIops) { + proto->SetWriteIops(WriteIops); + } + proto->MutableGroupReadThroughput()->Assign(GroupReadThroughput.begin(), GroupReadThroughput.end()); + proto->MutableGroupWriteThroughput()->Assign(GroupWriteThroughput.begin(), GroupWriteThroughput.end()); + proto->MutableGroupReadIops()->Assign(GroupReadIops.begin(), GroupReadIops.end()); + proto->MutableGroupWriteIops()->Assign(GroupWriteIops.begin(), GroupWriteIops.end()); +} + template std::unordered_map MakeReverseMap(const std::unordered_map& map) { std::unordered_map result; diff --git a/ydb/core/mind/hive/hive.h b/ydb/core/mind/hive/hive.h index 871a5db95cc9..192fdf82fbb6 100644 --- a/ydb/core/mind/hive/hive.h +++ b/ydb/core/mind/hive/hive.h @@ -381,6 +381,27 @@ struct TFollowerUpdates { } }; +// same as NKikimrTabletBase::TMetrics, except not a protobuf - for lighter operations +struct TMetrics { + ui64 CPU = 0; + ui64 Memory = 0; + ui64 Network = 0; + ui64 Counter = 0; + ui64 Storage = 0; + TVector GroupReadThroughput; + TVector GroupWriteThroughput; + ui64 ReadThroughput = 0; + ui64 WriteThroughput = 0; + TVector GroupReadIops; + TVector GroupWriteIops; + ui64 ReadIops = 0; + ui64 WriteIops = 0; + + TMetrics& operator+=(const TMetrics& other); + + void ToProto(NKikimrTabletBase::TMetrics* proto) const; +}; + } // NHive } // NKikimr diff --git a/ydb/core/mind/hive/hive_events.h b/ydb/core/mind/hive/hive_events.h index e0e199b4c27c..dc19d98ab688 100644 --- a/ydb/core/mind/hive/hive_events.h +++ b/ydb/core/mind/hive/hive_events.h @@ -39,6 +39,7 @@ struct TEvPrivate { EvRefreshScaleRecommendation, EvUpdateFollowers, EvUpdateBalanceCounters, + EvProcessMetrics, EvEnd }; @@ -145,6 +146,8 @@ struct TEvPrivate { }; struct TEvUpdateBalanceCounters : TEventLocal {}; + + struct TEvProcessMetrics : TEventLocal {}; }; } // NHive diff --git a/ydb/core/mind/hive/hive_impl.cpp b/ydb/core/mind/hive/hive_impl.cpp index 3e3b98552ba9..f44b9a65c9d0 100644 --- a/ydb/core/mind/hive/hive_impl.cpp +++ b/ydb/core/mind/hive/hive_impl.cpp @@ -1942,7 +1942,7 @@ void THive::FillTabletInfo(NKikimrHive::TEvResponseHiveInfo& response, ui64 tabl } tabletInfo.SetRestartsPerPeriod(info->GetRestartsPerPeriod(restartsBarrierTime)); if (req.GetReturnMetrics()) { - tabletInfo.MutableMetrics()->CopyFrom(info->GetResourceValues()); + info->GetResourceValues().ToProto(tabletInfo.MutableMetrics()); } if (info->InWaitQueue) { tabletInfo.SetInWaitQueue(true); @@ -1976,7 +1976,7 @@ void THive::FillTabletInfo(NKikimrHive::TEvResponseHiveInfo& response, ui64 tabl } tabletInfo.SetRestartsPerPeriod(follower.GetRestartsPerPeriod(restartsBarrierTime)); if (req.GetReturnMetrics()) { - tabletInfo.MutableMetrics()->CopyFrom(follower.GetResourceValues()); + follower.GetResourceValues().ToProto(tabletInfo.MutableMetrics()); } } } @@ -2047,7 +2047,7 @@ void THive::Handle(TEvHive::TEvRequestHiveDomainStats::TPtr& ev) { THashMap StateCounter; THashSet NodeIds; ui32 AliveNodes = 0; - NKikimrTabletBase::TMetrics Metrics; + TMetrics Metrics; }; THashMap subDomainStats; @@ -2081,7 +2081,7 @@ void THive::Handle(TEvHive::TEvRequestHiveDomainStats::TPtr& ev) { domainStats.SetShardId(pr1.first.first); domainStats.SetPathId(pr1.first.second); if (ev->Get()->Record.GetReturnMetrics()) { - domainStats.MutableMetrics()->CopyFrom(pr1.second.Metrics); + pr1.second.Metrics.ToProto(domainStats.MutableMetrics()); } for (const auto& pr2 : pr1.second.StateCounter) { auto& stateStats = *domainStats.AddStateStats(); @@ -2432,6 +2432,16 @@ std::optional THive::CheckScatter(const TResourceNormalizedV return std::nullopt; } +void THive::EnqueueUpdateMetrics(TTabletInfo* tablet) { + if (std::exchange(tablet->UpdateMetricsEnqueued, true)) { + return; + } + ProcessMetricsQueue.push(tablet->GetFullTabletId()); + if (!std::exchange(ProcessMetricsScheduled, true)) { + Schedule(TDuration::MilliSeconds(5), new TEvPrivate::TEvProcessMetrics); + } +} + void THive::HandleInit(TEvPrivate::TEvProcessTabletBalancer::TPtr&) { BLOG_W("Received TEvProcessTabletBalancer while in StateInit"); Schedule(TDuration::Seconds(1), new TEvPrivate::TEvProcessTabletBalancer()); @@ -2611,8 +2621,8 @@ void THive::Handle(TEvPrivate::TEvProcessStorageBalancer::TPtr&) { void THive::UpdateTotalResourceValues( const TNodeInfo* node, const TTabletInfo* tablet, - const NKikimrTabletBase::TMetrics& before, - const NKikimrTabletBase::TMetrics& after, + const TMetrics& before, + const TMetrics& after, TResourceRawValues deltaRaw, TResourceNormalizedValues deltaNormalized) { TotalRawResourceValues = TotalRawResourceValues + deltaRaw; @@ -2633,15 +2643,9 @@ void THive::UpdateTotalResourceValues( auto& objectMetrics = ObjectToTabletMetrics[tablet->GetObjectId()]; auto beforeMetrics = objectMetrics.Metrics; objectMetrics.AggregateDiff(before, after, tablet); - BLOG_TRACE("UpdateTotalResources: ObjectId " << tablet->GetObjectId() << - ": {" << beforeMetrics.ShortDebugString() << - "} -> {" << objectMetrics.Metrics.ShortDebugString() << "}"); auto& typeMetrics = TabletTypeToTabletMetrics[tablet->GetTabletType()]; beforeMetrics = typeMetrics.Metrics; typeMetrics.AggregateDiff(before, after, tablet); - BLOG_TRACE("UpdateTotalResources: Type " << tablet->GetTabletType() << - ": {" << beforeMetrics.ShortDebugString() << - "} -> {" << typeMetrics.Metrics.ShortDebugString() << "}"); } TabletCounters->Simple()[NHive::COUNTER_METRICS_COUNTER].Set(std::get(TotalRawResourceValues)); TabletCounters->Simple()[NHive::COUNTER_METRICS_CPU].Set(std::get(TotalRawResourceValues)); @@ -2676,20 +2680,20 @@ void THive::WaitToMoveTablets(TActorId actor) { } } -bool THive::IsValidMetrics(const NKikimrTabletBase::TMetrics& metrics) { +bool THive::IsValidMetrics(const TMetrics& metrics) { return IsValidMetricsCPU(metrics) || IsValidMetricsMemory(metrics) || IsValidMetricsNetwork(metrics); } -bool THive::IsValidMetricsCPU(const NKikimrTabletBase::TMetrics& metrics) { - return metrics.GetCPU() > 1'000/*1ms*/; +bool THive::IsValidMetricsCPU(const TMetrics& metrics) { + return metrics.CPU > 1'000/*1ms*/; } -bool THive::IsValidMetricsMemory(const NKikimrTabletBase::TMetrics& metrics) { - return metrics.GetMemory() > 128'000/*128KB*/; +bool THive::IsValidMetricsMemory(const TMetrics& metrics) { + return metrics.Memory > 128'000/*128KB*/; } -bool THive::IsValidMetricsNetwork(const NKikimrTabletBase::TMetrics& metrics) { - return metrics.GetNetwork() > 1024/*1KBps*/; +bool THive::IsValidMetricsNetwork(const TMetrics& metrics) { + return metrics.Network > 1024/*1KBps*/; } TString THive::DebugDomainsActiveNodes() const { @@ -2706,67 +2710,67 @@ void THive::AggregateMetricsMax(NKikimrTabletBase::TMetrics& aggregate, const NK aggregate.SetWriteThroughput(std::max(aggregate.GetWriteThroughput(), value.GetWriteThroughput())); } -template -static void AggregateDiff(NKikimrTabletBase::TMetrics& aggregate, const NKikimrTabletBase::TMetrics& before, const NKikimrTabletBase::TMetrics& after, TTabletId tabletId, const TString& name) { - i64 oldValue = (aggregate.*getter)(); - i64 delta = (after.*getter)() - (before.*getter)(); +template +static void AggregateDiff(TMetrics& aggregate, const TMetrics& before, const TMetrics& after, TTabletId tabletId, const TString& name) { + i64 oldValue = aggregate.*field; + i64 delta = after.*field - before.*field; i64 newValue = oldValue + delta; Y_ENSURE_LOG(newValue >= 0, "tablet " << tabletId << " name=" << name << " oldValue=" << oldValue << " delta=" << delta << " newValue=" << newValue); newValue = Max(newValue, (i64)0); - if (newValue != 0) { - (aggregate.*setter)(newValue); - } else { - (aggregate.*clear)(); - } -} - -void THive::AggregateMetricsDiff(NKikimrTabletBase::TMetrics& aggregate, const NKikimrTabletBase::TMetrics& before, const NKikimrTabletBase::TMetrics& after, const TTabletInfo* tablet) { - AggregateDiff<&NKikimrTabletBase::TMetrics::SetCPU, &NKikimrTabletBase::TMetrics::GetCPU, &NKikimrTabletBase::TMetrics::ClearCPU>(aggregate, before, after, tablet->GetLeader().Id, "cpu"); - AggregateDiff<&NKikimrTabletBase::TMetrics::SetMemory, &NKikimrTabletBase::TMetrics::GetMemory, &NKikimrTabletBase::TMetrics::ClearMemory>(aggregate, before, after, tablet->GetLeader().Id, "memory"); - AggregateDiff<&NKikimrTabletBase::TMetrics::SetNetwork, &NKikimrTabletBase::TMetrics::GetNetwork, &NKikimrTabletBase::TMetrics::ClearNetwork>(aggregate, before, after, tablet->GetLeader().Id, "network"); - AggregateDiff<&NKikimrTabletBase::TMetrics::SetCounter, &NKikimrTabletBase::TMetrics::GetCounter, &NKikimrTabletBase::TMetrics::ClearCounter>(aggregate, before, after, tablet->GetLeader().Id, "counter"); - AggregateDiff<&NKikimrTabletBase::TMetrics::SetStorage, &NKikimrTabletBase::TMetrics::GetStorage, &NKikimrTabletBase::TMetrics::ClearStorage>(aggregate, before, after, tablet->GetLeader().Id, "storage"); - AggregateDiff<&NKikimrTabletBase::TMetrics::SetReadThroughput, &NKikimrTabletBase::TMetrics::GetReadThroughput, &NKikimrTabletBase::TMetrics::ClearReadThroughput>(aggregate, before, after, tablet->GetLeader().Id, "read"); - AggregateDiff<&NKikimrTabletBase::TMetrics::SetWriteThroughput, &NKikimrTabletBase::TMetrics::GetWriteThroughput, &NKikimrTabletBase::TMetrics::ClearWriteThroughput>(aggregate, before, after, tablet->GetLeader().Id, "write"); -} - -void THive::DivideMetrics(NKikimrTabletBase::TMetrics& metrics, ui64 divider) { - metrics.SetCPU(metrics.GetCPU() / divider); - metrics.SetMemory(metrics.GetMemory() / divider); - metrics.SetNetwork(metrics.GetNetwork() / divider); - metrics.SetCounter(metrics.GetCounter() / divider); - metrics.SetStorage(metrics.GetStorage() / divider); - metrics.SetReadThroughput(metrics.GetReadThroughput() / divider); - metrics.SetWriteThroughput(metrics.GetWriteThroughput() / divider); -} - -NKikimrTabletBase::TMetrics THive::GetDefaultResourceValuesForObject(TFullObjectId objectId) { - NKikimrTabletBase::TMetrics metrics; + aggregate.*field = newValue; +} + +void THive::AggregateMetricsDiff(TMetrics& aggregate, const TMetrics& before, const TMetrics& after, const TTabletInfo* tablet) { + AggregateDiff<&TMetrics::CPU>(aggregate, before, after, tablet->GetLeader().Id, "cpu"); + AggregateDiff<&TMetrics::Memory>(aggregate, before, after, tablet->GetLeader().Id, "memory"); + AggregateDiff<&TMetrics::Network>(aggregate, before, after, tablet->GetLeader().Id, "network"); + AggregateDiff<&TMetrics::Counter>(aggregate, before, after, tablet->GetLeader().Id, "counter"); + AggregateDiff<&TMetrics::Storage>(aggregate, before, after, tablet->GetLeader().Id, "storage"); + AggregateDiff<&TMetrics::ReadThroughput>(aggregate, before, after, tablet->GetLeader().Id, "read"); + AggregateDiff<&TMetrics::WriteThroughput>(aggregate, before, after, tablet->GetLeader().Id, "write"); + AggregateDiff<&TMetrics::ReadIops>(aggregate, before, after, tablet->GetLeader().Id, "readiops"); + AggregateDiff<&TMetrics::WriteIops>(aggregate, before, after, tablet->GetLeader().Id, "writeiops"); +} + +void THive::DivideMetrics(TMetrics& metrics, ui64 divider) { + metrics.CPU /= divider; + metrics.Memory /= divider; + metrics.Network /= divider; + metrics.Counter /= divider; + metrics.Storage /= divider; + metrics.ReadThroughput /= divider; + metrics.WriteThroughput /= divider; + metrics.ReadIops /= divider; + metrics.WriteIops /= divider; +} + +TMetrics THive::GetDefaultResourceValuesForObject(TFullObjectId objectId) { + TMetrics metrics; auto itTablets = ObjectToTabletMetrics.find(objectId); if (itTablets != ObjectToTabletMetrics.end()) { metrics = itTablets->second.GetAverage(); - metrics.ClearCounter(); + metrics.Counter = 0; } return metrics; } -NKikimrTabletBase::TMetrics THive::GetDefaultResourceValuesForTabletType(TTabletTypes::EType type) { - NKikimrTabletBase::TMetrics metrics; +TMetrics THive::GetDefaultResourceValuesForTabletType(TTabletTypes::EType type) { + TMetrics metrics; auto it = TabletTypeToTabletMetrics.find(type); if (it != TabletTypeToTabletMetrics.end()) { metrics = it->second.GetAverage(); - metrics.ClearCounter(); + metrics.Counter = 0; } return metrics; } -NKikimrTabletBase::TMetrics THive::GetDefaultResourceValuesForProfile(TTabletTypes::EType type, const TString& resourceProfile) { - NKikimrTabletBase::TMetrics resourceValues; +TMetrics THive::GetDefaultResourceValuesForProfile(TTabletTypes::EType type, const TString& resourceProfile) { + TMetrics resourceValues; // copy default resource usage from resource profile if (ResourceProfiles) { // TODO: provide Hive with resource profile used by the tablet instead of default one. auto profile = ResourceProfiles->GetProfile(type, resourceProfile); - resourceValues.SetMemory(profile->GetDefaultTabletMemoryUsage()); + resourceValues.Memory = profile->GetDefaultTabletMemoryUsage(); } return resourceValues; } @@ -3248,6 +3252,7 @@ void THive::ProcessEvent(std::unique_ptr event) { hFunc(TEvPrivate::TEvUpdateBalanceCounters, Handle); hFunc(TEvHive::TEvSetDown, Handle); hFunc(TEvHive::TEvRequestDrainInfo, Handle); + hFunc(TEvPrivate::TEvProcessMetrics, Handle); } } @@ -3362,6 +3367,7 @@ STFUNC(THive::StateWork) { fFunc(TEvPrivate::TEvUpdateBalanceCounters::EventType, EnqueueIncomingEvent); fFunc(TEvHive::TEvRequestDrainInfo::EventType, EnqueueIncomingEvent); fFunc(TEvHive::TEvSetDown::EventType, EnqueueIncomingEvent); + fFunc(TEvPrivate::TEvProcessMetrics::EventType, EnqueueIncomingEvent); hFunc(TEvPrivate::TEvProcessIncomingEvent, Handle); default: if (!HandleDefaultEvents(ev, SelfId())) { @@ -3470,9 +3476,9 @@ void THive::Handle(NSysView::TEvSysView::TEvGetTabletsRequest::TPtr& ev) { entry->SetNodeId(tablet.NodeId); const auto& resourceValues = tablet.GetResourceValues(); - entry->SetCPU(resourceValues.GetCPU()); - entry->SetMemory(resourceValues.GetMemory()); - entry->SetNetwork(resourceValues.GetNetwork()); + entry->SetCPU(resourceValues.CPU); + entry->SetMemory(resourceValues.Memory); + entry->SetNetwork(resourceValues.Network); for (const auto& follower : tablet.Followers) { auto* entry = record.AddEntries(); @@ -3489,9 +3495,9 @@ void THive::Handle(NSysView::TEvSysView::TEvGetTabletsRequest::TPtr& ev) { entry->SetNodeId(follower.NodeId); const auto& resourceValues = follower.GetResourceValues(); - entry->SetCPU(resourceValues.GetCPU()); - entry->SetMemory(resourceValues.GetMemory()); - entry->SetNetwork(resourceValues.GetNetwork()); + entry->SetCPU(resourceValues.CPU); + entry->SetMemory(resourceValues.Memory); + entry->SetNetwork(resourceValues.Network); } if (count >= limit && i < request.TabletIdsSize() - 1) { @@ -3741,6 +3747,10 @@ void THive::Handle(TEvHive::TEvSetDown::TPtr& ev) { Execute(CreateSetDown(ev)); } +void THive::Handle(TEvPrivate::TEvProcessMetrics::TPtr&) { + Execute(CreateProcessMetrics()); +} + void THive::MakeScaleRecommendation() { BLOG_D("[MSR] Started"); diff --git a/ydb/core/mind/hive/hive_impl.h b/ydb/core/mind/hive/hive_impl.h index a5f80bda0f43..4c107590b879 100644 --- a/ydb/core/mind/hive/hive_impl.h +++ b/ydb/core/mind/hive/hive_impl.h @@ -125,6 +125,7 @@ namespace NHive { TResourceRawValues ResourceRawValuesFromMetrics(const NKikimrTabletBase::TMetrics& metrics); NKikimrTabletBase::TMetrics MetricsFromResourceRawValues(const TResourceRawValues& values); TResourceRawValues ResourceRawValuesFromMetrics(const NKikimrHive::TTabletMetrics& tabletMetrics); +TResourceRawValues ResourceRawValuesFromMetrics(const TMetrics& tabletMetrics); TString GetResourceValuesText(const NKikimrTabletBase::TMetrics& values); TString GetResourceValuesText(const TTabletInfo& tablet); TString GetResourceValuesText(const TResourceRawValues& values); @@ -135,6 +136,7 @@ TString GetResourceValuesHtml(const TResourceRawValues& values); NJson::TJsonValue GetResourceValuesJson(const TResourceRawValues& values); NJson::TJsonValue GetResourceValuesJson(const TResourceRawValues& values, const TResourceRawValues& maximum); TString GetResourceValuesHtml(const NKikimrTabletBase::TMetrics& values); +TString GetResourceValuesHtml(const TMetrics& values); NJson::TJsonValue GetResourceValuesJson(const NKikimrTabletBase::TMetrics& values); ui64 GetReadThroughput(const NKikimrTabletBase::TMetrics& values); ui64 GetWriteThroughput(const NKikimrTabletBase::TMetrics& values); @@ -246,6 +248,7 @@ class THive : public TActor, public TTabletExecutedFlat, public THiveShar friend class TTxMonEvent_StopDomain; friend class TTxUpdatePiles; friend class TTxSetDown; + friend class TTxProcessMetrics; friend class TDeleteTabletActor; @@ -316,6 +319,7 @@ class THive : public TActor, public TTabletExecutedFlat, public THiveShar ITransaction* CreateConfigureScaleRecommender(TEvHive::TEvConfigureScaleRecommender::TPtr event); ITransaction* CreateUpdatePiles(); ITransaction* CreateSetDown(TEvHive::TEvSetDown::TPtr& event); + ITransaction* CreateProcessMetrics(); public: TDomainsView DomainsView; @@ -354,7 +358,7 @@ class THive : public TActor, public TTabletExecutedFlat, public THiveShar std::optional GetPipeToTenantHive(const TNodeInfo* node); struct TAggregateMetrics { - NKikimrTabletBase::TMetrics Metrics; + TMetrics Metrics; ui64 Counter = 0; void IncreaseCount(ui64 value = 1) { @@ -366,14 +370,14 @@ class THive : public TActor, public TTabletExecutedFlat, public THiveShar --Counter; } - void AggregateDiff(const NKikimrTabletBase::TMetrics& before, const NKikimrTabletBase::TMetrics& after, const TTabletInfo* tablet) { + void AggregateDiff(const TMetrics& before, const TMetrics& after, const TTabletInfo* tablet) { AggregateMetricsDiff(Metrics, before, after, tablet); } - NKikimrTabletBase::TMetrics GetAverage() const { - NKikimrTabletBase::TMetrics metrics; + TMetrics GetAverage() const { + TMetrics metrics; if (Counter > 0) { - metrics.CopyFrom(Metrics); + metrics = Metrics; DivideMetrics(metrics, Counter); } return metrics; @@ -418,6 +422,7 @@ class THive : public TActor, public TTabletExecutedFlat, public THiveShar bool LogTabletMovesScheduled = false; bool ProcessStorageBalancerScheduled = false; bool ProcessFollowerUpdatesScheduled = false; + bool ProcessMetricsScheduled = false; TResourceRawValues TotalRawResourceValues = {}; TResourceNormalizedValues TotalNormalizedResourceValues = {}; TInstant LastResourceChangeReaction; @@ -439,6 +444,7 @@ class THive : public TActor, public TTabletExecutedFlat, public THiveShar std::queue StopTenantTabletsQueue; std::queue ResumeTenantTabletsQueue; bool NotEnoughResources = false; + std::queue ProcessMetricsQueue; struct TPendingCreateTablet { NKikimrHive::TEvCreateTablet CreateTablet; @@ -613,6 +619,7 @@ class THive : public TActor, public TTabletExecutedFlat, public THiveShar void Handle(TEvPrivate::TEvUpdateBalanceCounters::TPtr& ev); void Handle(TEvHive::TEvRequestDrainInfo::TPtr& ev); void Handle(TEvHive::TEvSetDown::TPtr& ev); + void Handle(TEvPrivate::TEvProcessMetrics::TPtr& ev); protected: void RestartPipeTx(ui64 tabletId); @@ -709,15 +716,15 @@ TTabletInfo* FindTabletEvenInDeleting(TTabletId tabletId, TFollowerId followerId void ProcessStorageBalancer(); const TVector& GetTabletTypeAllowedMetricIds(TTabletTypes::EType type) const; static const TVector& GetDefaultAllowedMetricIdsForType(TTabletTypes::EType type); - static bool IsValidMetrics(const NKikimrTabletBase::TMetrics& metrics); - static bool IsValidMetricsCPU(const NKikimrTabletBase::TMetrics& metrics); - static bool IsValidMetricsMemory(const NKikimrTabletBase::TMetrics& metrics); - static bool IsValidMetricsNetwork(const NKikimrTabletBase::TMetrics& metrics); + static bool IsValidMetrics(const TMetrics& metrics); + static bool IsValidMetricsCPU(const TMetrics& metrics); + static bool IsValidMetricsMemory(const TMetrics& metrics); + static bool IsValidMetricsNetwork(const TMetrics& metrics); void UpdateTotalResourceValues( const TNodeInfo* node, const TTabletInfo* tablet, - const NKikimrTabletBase::TMetrics& before, - const NKikimrTabletBase::TMetrics& after, + const TMetrics& before, + const TMetrics& after, NKikimr::NHive::TResourceRawValues deltaRaw, NKikimr::NHive::TResourceNormalizedValues deltaNormalized); void FillTabletInfo(NKikimrHive::TEvResponseHiveInfo& response, ui64 tabletId, const TLeaderTabletInfo* info, const NKikimrHive::TEvRequestHiveInfo& req); @@ -1061,6 +1068,7 @@ TTabletInfo* FindTabletEvenInDeleting(TTabletId tabletId, TFollowerId followerId double GetUsage() const; // If the scatter is considered okay, returns nullopt. Otherwise, returns the resource that should be better balanced. std::optional CheckScatter(const TResourceNormalizedValues& scatterByResource) const; + void EnqueueUpdateMetrics(TTabletInfo* tablet); struct THiveStats { struct TNodeStat { @@ -1092,15 +1100,15 @@ TTabletInfo* FindTabletEvenInDeleting(TTabletId tabletId, TFollowerId followerId bool StopSubActor(TSubActorId subActorId); void WaitToMoveTablets(TActorId actor); const NKikimrLocal::TLocalConfig &GetLocalConfig() const { return LocalConfig; } - NKikimrTabletBase::TMetrics GetDefaultResourceValuesForObject(TFullObjectId objectId); - NKikimrTabletBase::TMetrics GetDefaultResourceValuesForTabletType(TTabletTypes::EType type); - NKikimrTabletBase::TMetrics GetDefaultResourceValuesForProfile(TTabletTypes::EType type, const TString& resourceProfile); + TMetrics GetDefaultResourceValuesForObject(TFullObjectId objectId); + TMetrics GetDefaultResourceValuesForTabletType(TTabletTypes::EType type); + TMetrics GetDefaultResourceValuesForProfile(TTabletTypes::EType type, const TString& resourceProfile); static void AggregateMetricsMax(NKikimrTabletBase::TMetrics& aggregate, const NKikimrTabletBase::TMetrics& value); - static void AggregateMetricsDiff(NKikimrTabletBase::TMetrics& aggregate, - const NKikimrTabletBase::TMetrics& before, - const NKikimrTabletBase::TMetrics& after, + static void AggregateMetricsDiff(TMetrics& aggregate, + const TMetrics& before, + const TMetrics& after, const TTabletInfo* tablet); - static void DivideMetrics(NKikimrTabletBase::TMetrics& metrics, ui64 divider); + static void DivideMetrics(TMetrics& metrics, ui64 divider); TVector UpdateStoragePools(const google::protobuf::RepeatedPtrField& groups); void InitDefaultChannelBind(TChannelBind& bind); void RequestPoolsInformation(); diff --git a/ydb/core/mind/hive/hive_impl_ut.cpp b/ydb/core/mind/hive/hive_impl_ut.cpp index 315bd5f77d3d..c7cb09606c5c 100644 --- a/ydb/core/mind/hive/hive_impl_ut.cpp +++ b/ydb/core/mind/hive/hive_impl_ut.cpp @@ -207,7 +207,7 @@ Y_UNIT_TEST_SUITE(THiveImplTest) { for (ui64 i = 0; i < NUM_TABLETS; ++i) { TLeaderTabletInfo& tablet = allTablets.emplace(std::piecewise_construct, std::tuple(i), std::tuple(i, hive)).first->second; - tablet.GetMutableResourceValues().SetMemory(RandomNumber()); + tablet.GetMutableResourceValues().Memory = RandomNumber(); } Ctest << "HIVE_TABLET_BALANCE_STRATEGY_HEAVIEST" << Endl; diff --git a/ydb/core/mind/hive/hive_statics.cpp b/ydb/core/mind/hive/hive_statics.cpp index 8256f59b25d1..1c19d921265c 100644 --- a/ydb/core/mind/hive/hive_statics.cpp +++ b/ydb/core/mind/hive/hive_statics.cpp @@ -22,6 +22,15 @@ TResourceRawValues ResourceRawValuesFromMetrics(const NKikimrTabletBase::TMetric return values; } +TResourceRawValues ResourceRawValuesFromMetrics(const TMetrics& metrics) { + TResourceRawValues values = {}; + std::get(values) = metrics.Counter; + std::get(values) = metrics.CPU; + std::get(values) = metrics.Memory; + std::get(values) = metrics.Network; + return values; +} + NKikimrTabletBase::TMetrics MetricsFromResourceRawValues(const TResourceRawValues& values) { NKikimrTabletBase::TMetrics metrics; if (std::get(values) != 0) { @@ -120,25 +129,25 @@ TString GetResourceValuesText(const TTabletInfo& tablet) { str << '('; str << GetConditionalBoldString( GetConditionalGreyString( - GetCounter(values.GetCounter()), + GetCounter(values.Counter), Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kCounterFieldNumber) == allowedMetricIds.end()), resource == NMetrics::EResource::Counter); str << ", "; str << GetConditionalBoldString( GetConditionalGreyString( - GetTimes(values.GetCPU()), + GetTimes(values.CPU), Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kCPUFieldNumber) == allowedMetricIds.end()), resource == NMetrics::EResource::CPU); str << ", "; str << GetConditionalBoldString( GetConditionalGreyString( - GetBytes(values.GetMemory()), + GetBytes(values.Memory), Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kMemoryFieldNumber) == allowedMetricIds.end()), resource == NMetrics::EResource::Memory); str << ", "; str << GetConditionalBoldString( GetConditionalGreyString( - GetBytesPerSecond(values.GetNetwork()), + GetBytesPerSecond(values.Network), Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kNetworkFieldNumber) == allowedMetricIds.end()), resource == NMetrics::EResource::Network); str << ')'; @@ -233,17 +242,17 @@ TString GetValueWithColoredGlyph(double val, double maxVal) { return Sprintf("%.2f", val) + glyph; } -ui64 GetReadThroughput(const NKikimrTabletBase::TMetrics& values) { +ui64 GetReadThroughput(const TMetrics& values) { ui64 acc = 0; - for (const auto& throughput : values.GetGroupReadThroughput()) { + for (const auto& throughput : values.GroupReadThroughput) { acc += throughput.GetThroughput(); } return acc; } -ui64 GetWriteThroughput(const NKikimrTabletBase::TMetrics& values) { +ui64 GetWriteThroughput(const TMetrics& values) { ui64 acc = 0; - for (const auto& throughput : values.GetGroupWriteThroughput()) { + for (const auto& throughput : values.GroupWriteThroughput) { acc += throughput.GetThroughput(); } return acc; @@ -291,18 +300,18 @@ TString GetTimes(i64 times, const TString& zero) { return Sprintf("%.2f%%", (double)times * 100 / 1000000); } -TString GetResourceValuesHtml(const NKikimrTabletBase::TMetrics& values) { +TString GetResourceValuesHtml(const TMetrics& values) { TStringStream str; str << ""; - str << GetCounter(values.GetCounter(), TString()); - str << ""; - str << GetTimes(values.GetCPU(), TString()); - str << ""; - str << GetBytes(values.GetMemory(), TString()); - str << ""; - str << GetBytesPerSecond(values.GetNetwork(), TString()); - str << ""; - str << GetBytes(values.GetStorage(), TString()); + str << GetCounter(values.Counter, TString()); + str << ""; + str << GetTimes(values.CPU, TString()); + str << ""; + str << GetBytes(values.Memory, TString()); + str << ""; + str << GetBytesPerSecond(values.Network, TString()); + str << ""; + str << GetBytes(values.Storage, TString()); ui64 bytes = GetReadThroughput(values); str << ""; str << GetBytesPerSecond(bytes, TString()); @@ -313,18 +322,6 @@ TString GetResourceValuesHtml(const NKikimrTabletBase::TMetrics& values) { return str.Str(); } -NJson::TJsonValue GetResourceValuesJson(const NKikimrTabletBase::TMetrics& values) { - NJson::TJsonValue value; - value.AppendValue(GetCounter(values.GetCounter())); - value.AppendValue(GetTimes(values.GetCPU())); - value.AppendValue(GetBytes(values.GetMemory())); - value.AppendValue(GetBytesPerSecond(values.GetNetwork())); - value.AppendValue(GetBytes(values.GetStorage())); - value.AppendValue(GetBytesPerSecond(GetReadThroughput(values))); - value.AppendValue(GetBytesPerSecond(GetWriteThroughput(values))); - return value; -} - TString GetDataCenterName(ui64 dataCenterId) { switch (dataCenterId) { case '\0sas': diff --git a/ydb/core/mind/hive/hive_ut.cpp b/ydb/core/mind/hive/hive_ut.cpp index 3cf9b5f5e17d..d82a35c6344e 100644 --- a/ydb/core/mind/hive/hive_ut.cpp +++ b/ydb/core/mind/hive/hive_ut.cpp @@ -8947,6 +8947,37 @@ Y_UNIT_TEST_SUITE(THeavyPerfTest) { } } + + Y_UNIT_TEST(TTestTabletMetrics) { + TTestBasicRuntime runtime(2, false); + Setup(runtime, true); + const ui64 hiveTablet = MakeDefaultHiveID(); + CreateTestBootstrapper(runtime, CreateTestTabletInfo(hiveTablet, TTabletTypes::Hive), &CreateDefaultHive); + NTestSuiteTHiveTest::MakeSureTabletIsUp(runtime, hiveTablet, 0); + TActorId senderB = runtime.AllocateEdgeActor(0); + runtime.SendToPipe(hiveTablet, senderB, new NHive::TEvPrivate::TEvGenerateTestData(), 0, GetPipeConfigWithRetries()); + { + TDispatchOptions options; + options.FinalEvents.emplace_back(TEvTablet::TEvCommit::EventType, 2); + runtime.DispatchEvents(options); + } + + auto rand = CreateDeterministicRandomProvider(9000); + for (size_t j = 0; j < 10000; ++j) { + THolder metrics = MakeHolder(); + for (size_t i = 0; i < 1000; ++i) { + NKikimrHive::TTabletMetrics* metric = metrics->Record.AddTabletMetrics(); + metric->SetTabletID(0x10000 + (rand->GenRand() % 1'000'000)); + metric->MutableResourceUsage()->SetCPU(rand->GenRand() % 1'000'000); + metric->MutableResourceUsage()->SetMemory((rand->GenRand() % 1'000'000)); + } + + runtime.SendToPipe(hiveTablet, senderB, metrics.Release()); + TAutoPtr handle; + runtime.GrabEdgeEventRethrow(handle); + } + + } } Y_UNIT_TEST_SUITE(TStorageBalanceTest) { diff --git a/ydb/core/mind/hive/monitoring.cpp b/ydb/core/mind/hive/monitoring.cpp index 9248554b1027..b95434e1fcc8 100644 --- a/ydb/core/mind/hive/monitoring.cpp +++ b/ydb/core/mind/hive/monitoring.cpp @@ -601,7 +601,7 @@ class TTxMonEvent_Resources : public TTransactionBase { const TTabletInfo& tablet = pr.second; ui64 index = EqualRange(tabletIdIndex.begin(), tabletIdIndex.end(), id).first - tabletIdIndex.begin(); - out << ""; + out << ""; out << "" << id << ""; out << GetResourceValuesHtml(tablet.GetResourceValues()); out << "" << tablet.UsageImpact << ""; @@ -3790,8 +3790,30 @@ class TTxMonEvent_TabletInfo : public TTransactionBase { return result; } - static NJson::TJsonValue MakeFrom(const NKikimrTabletBase::TMetrics& metrics) { - return MakeFrom((const NProtoBuf::Message&)metrics); + static NJson::TJsonValue MakeFrom(const NKikimrTabletBase::TThroughputRecord& record) { + return MakeFrom((const NProtoBuf::Message&)record); + } + + static NJson::TJsonValue MakeFrom(const NKikimrTabletBase::TIopsRecord& record) { + return MakeFrom((const NProtoBuf::Message&)record); + } + + static NJson::TJsonValue MakeFrom(const TMetrics& metrics) { + NJson::TJsonValue result; + result["CPU"] = metrics.CPU; + result["Memory"] = metrics.Memory; + result["Network"] = metrics.Network; + result["Counter"] = metrics.Counter; + result["Storage"] = metrics.Storage; + result["ReadThroughput"] = metrics.ReadThroughput; + result["WriteThroughput"] = metrics.WriteThroughput; + result["ReadIops"] = metrics.ReadIops; + result["WriteIops"] = metrics.WriteIops; + result["GroupReadThroughput"] = MakeFrom(metrics.GroupReadThroughput); + result["GroupWriteThroughput"] = MakeFrom(metrics.GroupWriteThroughput); + result["GroupReadIops"] = MakeFrom(metrics.GroupReadIops); + result["GroupWriteIops"] = MakeFrom(metrics.GroupWriteIops); + return result; } static NJson::TJsonValue MakeFrom(const TTabletMetricsAggregates& aggregates) { diff --git a/ydb/core/mind/hive/node_info.cpp b/ydb/core/mind/hive/node_info.cpp index fb5cb1f29335..f559f6a0a213 100644 --- a/ydb/core/mind/hive/node_info.cpp +++ b/ydb/core/mind/hive/node_info.cpp @@ -68,7 +68,7 @@ bool TNodeInfo::OnTabletChangeVolatileState(TTabletInfo* tablet, TTabletInfo::EV TTabletInfo::EVolatileState oldState = tablet->GetVolatileState(); if (IsResourceDrainingState(oldState)) { if (Tablets[oldState].erase(tablet) != 0) { - UpdateResourceValues(tablet, tablet->GetResourceValues(), NKikimrTabletBase::TMetrics()); + UpdateResourceValues(tablet, tablet->GetResourceValues(), {}); if (!IsResourceDrainingState(newState)) { LastScheduledTablet.reset(); } @@ -89,7 +89,7 @@ bool TNodeInfo::OnTabletChangeVolatileState(TTabletInfo* tablet, TTabletInfo::EV } if (IsResourceDrainingState(newState)) { if (Tablets[newState].insert(tablet).second) { - UpdateResourceValues(tablet, NKikimrTabletBase::TMetrics(), tablet->GetResourceValues()); + UpdateResourceValues(tablet, {}, tablet->GetResourceValues()); if (!IsResourceDrainingState(oldState)) { LastScheduledTablet = {.TabletId = tablet->GetFullTabletId(), .UsageBefore = NodeTotalUsage}; } @@ -109,7 +109,7 @@ bool TNodeInfo::OnTabletChangeVolatileState(TTabletInfo* tablet, TTabletInfo::EV return true; } -void TNodeInfo::UpdateResourceValues(const TTabletInfo* tablet, const NKikimrTabletBase::TMetrics& before, const NKikimrTabletBase::TMetrics& after) { +void TNodeInfo::UpdateResourceValues(const TTabletInfo* tablet, const TMetrics& before, const TMetrics& after) { TResourceRawValues delta = ResourceRawValuesFromMetrics(after) - ResourceRawValuesFromMetrics(before); auto oldResourceValues = ResourceValues; auto oldNormalizedValues = NormalizeRawValues(ResourceValues, ResourceMaximumValues); @@ -425,7 +425,7 @@ void TNodeInfo::UpdateResourceMaximum(const NKikimrTabletBase::TMetrics& metrics std::get(ResourceMaximumValues) = metrics.GetNetwork(); } auto normalizedValues = NormalizeRawValues(ResourceValues, ResourceMaximumValues); - Hive.UpdateTotalResourceValues(nullptr, nullptr, NKikimrTabletBase::TMetrics(), NKikimrTabletBase::TMetrics(), {}, normalizedValues - oldNormalizedValues); + Hive.UpdateTotalResourceValues(nullptr, nullptr, {}, {}, {}, normalizedValues - oldNormalizedValues); } double TNodeInfo::GetNodeUsageForTablet(const TTabletInfo& tablet, bool neighbourPenalty) const { diff --git a/ydb/core/mind/hive/node_info.h b/ydb/core/mind/hive/node_info.h index b2fe835c7c12..5bf35527f693 100644 --- a/ydb/core/mind/hive/node_info.h +++ b/ydb/core/mind/hive/node_info.h @@ -116,7 +116,7 @@ struct TNodeInfo { void ChangeVolatileState(EVolatileState state); bool OnTabletChangeVolatileState(TTabletInfo* tablet, TTabletInfo::EVolatileState newState); - void UpdateResourceValues(const TTabletInfo* tablet, const NKikimrTabletBase::TMetrics& before, const NKikimrTabletBase::TMetrics& after); + void UpdateResourceValues(const TTabletInfo* tablet, const TMetrics& before, const TMetrics& after); ui32 GetTabletsScheduled() const { auto it = Tablets.find(TTabletInfo::EVolatileState::TABLET_VOLATILE_STATE_STARTING); diff --git a/ydb/core/mind/hive/tablet_info.cpp b/ydb/core/mind/hive/tablet_info.cpp index afa722cd40a1..5e3ab1dad083 100644 --- a/ydb/core/mind/hive/tablet_info.cpp +++ b/ydb/core/mind/hive/tablet_info.cpp @@ -357,7 +357,7 @@ void TTabletInfo::UpdateResourceUsage(const NKikimrTabletBase::TMetrics& metrics } else { ResourceMetricsAggregates.MaximumCPU.AdvanceTime(now); } - ResourceValues.SetCPU(ResourceMetricsAggregates.MaximumCPU.GetValue()); + ResourceValues.CPU = ResourceMetricsAggregates.MaximumCPU.GetValue(); } if (HasAllowedMetric(allowedMetricIds, EResourceToBalance::Memory)) { if (metrics.HasMemory()) { @@ -369,7 +369,7 @@ void TTabletInfo::UpdateResourceUsage(const NKikimrTabletBase::TMetrics& metrics } else { ResourceMetricsAggregates.MaximumMemory.AdvanceTime(now); } - ResourceValues.SetMemory(ResourceMetricsAggregates.MaximumMemory.GetValue()); + ResourceValues.Memory = ResourceMetricsAggregates.MaximumMemory.GetValue(); } if (HasAllowedMetric(allowedMetricIds, EResourceToBalance::Network)) { if (metrics.HasNetwork()) { @@ -381,32 +381,28 @@ void TTabletInfo::UpdateResourceUsage(const NKikimrTabletBase::TMetrics& metrics } else { ResourceMetricsAggregates.MaximumNetwork.AdvanceTime(now); } - ResourceValues.SetNetwork(ResourceMetricsAggregates.MaximumNetwork.GetValue()); + ResourceValues.Network = ResourceMetricsAggregates.MaximumNetwork.GetValue(); } if (metrics.HasStorage()) { - ResourceValues.SetStorage(metrics.GetStorage()); + ResourceValues.Storage = metrics.GetStorage(); } if (metrics.HasReadThroughput()) { - ResourceValues.SetReadThroughput(metrics.GetReadThroughput()); + ResourceValues.ReadThroughput = metrics.GetReadThroughput(); } if (metrics.HasWriteThroughput()) { - ResourceValues.SetWriteThroughput(metrics.GetWriteThroughput()); + ResourceValues.WriteThroughput = metrics.GetWriteThroughput(); } if (metrics.GroupReadThroughputSize() > 0) { - ResourceValues.ClearGroupReadThroughput(); - for (const auto& v : metrics.GetGroupReadThroughput()) { - ResourceValues.AddGroupReadThroughput()->CopyFrom(v); - } + const auto& records = metrics.GetGroupReadThroughput(); + ResourceValues.GroupReadThroughput.assign(records.begin(), records.end()); } if (metrics.GroupWriteThroughputSize() > 0) { - ResourceValues.ClearGroupWriteThroughput(); - for (const auto& v : metrics.GetGroupWriteThroughput()) { - ResourceValues.AddGroupWriteThroughput()->CopyFrom(v); - } + const auto& records = metrics.GetGroupWriteThroughput(); + ResourceValues.GroupWriteThroughput.assign(records.begin(), records.end()); } - i64 counterBefore = ResourceValues.GetCounter(); + i64 counterBefore = ResourceValues.Counter; ActualizeCounter(); - i64 counterAfter = ResourceValues.GetCounter(); + i64 counterAfter = ResourceValues.Counter; const auto& after = ResourceValues; if (Node != nullptr) { if (IsResourceDrainingState(VolatileState)) { @@ -437,26 +433,26 @@ i64 TTabletInfo::GetCounterValue() const { const auto& allowedMetricIds = GetTabletAllowedMetricIds(); if (HasAllowedMetric(allowedMetricIds, EResourceToBalance::CPU) && (ResourceMetricsAggregates.MaximumCPU.GetAllTimeMaximum() > 0 - || ResourceValues.GetCPU() > 0)) { + || ResourceValues.CPU > 0)) { return 0; } if (HasAllowedMetric(allowedMetricIds, EResourceToBalance::Memory) && (ResourceMetricsAggregates.MaximumMemory.GetAllTimeMaximum() > 0 - || ResourceValues.GetMemory() > 0)) { + || ResourceValues.Memory > 0)) { return 0; } if (HasAllowedMetric(allowedMetricIds, EResourceToBalance::Network) && (ResourceMetricsAggregates.MaximumNetwork.GetAllTimeMaximum() > 0 - || ResourceValues.GetNetwork() > 0)) { + || ResourceValues.Network > 0)) { return 0; } return 1; } void TTabletInfo::FilterRawValues(TResourceRawValues& values) const { - const NKikimrTabletBase::TMetrics& metrics(ResourceValues); + const auto& metrics(ResourceValues); const TVector& allowedMetricIds = GetTabletAllowedMetricIds(); - if (metrics.GetCounter() == 0) { + if (metrics.Counter == 0) { std::get(values) = 0; } if (!HasAllowedMetric(allowedMetricIds, EResourceToBalance::CPU) || !THive::IsValidMetricsCPU(metrics)) { @@ -471,9 +467,9 @@ void TTabletInfo::FilterRawValues(TResourceRawValues& values) const { } void TTabletInfo::FilterRawValues(TResourceNormalizedValues& values) const { - const NKikimrTabletBase::TMetrics& metrics(ResourceValues); + const auto& metrics(ResourceValues); const TVector& allowedMetricIds = GetTabletAllowedMetricIds(); - if (metrics.GetCounter() == 0) { + if (metrics.Counter == 0) { std::get(values) = 0; } if (!HasAllowedMetric(allowedMetricIds, EResourceToBalance::CPU) || !THive::IsValidMetricsCPU(metrics)) { @@ -488,7 +484,7 @@ void TTabletInfo::FilterRawValues(TResourceNormalizedValues& values) const { } void TTabletInfo::ActualizeCounter() { - ResourceValues.SetCounter(GetCounterValue()); + ResourceValues.Counter = GetCounterValue(); } const TNodeFilter& TTabletInfo::GetNodeFilter() const { diff --git a/ydb/core/mind/hive/tablet_info.h b/ydb/core/mind/hive/tablet_info.h index 050b010f37c1..6925c89a2b77 100644 --- a/ydb/core/mind/hive/tablet_info.h +++ b/ydb/core/mind/hive/tablet_info.h @@ -150,7 +150,7 @@ struct TTabletInfo { TString GetLogPrefix() const; protected: - NKikimrTabletBase::TMetrics ResourceValues; // current values of various metrics + TMetrics ResourceValues; // current values of various metrics TTabletMetricsAggregates ResourceMetricsAggregates; TResourceNormalizedValues ResourceNormalizedValues; @@ -166,6 +166,7 @@ struct TTabletInfo { TNodeFilter NodeFilter; bool InWaitQueue = false; double UsageImpact = 0; + bool UpdateMetricsEnqueued = false; TTabletInfo(ETabletRole role, THive& hive); TTabletInfo(const TTabletInfo&) = delete; @@ -281,7 +282,7 @@ struct TTabletInfo { const TNodeFilter& GetNodeFilter() const; bool InitiateStart(TNodeInfo* node); - const NKikimrTabletBase::TMetrics& GetResourceValues() const { + const TMetrics& GetResourceValues() const { return ResourceValues; } @@ -298,7 +299,7 @@ struct TTabletInfo { } // ONLY for use in unit tests - NKikimrTabletBase::TMetrics& GetMutableResourceValues() { + TMetrics& GetMutableResourceValues() { return ResourceValues; } diff --git a/ydb/core/mind/hive/tx__create_tablet.cpp b/ydb/core/mind/hive/tx__create_tablet.cpp index 8aa71273607d..0c02bb3dc644 100644 --- a/ydb/core/mind/hive/tx__create_tablet.cpp +++ b/ydb/core/mind/hive/tx__create_tablet.cpp @@ -442,14 +442,14 @@ class TTxCreateTablet : public TTransactionBase { NKikimrTabletBase::TMetrics resourceValues; - resourceValues.CopyFrom(Self->GetDefaultResourceValuesForTabletType(tablet.Type)); + Self->GetDefaultResourceValuesForTabletType(tablet.Type).ToProto(&resourceValues); BLOG_D("THive::TTxCreateTablet::Execute; Default resources after merge for type " << tablet.Type << ": {" << resourceValues.ShortDebugString() << "}"); if (IsValidObjectId(tablet.ObjectId)) { - resourceValues.MergeFrom(Self->GetDefaultResourceValuesForObject(tablet.ObjectId)); + Self->GetDefaultResourceValuesForObject(tablet.ObjectId).ToProto(&resourceValues); BLOG_D("THive::TTxCreateTablet::Execute; Default resources after merge for object " << tablet.ObjectId << ": {" << resourceValues.ShortDebugString() << "}"); } // TODO: provide Hive with resource profile used by the tablet instead of default one. - resourceValues.MergeFrom(Self->GetDefaultResourceValuesForProfile(tablet.Type, "default")); + Self->GetDefaultResourceValuesForProfile(tablet.Type, "default").ToProto(&resourceValues); BLOG_D("THive::TTxCreateTablet::Execute; Default resources after merge for profile 'default': {" << resourceValues.ShortDebugString() << "}"); if (resourceValues.ByteSize() == 0) { resourceValues.SetStorage(1ULL << 30); // 1 GB diff --git a/ydb/core/mind/hive/tx__process_metrics.cpp b/ydb/core/mind/hive/tx__process_metrics.cpp new file mode 100644 index 000000000000..d40542e63f19 --- /dev/null +++ b/ydb/core/mind/hive/tx__process_metrics.cpp @@ -0,0 +1,52 @@ +#include "hive_impl.h" +#include "hive_log.h" + +namespace NKikimr { +namespace NHive { + +class TTxProcessMetrics : public TTransactionBase { + TSideEffects SideEffects; + + static constexpr size_t MAX_UPDATES_PROCESSED = 200; +public: + TTxProcessMetrics(THive* hive) + : TBase(hive) + {} + + TTxType GetTxType() const override { return NHive::TXTYPE_PROCESS_METRICS; } + + bool Execute(TTransactionContext& txc, const TActorContext&) override { + BLOG_D("TTxProcessMetrics::Execute()"); + NIceDb::TNiceDb db(txc.DB); + SideEffects.Reset(Self->SelfId()); + for (size_t i = 0; !Self->ProcessMetricsQueue.empty() && i < MAX_UPDATES_PROCESSED; ++i) { + auto tabletId = Self->ProcessMetricsQueue.front(); + Self->ProcessMetricsQueue.pop(); + auto* tablet = Self->FindTablet(tabletId); + if (tablet == nullptr) { + continue; + } + tablet->UpdateMetricsEnqueued = false; + NKikimrTabletBase::TMetrics protoMetrics; + tablet->GetResourceValues().ToProto(&protoMetrics); + db.Table().Key(tabletId).Update(protoMetrics); + } + if (Self->ProcessMetricsQueue.empty()) { + Self->ProcessMetricsScheduled = false; + } else { + SideEffects.Send(Self->SelfId(), new TEvPrivate::TEvProcessMetrics); + } + return true; + } + + void Complete(const TActorContext& ctx) override { + SideEffects.Complete(ctx); + } +}; + +ITransaction* THive::CreateProcessMetrics() { + return new TTxProcessMetrics(this); +} + +} // NHive +} // NKikimr diff --git a/ydb/core/mind/hive/tx__seize_tablets.cpp b/ydb/core/mind/hive/tx__seize_tablets.cpp index 20a5947b9d6b..c72d175d62b1 100644 --- a/ydb/core/mind/hive/tx__seize_tablets.cpp +++ b/ydb/core/mind/hive/tx__seize_tablets.cpp @@ -79,7 +79,7 @@ class TTxSeizeTablets : public TTransactionBase { tabletInfo.SetLockedReconnectTimeout(tablet.LockedReconnectTimeout.MilliSeconds()); tabletInfo.SetTabletStorageVersion(tablet.TabletStorageInfo->Version); tabletInfo.SetTabletBootMode(tablet.BootMode); - tabletInfo.MutableResourceUsage()->CopyFrom(tablet.GetResourceValues()); + tablet.GetResourceValues().ToProto(tabletInfo.MutableResourceUsage()); TSubDomainKey objectDomain = TSubDomainKey(tabletRowset.GetValueOrDefault()); tabletInfo.MutableObjectDomain()->CopyFrom(objectDomain); diff --git a/ydb/core/mind/hive/tx__update_tablet_metrics.cpp b/ydb/core/mind/hive/tx__update_tablet_metrics.cpp index b927a718ed0a..d03695e607ae 100644 --- a/ydb/core/mind/hive/tx__update_tablet_metrics.cpp +++ b/ydb/core/mind/hive/tx__update_tablet_metrics.cpp @@ -28,9 +28,6 @@ class TTxUpdateTabletMetrics : public TTransactionBase { TTabletInfo* tablet = Self->FindTablet(tabletId, followerId); if (tablet != nullptr && metrics.HasResourceUsage()) { tablet->UpdateResourceUsage(metrics.GetResourceUsage()); - const NKikimrTabletBase::TMetrics& metrics(tablet->GetResourceValues()); - - db.Table().Key(tabletId, followerId).Update(metrics); db.Table().Key(tabletId, followerId).Update(tablet->GetResourceMetricsAggregates().MaximumCPU); db.Table().Key(tabletId, followerId).Update(tablet->GetResourceMetricsAggregates().MaximumMemory); @@ -38,6 +35,7 @@ class TTxUpdateTabletMetrics : public TTransactionBase { tablet->Statistics.SetLastAliveTimestamp(now.MilliSeconds()); tablet->ActualizeTabletStatistics(now); + Self->EnqueueUpdateMetrics(tablet); if (tablet->IsLeader()) { db.Table() diff --git a/ydb/core/mind/hive/ya.make b/ydb/core/mind/hive/ya.make index ede60b249773..f746bd925c93 100644 --- a/ydb/core/mind/hive/ya.make +++ b/ydb/core/mind/hive/ya.make @@ -61,6 +61,7 @@ SRCS( tx__load_everything.cpp tx__lock_tablet.cpp tx__process_boot_queue.cpp + tx__process_metrics.cpp tx__process_pending_operations.cpp tx__reassign_groups.cpp tx__reassign_groups_on_decommit.cpp diff --git a/ydb/core/protos/counters_hive.proto b/ydb/core/protos/counters_hive.proto index fcba4b389ab7..8870643edc93 100644 --- a/ydb/core/protos/counters_hive.proto +++ b/ydb/core/protos/counters_hive.proto @@ -187,4 +187,5 @@ enum ETxTypes { TXTYPE_CONFIGURE_SCALE_RECOMMENDER = 67 [(TxTypeOpts) = {Name: "TxConfigureScaleRecommender"}]; TXTYPE_UPDATE_PILES = 68 [(TxTypeOpts) = {Name: "TxUpdatePiles"}]; TXTYPE_MON_SET_DOMAIN = 69 [(TxTypeOpts) = {Name: "TxMonSetDomain"}]; + TXTYPE_PROCESS_METRICS = 70 [(TxTypeOpts) = {Name: "TxProcessMetrics"}]; }