Skip to content

Commit 484820a

Browse files
authored
[WRR] backport #33694 to 1.56 (#33698)
1 parent 6425ed2 commit 484820a

File tree

4 files changed

+128
-5
lines changed

4 files changed

+128
-5
lines changed

src/core/ext/filters/client_channel/lb_policy/outlier_detection/outlier_detection.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -299,7 +299,7 @@ class OutlierDetectionLb : public LoadBalancingPolicy {
299299
}
300300

301301
void DisableEjection() {
302-
Uneject();
302+
if (ejection_time_.has_value()) Uneject();
303303
multiplier_ = 0;
304304
}
305305

src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/weighted_round_robin.cc

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -909,12 +909,21 @@ void WeightedRoundRobin::WeightedRoundRobinSubchannelData::
909909
subchannel()->RequestConnection();
910910
} else if (new_state == GRPC_CHANNEL_READY) {
911911
// If we transition back to READY state, restart the blackout period.
912+
// Skip this if this is the initial notification for this
913+
// subchannel (which happens whenever we get updated addresses and
914+
// create a new endpoint list). Also skip it if the previous state
915+
// was READY (which should never happen in practice, but we've seen
916+
// at least one bug that caused this in the outlier_detection
917+
// policy, so let's be defensive here).
918+
//
912919
// Note that we cannot guarantee that we will never receive
913920
// lingering callbacks for backend metric reports from the previous
914921
// connection after the new connection has been established, but they
915922
// should be masked by new backend metric reports from the new
916923
// connection by the time the blackout period ends.
917-
weight_->ResetNonEmptySince();
924+
if (old_state.has_value() && old_state != GRPC_CHANNEL_READY) {
925+
weight_->ResetNonEmptySince();
926+
}
918927
}
919928
// Update logical connectivity state.
920929
UpdateLogicalConnectivityStateLocked(new_state);

test/core/client_channel/lb_policy/weighted_round_robin_test.cc

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,7 @@ class WeightedRoundRobinTest : public TimeAwareLoadBalancingPolicyTest {
252252
backend_metrics,
253253
std::map<absl::string_view /*address*/, size_t /*num_picks*/> expected,
254254
absl::Duration timeout = absl::Seconds(5),
255+
bool run_timer_callbacks = true,
255256
SourceLocation location = SourceLocation()) {
256257
gpr_log(GPR_INFO, "==> WaitForWeightedRoundRobinPicks(): Expecting %s",
257258
PickMapString(expected).c_str());
@@ -308,7 +309,7 @@ class WeightedRoundRobinTest : public TimeAwareLoadBalancingPolicyTest {
308309
EXPECT_NE(*picker, nullptr)
309310
<< location.file() << ":" << location.line();
310311
if (*picker == nullptr) return false;
311-
} else {
312+
} else if (run_timer_callbacks) {
312313
gpr_log(GPR_INFO, "running timer callback...");
313314
RunTimerCallback();
314315
}
@@ -803,6 +804,45 @@ TEST_F(WeightedRoundRobinTest, BlackoutPeriodAfterDisconnect) {
803804
{{kAddresses[0], 1}, {kAddresses[1], 3}, {kAddresses[2], 3}});
804805
}
805806

807+
TEST_F(WeightedRoundRobinTest, BlackoutPeriodDoesNotGetResetAfterUpdate) {
808+
// Send address list to LB policy.
809+
const std::array<absl::string_view, 3> kAddresses = {
810+
"ipv4:127.0.0.1:441", "ipv4:127.0.0.1:442", "ipv4:127.0.0.1:443"};
811+
auto config_builder =
812+
ConfigBuilder().SetWeightExpirationPeriod(Duration::Seconds(2));
813+
auto picker =
814+
SendInitialUpdateAndWaitForConnected(kAddresses, config_builder);
815+
ASSERT_NE(picker, nullptr);
816+
// All backends report weights.
817+
WaitForWeightedRoundRobinPicks(
818+
&picker,
819+
{{kAddresses[0], MakeBackendMetricData(/*app_utilization=*/0.9,
820+
/*qps=*/100.0, /*eps=*/0.0)},
821+
{kAddresses[1], MakeBackendMetricData(/*app_utilization=*/0.3,
822+
/*qps=*/100.0, /*eps=*/0.0)},
823+
{kAddresses[2], MakeBackendMetricData(/*app_utilization=*/0.3,
824+
/*qps=*/100.0, /*eps=*/0.0)}},
825+
{{kAddresses[0], 1}, {kAddresses[1], 3}, {kAddresses[2], 3}});
826+
// Send a duplicate update with the same addresses and config.
827+
EXPECT_EQ(ApplyUpdate(BuildUpdate(kAddresses, config_builder.Build()),
828+
lb_policy_.get()),
829+
absl::OkStatus());
830+
// Note that we have not advanced time, so if the update incorrectly
831+
// triggers resetting the blackout period, none of the weights will
832+
// actually be used.
833+
picker = ExpectState(GRPC_CHANNEL_READY, absl::OkStatus());
834+
WaitForWeightedRoundRobinPicks(
835+
&picker,
836+
{{kAddresses[0], MakeBackendMetricData(/*app_utilization=*/0.9,
837+
/*qps=*/100.0, /*eps=*/0.0)},
838+
{kAddresses[1], MakeBackendMetricData(/*app_utilization=*/0.3,
839+
/*qps=*/100.0, /*eps=*/0.0)},
840+
{kAddresses[2], MakeBackendMetricData(/*app_utilization=*/0.3,
841+
/*qps=*/100.0, /*eps=*/0.0)}},
842+
{{kAddresses[0], 1}, {kAddresses[1], 3}, {kAddresses[2], 3}},
843+
/*timeout=*/absl::Seconds(5), /*run_timer_callbacks=*/false);
844+
}
845+
806846
TEST_F(WeightedRoundRobinTest, ZeroErrorUtilPenalty) {
807847
// Send address list to LB policy.
808848
const std::array<absl::string_view, 3> kAddresses = {

test/cpp/end2end/client_lb_end2end_test.cc

Lines changed: 76 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3174,13 +3174,27 @@ const char kServiceConfigOob[] =
31743174
" ]\n"
31753175
"}";
31763176

3177+
const char kServiceConfigWithOutlierDetection[] =
3178+
"{\n"
3179+
" \"loadBalancingConfig\": [\n"
3180+
" {\"outlier_detection_experimental\": {\n"
3181+
" \"childPolicy\": [\n"
3182+
" {\"weighted_round_robin\": {\n"
3183+
" \"blackoutPeriod\": \"%ds\",\n"
3184+
" \"weightUpdatePeriod\": \"0.1s\"\n"
3185+
" }}\n"
3186+
" ]\n"
3187+
" }}\n"
3188+
" ]\n"
3189+
"}";
3190+
31773191
class WeightedRoundRobinTest : public ClientLbEnd2endTest {
31783192
protected:
31793193
void ExpectWeightedRoundRobinPicks(
31803194
const grpc_core::DebugLocation& location,
31813195
const std::unique_ptr<grpc::testing::EchoTestService::Stub>& stub,
31823196
const std::vector<size_t>& expected_weights, size_t total_passes = 3,
3183-
EchoRequest* request_ptr = nullptr) {
3197+
EchoRequest* request_ptr = nullptr, int timeout_ms = 15000) {
31843198
GPR_ASSERT(expected_weights.size() == servers_.size());
31853199
size_t total_picks_per_pass = 0;
31863200
for (size_t picks : expected_weights) {
@@ -3210,7 +3224,7 @@ class WeightedRoundRobinTest : public ClientLbEnd2endTest {
32103224
}
32113225
return true;
32123226
},
3213-
request_ptr);
3227+
request_ptr, timeout_ms);
32143228
}
32153229
};
32163230

@@ -3254,6 +3268,66 @@ TEST_F(WeightedRoundRobinTest, CallAndServerMetric) {
32543268
EXPECT_EQ("weighted_round_robin", channel->GetLoadBalancingPolicyName());
32553269
}
32563270

3271+
// This tests a bug seen in production where the outlier_detection
3272+
// policy would incorrectly generate a duplicate READY notification on
3273+
// all of its subchannels every time it saw an update, thus causing the
3274+
// WRR policy to re-enter the blackout period for that address.
3275+
TEST_F(WeightedRoundRobinTest, WithOutlierDetection) {
3276+
const int kBlackoutPeriodSeconds = 5;
3277+
const int kNumServers = 3;
3278+
StartServers(kNumServers);
3279+
// Report server metrics that should give 6:4:3 WRR picks.
3280+
// weights = qps / (util + (eps/qps)) =
3281+
// 1/(0.2+0.2) : 1/(0.3+0.3) : 2/(1.5+0.1) = 6:4:3
3282+
// where util is app_util if set, or cpu_util.
3283+
servers_[0]->server_metric_recorder_->SetApplicationUtilization(0.2);
3284+
servers_[0]->server_metric_recorder_->SetEps(20);
3285+
servers_[0]->server_metric_recorder_->SetQps(100);
3286+
servers_[1]->server_metric_recorder_->SetApplicationUtilization(0.3);
3287+
servers_[1]->server_metric_recorder_->SetEps(30);
3288+
servers_[1]->server_metric_recorder_->SetQps(100);
3289+
servers_[2]->server_metric_recorder_->SetApplicationUtilization(1.5);
3290+
servers_[2]->server_metric_recorder_->SetEps(20);
3291+
servers_[2]->server_metric_recorder_->SetQps(200);
3292+
// Create channel.
3293+
// Initial blackout period is 0, so that we start seeing traffic in
3294+
// the right proportions right away.
3295+
auto response_generator = BuildResolverResponseGenerator();
3296+
auto channel = BuildChannel("", response_generator);
3297+
auto stub = BuildStub(channel);
3298+
response_generator.SetNextResolution(
3299+
GetServersPorts(),
3300+
absl::StrFormat(kServiceConfigWithOutlierDetection, 0).c_str());
3301+
// Send requests with per-call reported EPS/QPS set to 0/100.
3302+
// This should give 1/2:1/3:1/15 = 15:10:2 WRR picks.
3303+
// Keep sending RPCs long enough to go past the new blackout period
3304+
// that we're going to add later.
3305+
absl::Time deadline =
3306+
absl::Now() +
3307+
absl::Seconds(kBlackoutPeriodSeconds * grpc_test_slowdown_factor());
3308+
EchoRequest request;
3309+
// We cannot override with 0 with proto3, so setting it to almost 0.
3310+
request.mutable_param()->mutable_backend_metrics()->set_eps(
3311+
std::numeric_limits<double>::min());
3312+
request.mutable_param()->mutable_backend_metrics()->set_rps_fractional(100);
3313+
do {
3314+
ExpectWeightedRoundRobinPicks(DEBUG_LOCATION, stub,
3315+
/*expected_weights=*/{15, 10, 2},
3316+
/*total_passes=*/3, &request);
3317+
} while (absl::Now() < deadline);
3318+
// Send a new resolver response that increases blackout period.
3319+
response_generator.SetNextResolution(
3320+
GetServersPorts(),
3321+
absl::StrFormat(kServiceConfigWithOutlierDetection,
3322+
kBlackoutPeriodSeconds * grpc_test_slowdown_factor())
3323+
.c_str());
3324+
// Weights should be the same before the blackout period expires.
3325+
ExpectWeightedRoundRobinPicks(
3326+
DEBUG_LOCATION, stub, /*expected_weights=*/{15, 10, 2},
3327+
/*total_passes=*/3, &request,
3328+
/*timeout_ms=*/(kBlackoutPeriodSeconds - 1) * 1000);
3329+
}
3330+
32573331
class WeightedRoundRobinParamTest
32583332
: public WeightedRoundRobinTest,
32593333
public ::testing::WithParamInterface<const char*> {};

0 commit comments

Comments
 (0)