Skip to content

Commit 97effe9

Browse files
authored
Retry more errors in block storage querier (#5558)
* retry chunk pool exhaustion error and client conn closing error in block storage querier
  Signed-off-by: Ben Ye <[email protected]>
* changelog
  Signed-off-by: Ben Ye <[email protected]>
* fix lint
  Signed-off-by: Ben Ye <[email protected]>
* remove retry on chunk pool exhaustion error for now
  Signed-off-by: Ben Ye <[email protected]>
* update changelog
  Signed-off-by: Ben Ye <[email protected]>
---------
Signed-off-by: Ben Ye <[email protected]>
1 parent 5ca94a0 commit 97effe9

File tree

3 files changed

+34
-0
lines changed

3 files changed

+34
-0
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@
6060
* [ENHANCEMENT] Store Gateway: add metric `cortex_bucket_store_chunk_refetches_total` for number of chunk refetches. #5532
6161
* [ENHANCEMENT] BasicLifeCycler: allow final-sleep during shutdown #5517
6262
* [ENHANCEMENT] All: Handling CMK Access Denied errors. #5420 #5542
63+
* [ENHANCEMENT] Querier: Retry store gateway client connection closing gRPC error. #5558
6364
* [BUGFIX] Ruler: Validate if rule group can be safely converted back to rule group yaml from protobuf message #5265
6465
* [BUGFIX] Querier: Convert gRPC `ResourceExhausted` status code from store gateway to 422 limit error. #5286
6566
* [BUGFIX] Alertmanager: Route web-ui requests to the alertmanager distributor when sharding is enabled. #5293

pkg/querier/blocks_store_queryable.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1118,6 +1118,10 @@ func isRetryableError(err error) bool {
11181118
return true
11191119
case codes.ResourceExhausted:
11201120
return errors.Is(err, storegateway.ErrTooManyInflightRequests)
1121+
// Client side connection closing, this error happens during store gateway deployment.
1122+
// https://github.com/grpc/grpc-go/blob/03172006f5d168fc646d87928d85cb9c4a480291/clientconn.go#L67
1123+
case codes.Canceled:
1124+
return strings.Contains(err.Error(), "grpc: the client connection is closing")
11211125
default:
11221126
return false
11231127
}

pkg/querier/blocks_store_queryable_test.go

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -639,6 +639,35 @@ func TestBlocksStoreQuerier_Select(t *testing.T) {
639639
},
640640
},
641641
},
642+
"multiple store-gateways has the block, but one of them fails to return due to clientconn closing": {
643+
finderResult: bucketindex.Blocks{
644+
{ID: block1},
645+
},
646+
storeSetResponses: []interface{}{
647+
map[BlocksStoreClient][]ulid.ULID{
648+
&storeGatewayClientMock{
649+
remoteAddr: "1.1.1.1",
650+
mockedSeriesErr: status.Error(codes.Canceled, "grpc: the client connection is closing"),
651+
}: {block1},
652+
},
653+
map[BlocksStoreClient][]ulid.ULID{
654+
&storeGatewayClientMock{remoteAddr: "2.2.2.2", mockedSeriesResponses: []*storepb.SeriesResponse{
655+
mockSeriesResponse(labels.Labels{metricNameLabel, series1Label}, minT, 2),
656+
mockHintsResponse(block1),
657+
}}: {block1},
658+
},
659+
},
660+
limits: &blocksStoreLimitsMock{},
661+
queryLimiter: noOpQueryLimiter,
662+
expectedSeries: []seriesResult{
663+
{
664+
lbls: labels.New(metricNameLabel, series1Label),
665+
values: []valueResult{
666+
{t: minT, v: 2},
667+
},
668+
},
669+
},
670+
},
642671
"all store-gateways return PermissionDenied": {
643672
finderResult: bucketindex.Blocks{
644673
{ID: block1},

0 commit comments

Comments
 (0)