From 687f78bc206ce95b6a073ca0162fc3c3f215736c Mon Sep 17 00:00:00 2001 From: Ben Ye Date: Thu, 17 Oct 2024 12:05:32 -0700 Subject: [PATCH 1/3] expose flag for cortex consistency check Signed-off-by: Ben Ye --- pkg/querier/blocks_store_queryable.go | 55 ++++++++++++---------- pkg/querier/blocks_store_queryable_test.go | 11 ++++- pkg/querier/querier.go | 9 ++++ 3 files changed, 49 insertions(+), 26 deletions(-) diff --git a/pkg/querier/blocks_store_queryable.go b/pkg/querier/blocks_store_queryable.go index 6d09193edc8..9c08b40ee1a 100644 --- a/pkg/querier/blocks_store_queryable.go +++ b/pkg/querier/blocks_store_queryable.go @@ -141,7 +141,8 @@ type BlocksStoreQueryable struct { metrics *blocksStoreQueryableMetrics limits BlocksStoreLimits - storeGatewayQueryStatsEnabled bool + storeGatewayQueryStatsEnabled bool + storeGatewayConsistencyCheckMaxAttempts int // Subservices manager. subservices *services.Manager @@ -153,8 +154,7 @@ func NewBlocksStoreQueryable( finder BlocksFinder, consistency *BlocksConsistencyChecker, limits BlocksStoreLimits, - queryStoreAfter time.Duration, - storeGatewayQueryStatsEnabled bool, + config Config, logger log.Logger, reg prometheus.Registerer, ) (*BlocksStoreQueryable, error) { @@ -164,16 +164,17 @@ func NewBlocksStoreQueryable( } q := &BlocksStoreQueryable{ - stores: stores, - finder: finder, - consistency: consistency, - queryStoreAfter: queryStoreAfter, - logger: logger, - subservices: manager, - subservicesWatcher: services.NewFailureWatcher(), - metrics: newBlocksStoreQueryableMetrics(reg), - limits: limits, - storeGatewayQueryStatsEnabled: storeGatewayQueryStatsEnabled, + stores: stores, + finder: finder, + consistency: consistency, + queryStoreAfter: config.QueryStoreAfter, + logger: logger, + subservices: manager, + subservicesWatcher: services.NewFailureWatcher(), + metrics: newBlocksStoreQueryableMetrics(reg), + limits: limits, + storeGatewayQueryStatsEnabled: config.StoreGatewayQueryStatsEnabled, + 
storeGatewayConsistencyCheckMaxAttempts: config.StoreGatewayConsistencyCheckMaxAttempts, } q.Service = services.NewBasicService(q.starting, q.running, q.stopping) @@ -264,7 +265,7 @@ func NewBlocksStoreQueryableFromConfig(querierCfg Config, gatewayCfg storegatewa reg, ) - return NewBlocksStoreQueryable(stores, finder, consistency, limits, querierCfg.QueryStoreAfter, querierCfg.StoreGatewayQueryStatsEnabled, logger, reg) + return NewBlocksStoreQueryable(stores, finder, consistency, limits, querierCfg, logger, reg) } func (q *BlocksStoreQueryable) starting(ctx context.Context) error { @@ -299,16 +300,17 @@ func (q *BlocksStoreQueryable) Querier(mint, maxt int64) (storage.Querier, error } return &blocksStoreQuerier{ - minT: mint, - maxT: maxt, - finder: q.finder, - stores: q.stores, - metrics: q.metrics, - limits: q.limits, - consistency: q.consistency, - logger: q.logger, - queryStoreAfter: q.queryStoreAfter, - storeGatewayQueryStatsEnabled: q.storeGatewayQueryStatsEnabled, + minT: mint, + maxT: maxt, + finder: q.finder, + stores: q.stores, + metrics: q.metrics, + limits: q.limits, + consistency: q.consistency, + logger: q.logger, + queryStoreAfter: q.queryStoreAfter, + storeGatewayQueryStatsEnabled: q.storeGatewayQueryStatsEnabled, + storeGatewayConsistencyCheckMaxAttempts: q.storeGatewayConsistencyCheckMaxAttempts, }, nil } @@ -328,6 +330,9 @@ type blocksStoreQuerier struct { // If enabled, query stats of store gateway requests will be logged // using `info` level. storeGatewayQueryStatsEnabled bool + + // The maximum number of times we attempt fetching missing blocks from different Store Gateways. + storeGatewayConsistencyCheckMaxAttempts int } // Select implements storage.Querier interface. 
@@ -534,7 +539,7 @@ func (q *blocksStoreQuerier) queryWithConsistencyCheck(ctx context.Context, logg retryableError error ) - for attempt := 1; attempt <= maxFetchSeriesAttempts; attempt++ { + for attempt := 1; attempt <= q.storeGatewayConsistencyCheckMaxAttempts; attempt++ { // Find the set of store-gateway instances having the blocks. The exclude parameter is the // map of blocks queried so far, with the list of store-gateway addresses for each block. clients, err := q.stores.GetClientsFor(userID, remainingBlocks, attemptedBlocks, attemptedBlocksZones) diff --git a/pkg/querier/blocks_store_queryable_test.go b/pkg/querier/blocks_store_queryable_test.go index 5cbb2d2020e..390f27107af 100644 --- a/pkg/querier/blocks_store_queryable_test.go +++ b/pkg/querier/blocks_store_queryable_test.go @@ -1552,6 +1552,8 @@ func TestBlocksStoreQuerier_Select(t *testing.T) { logger: log.NewNopLogger(), metrics: newBlocksStoreQueryableMetrics(reg), limits: testData.limits, + + storeGatewayConsistencyCheckMaxAttempts: 3, } matchers := []*labels.Matcher{ @@ -2148,6 +2150,8 @@ func TestBlocksStoreQuerier_Labels(t *testing.T) { logger: log.NewNopLogger(), metrics: newBlocksStoreQueryableMetrics(reg), limits: &blocksStoreLimitsMock{}, + + storeGatewayConsistencyCheckMaxAttempts: 3, } if testFunc == "LabelNames" { @@ -2371,7 +2375,12 @@ func TestBlocksStoreQuerier_PromQLExecution(t *testing.T) { } // Instance the querier that will be executed to run the query. 
- queryable, err := NewBlocksStoreQueryable(stores, finder, NewBlocksConsistencyChecker(0, 0, logger, nil), &blocksStoreLimitsMock{}, 0, false, logger, nil) + cfg := Config{ + QueryStoreAfter: 0, + StoreGatewayQueryStatsEnabled: false, + StoreGatewayConsistencyCheckMaxAttempts: 3, + } + queryable, err := NewBlocksStoreQueryable(stores, finder, NewBlocksConsistencyChecker(0, 0, logger, nil), &blocksStoreLimitsMock{}, cfg, logger, nil) require.NoError(t, err) require.NoError(t, services.StartAndAwaitRunning(context.Background(), queryable)) defer services.StopAndAwaitTerminated(context.Background(), queryable) // nolint:errcheck diff --git a/pkg/querier/querier.go b/pkg/querier/querier.go index 77a04b20931..34d1f8b19ed 100644 --- a/pkg/querier/querier.go +++ b/pkg/querier/querier.go @@ -79,6 +79,9 @@ type Config struct { StoreGatewayClient ClientConfig `yaml:"store_gateway_client"` StoreGatewayQueryStatsEnabled bool `yaml:"store_gateway_query_stats"` + // The maximum number of times we attempt fetching missing blocks from different Store Gateways. + StoreGatewayConsistencyCheckMaxAttempts int `yaml:"store_gateway_consistency_check_max_attempts"` + ShuffleShardingIngestersLookbackPeriod time.Duration `yaml:"shuffle_sharding_ingesters_lookback_period"` // Experimental. Use https://github.com/thanos-io/promql-engine rather than @@ -94,6 +97,7 @@ var ( errShuffleShardingLookbackLessThanQueryStoreAfter = errors.New("the shuffle-sharding lookback period should be greater or equal than the configured 'query store after'") errEmptyTimeRange = errors.New("empty time range") errUnsupportedResponseCompression = errors.New("unsupported response compression. Supported compression 'gzip' and '' (disable compression)") + errInvalidConsistencyCheckAttempts = errors.New("store gateway consistency check max attempts should be greater or equal than 1") ) // RegisterFlags adds the flags required to config this to the given FlagSet. 
@@ -122,6 +126,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) { f.StringVar(&cfg.ActiveQueryTrackerDir, "querier.active-query-tracker-dir", "./active-query-tracker", "Active query tracker monitors active queries, and writes them to the file in given directory. If Cortex discovers any queries in this log during startup, it will log them to the log file. Setting to empty value disables active query tracker, which also disables -querier.max-concurrent option.") f.StringVar(&cfg.StoreGatewayAddresses, "querier.store-gateway-addresses", "", "Comma separated list of store-gateway addresses in DNS Service Discovery format. This option should be set when using the blocks storage and the store-gateway sharding is disabled (when enabled, the store-gateway instances form a ring and addresses are picked from the ring).") f.BoolVar(&cfg.StoreGatewayQueryStatsEnabled, "querier.store-gateway-query-stats-enabled", true, "If enabled, store gateway query stats will be logged using `info` log level.") + f.IntVar(&cfg.StoreGatewayConsistencyCheckMaxAttempts, "querier.store-gateway-consistency-check-max-attempts", maxFetchSeriesAttempts, "The maximum number of times we attempt fetching missing blocks from different store-gateways. If no more store-gateways are left (ie. due to lower replication factor) then we'll end the retries earlier") f.DurationVar(&cfg.LookbackDelta, "querier.lookback-delta", 5*time.Minute, "Time since the last sample after which a time series is considered stale and ignored by expression evaluations.") f.DurationVar(&cfg.ShuffleShardingIngestersLookbackPeriod, "querier.shuffle-sharding-ingesters-lookback-period", 0, "When distributor's sharding strategy is shuffle-sharding and this setting is > 0, queriers fetch in-memory series from the minimum set of required ingesters, selecting only ingesters which may have received series since 'now - lookback period'. 
The lookback period should be greater or equal than the configured 'query store after' and 'query ingesters within'. If this setting is 0, queriers always query all ingesters (ingesters shuffle sharding on read path is disabled).") f.BoolVar(&cfg.ThanosEngine, "querier.thanos-engine", false, "Experimental. Use Thanos promql engine https://github.com/thanos-io/promql-engine rather than the Prometheus promql engine.") @@ -148,6 +153,10 @@ func (cfg *Config) Validate() error { } } + if cfg.StoreGatewayConsistencyCheckMaxAttempts < 1 { + return errInvalidConsistencyCheckAttempts + } + return nil } From dc6a44a32c22c2ea2dd3f1879ec0329f9cc721f9 Mon Sep 17 00:00:00 2001 From: Ben Ye Date: Thu, 17 Oct 2024 12:31:02 -0700 Subject: [PATCH 2/3] update docs Signed-off-by: Ben Ye --- docs/blocks-storage/querier.md | 6 ++++++ docs/configuration/config-file-reference.md | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/docs/blocks-storage/querier.md b/docs/blocks-storage/querier.md index 4d9a0af9e6f..360775fe7e0 100644 --- a/docs/blocks-storage/querier.md +++ b/docs/blocks-storage/querier.md @@ -226,6 +226,12 @@ querier: # CLI flag: -querier.store-gateway-query-stats-enabled [store_gateway_query_stats: | default = true] + # The maximum number of times we attempt fetching missing blocks from + # different store-gateways. If no more store-gateways are left (ie. 
due to + # lower replication factor) then we'll end the retries earlier + # CLI flag: -querier.store-gateway-consistency-check-max-attempts + [store_gateway_consistency_check_max_attempts: | default = 3] + # When distributor's sharding strategy is shuffle-sharding and this setting is # > 0, queriers fetch in-memory series from the minimum set of required # ingesters, selecting only ingesters which may have received series since diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index 2a601053c21..a841909e5cb 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -3872,6 +3872,12 @@ store_gateway_client: # CLI flag: -querier.store-gateway-query-stats-enabled [store_gateway_query_stats: | default = true] +# The maximum number of times we attempt fetching missing blocks from different +# store-gateways. If no more store-gateways are left (ie. due to lower +# replication factor) then we'll end the retries earlier +# CLI flag: -querier.store-gateway-consistency-check-max-attempts +[store_gateway_consistency_check_max_attempts: | default = 3] + # When distributor's sharding strategy is shuffle-sharding and this setting is > # 0, queriers fetch in-memory series from the minimum set of required ingesters, # selecting only ingesters which may have received series since 'now - lookback From f1dc4fd63fb280243de2d6a2eecd78e91f45918d Mon Sep 17 00:00:00 2001 From: Ben Ye Date: Thu, 17 Oct 2024 12:34:34 -0700 Subject: [PATCH 3/3] update changelog Signed-off-by: Ben Ye --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9dceb75c3ce..82f0a9367e6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ * [ENHANCEMENT] Ingester: Add matchers to ingester LabelNames() and LabelNamesStream() RPC. 
#6209 * [ENHANCEMENT] Ingester/Store Gateway Clients: Introduce an experimental HealthCheck handler to quickly fail requests directed to unhealthy targets. #6225 #6257 * [ENHANCEMENT] Upgrade build image and Go version to 1.23.2. #6261 #6262 +* [ENHANCEMENT] Querier/Ruler: Expose `store_gateway_consistency_check_max_attempts` for max retries when querying store gateway in consistency check. #6276 * [BUGFIX] Runtime-config: Handle absolute file paths when working directory is not / #6224 ## 1.18.1 2024-10-14