Skip to content

Commit defc3c3

Browse files
authored
query rejection (#6005)
* query rejection - query rejection configurations are added. It uses QueryAttributes which is used by priority queue - added tests. priority queue - priority queue was changed to include step, agent, dashboard, panel configs. Signed-off-by: Erlan Zholdubai uulu <[email protected]> * doc-generator - add check to eliminate duplicate root blocks in case struct was used several times. query rejection generate docs Signed-off-by: Erlan Zholdubai uulu <[email protected]> * query rejection small fixes. Signed-off-by: Erlan Zholdubai uulu <[email protected]> * query rejection added changelog Signed-off-by: Erlan Zholdubai uulu <[email protected]> * query rejection - update docs Signed-off-by: Erlan Zholdubai uulu <[email protected]> * query rejection - add time range attribute Signed-off-by: Erlan Zholdubai uulu <[email protected]> * query rejection - address comments Signed-off-by: Erlan Zholdubai uulu <[email protected]> * query rejection - add integration test, fix query_step_limit Signed-off-by: Erlan Zholdubai uulu <[email protected]> * query rejection - fix imports and improve doc. Signed-off-by: Erlan Zholdubai uulu <[email protected]> * query rejection - address comments. Signed-off-by: Erlan Zholdubai uulu <[email protected]> * query rejection - remove step limit check for subqueries. Signed-off-by: Erlan Zholdubai uulu <[email protected]> * query rejection - add API type Signed-off-by: Erlan Zholdubai uulu <[email protected]> * query rejection - address comments Signed-off-by: Erlan Zholdubai uulu <[email protected]> --------- Signed-off-by: Erlan Zholdubai uulu <[email protected]>
1 parent 14dc2c6 commit defc3c3

21 files changed

+1412
-392
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
* [CHANGE] Ingester: Remove `-querier.query-store-for-labels-enabled` flag. Querying long-term store for labels is always enabled. #5984
66
* [CHANGE] Server: Instrument `cortex_request_duration_seconds` metric with native histogram. If `native-histograms` feature is enabled in monitoring Prometheus then the metric name needs to be updated in your dashboards. #6056
77
* [FEATURE] Ingester: Experimental: Enable native histogram ingestion via `-blocks-storage.tsdb.enable-native-histograms` flag. #5986
8+
* [FEATURE] Query Frontend: Added a query rejection mechanism to block resource-intensive queries. #6005
89
* [ENHANCEMENT] rulers: Add support to persist tokens in rulers. #5987
910
* [ENHANCEMENT] Query Frontend/Querier: Added store gateway postings touched count and touched size in Querier stats and log in Query Frontend. #5892
1011
* [ENHANCEMENT] Query Frontend/Querier: Returns `warnings` on prometheus query responses. #5916

docs/configuration/config-file-reference.md

Lines changed: 69 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3285,6 +3285,20 @@ query_priority:
32853285
# List of priority definitions.
32863286
[priorities: <list of PriorityDef> | default = []]
32873287

3288+
# Configuration for query rejection.
3289+
query_rejection:
3290+
# Whether query rejection is enabled.
3291+
# CLI flag: -frontend.query-rejection.enabled
3292+
[enabled: <boolean> | default = false]
3293+
3294+
# List of query_attributes to match and reject queries. A query is rejected if
3295+
# it matches any query_attribute in this list. Each query_attribute has
3296+
# several properties (e.g., regex, time_window, user_agent), and all specified
3297+
# properties must match for a query_attribute to be considered a match. Only
3298+
# the specified properties are checked, and an AND operator is applied to
3299+
# them.
3300+
[query_attributes: <list of QueryAttribute> | default = []]
3301+
32883302
# Duration to delay the evaluation of rules to ensure the underlying metrics
32893303
# have been pushed to Cortex.
32903304
# CLI flag: -ruler.evaluation-delay-duration
@@ -5345,14 +5359,24 @@ limits:
53455359
# priority level. Value between 0 and 1 will be used as a percentage.
53465360
[reserved_queriers: <float> | default = 0]
53475361
5348-
# List of query attributes to assign the priority.
5362+
# List of query_attributes to match and assign priority to queries. A query is
5363+
# assigned to this priority if it matches any query_attribute in this list. Each
5364+
# query_attribute has several properties (e.g., regex, time_window, user_agent),
5365+
# and all specified properties must match for a query_attribute to be considered
5366+
# a match. Only the specified properties are checked, and an AND operator is
5367+
# applied to them.
53495368
[query_attributes: <list of QueryAttribute> | default = []]
53505369
```
53515370

53525371
### `QueryAttribute`
53535372

53545373
```yaml
5355-
# Regex that the query string should match. If not set, it won't be checked.
5374+
# API type for the query. Should be one of: query, query_range, series,
5375+
# labels, label_values. If not set, it won't be checked.
5376+
[api_type: <string> | default = ""]
5377+
5378+
# Regex that the query string (or at least one of the matchers in metadata
5379+
# query) should match. If not set, it won't be checked.
53565380
[regex: <string> | default = ""]
53575381
53585382
# Overall data select time window (including range selectors, modifiers and
@@ -5368,6 +5392,49 @@ time_window:
53685392
# lookback delta) that the query should be within. If set to 0, it won't be
53695393
# checked.
53705394
[end: <int> | default = 0]
5395+
5396+
# Query time range should be within this limit to match. Depending on where it
5397+
# was used, in most of the use-cases, either min or max value will be used. If
5398+
# not set, it won't be checked.
5399+
time_range_limit:
5400+
# This will be duration (12h, 1d, 15d etc.). Query time range should be above
5401+
# or equal to this value to match. Ex: if this value is 20d, then queries
5402+
# whose range is bigger than or equal to 20d will match. If set to 0, it won't
5403+
# be checked.
5404+
[min: <int> | default = 0]
5405+
5406+
# This will be duration (12h, 1d, 15d etc.). Query time range should be below
5407+
# or equal to this value to match. Ex: if this value is 24h, then queries
5408+
# whose range is smaller than or equal to 24h will match. If set to 0, it won't
5409+
# be checked.
5410+
[max: <int> | default = 0]
5411+
5412+
# If provided, the query step should be within this limit to match. If not set, it
5413+
# won't be checked. This property is only applied to range queries and is ignored for
5414+
# other types of queries.
5415+
query_step_limit:
5416+
# Query step should be above or equal to this value to match. If set to 0, it
5417+
# won't be checked.
5418+
[min: <int> | default = 0]
5419+
5420+
# Query step should be below or equal to this value to match. If set to 0, it
5421+
# won't be checked.
5422+
[max: <int> | default = 0]
5423+
5424+
# Regex that User-Agent header of the request should match. If not set, it won't
5425+
# be checked.
5426+
[user_agent_regex: <string> | default = ""]
5427+
5428+
# Grafana includes X-Dashboard-Uid header in query requests. If this field is
5429+
# provided then X-Dashboard-Uid header of request should match this value. If
5430+
# not set, it won't be checked. This property won't be applied to metadata
5431+
# queries.
5432+
[dashboard_uid: <string> | default = ""]
5433+
5434+
# Grafana includes X-Panel-Id header in query requests. If this field is
5435+
# provided then X-Panel-Id header of request should match this value. If not
5436+
# set, it won't be checked. This property won't be applied to metadata queries.
5437+
[panel_id: <string> | default = ""]
53715438
```
53725439

53735440
### `DisabledRuleGroup`

docs/configuration/v1-guarantees.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,3 +115,4 @@ Currently experimental features are:
115115
- Ingestion can be enabled by setting `-blocks-storage.tsdb.enable-native-histograms=true` on Ingester.
116116
- String interning for metrics labels
117117
- Enable string interning for metrics labels by setting `-ingester.labels-string-interning-enabled` on Ingester.
118+
- Query-frontend: query rejection (`-frontend.query-rejection.enabled`)

integration/e2ecortex/client.go

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -234,7 +234,7 @@ func (c *Client) QueryRange(query string, start, end time.Time, step time.Durati
234234
}
235235

236236
// QueryRangeRaw runs a ranged query directly against the querier API.
237-
func (c *Client) QueryRangeRaw(query string, start, end time.Time, step time.Duration) (*http.Response, []byte, error) {
237+
func (c *Client) QueryRangeRaw(query string, start, end time.Time, step time.Duration, headers map[string]string) (*http.Response, []byte, error) {
238238
addr := fmt.Sprintf(
239239
"http://%s/api/prom/api/v1/query_range?query=%s&start=%s&end=%s&step=%s",
240240
c.querierAddress,
@@ -244,11 +244,11 @@ func (c *Client) QueryRangeRaw(query string, start, end time.Time, step time.Dur
244244
strconv.FormatFloat(step.Seconds(), 'f', -1, 64),
245245
)
246246

247-
return c.query(addr)
247+
return c.query(addr, headers)
248248
}
249249

250250
// QueryRaw runs a query directly against the querier API.
251-
func (c *Client) QueryRaw(query string, ts time.Time) (*http.Response, []byte, error) {
251+
func (c *Client) QueryRaw(query string, ts time.Time, headers map[string]string) (*http.Response, []byte, error) {
252252
u := &url.URL{
253253
Scheme: "http",
254254
Path: fmt.Sprintf("%s/api/prom/api/v1/query", c.querierAddress),
@@ -260,11 +260,11 @@ func (c *Client) QueryRaw(query string, ts time.Time) (*http.Response, []byte, e
260260
q.Set("time", FormatTime(ts))
261261
}
262262
u.RawQuery = q.Encode()
263-
return c.query(u.String())
263+
return c.query(u.String(), headers)
264264
}
265265

266266
// SeriesRaw runs a series request directly against the querier API.
267-
func (c *Client) SeriesRaw(matches []string, startTime, endTime time.Time) (*http.Response, []byte, error) {
267+
func (c *Client) SeriesRaw(matches []string, startTime, endTime time.Time, headers map[string]string) (*http.Response, []byte, error) {
268268
u := &url.URL{
269269
Scheme: "http",
270270
Path: fmt.Sprintf("%s/api/prom/api/v1/series", c.querierAddress),
@@ -283,11 +283,11 @@ func (c *Client) SeriesRaw(matches []string, startTime, endTime time.Time) (*htt
283283
}
284284

285285
u.RawQuery = q.Encode()
286-
return c.query(u.String())
286+
return c.query(u.String(), headers)
287287
}
288288

289289
// LabelNamesRaw runs a label names request directly against the querier API.
290-
func (c *Client) LabelNamesRaw(matches []string, startTime, endTime time.Time) (*http.Response, []byte, error) {
290+
func (c *Client) LabelNamesRaw(matches []string, startTime, endTime time.Time, headers map[string]string) (*http.Response, []byte, error) {
291291
u := &url.URL{
292292
Scheme: "http",
293293
Path: fmt.Sprintf("%s/api/prom/api/v1/labels", c.querierAddress),
@@ -306,11 +306,11 @@ func (c *Client) LabelNamesRaw(matches []string, startTime, endTime time.Time) (
306306
}
307307

308308
u.RawQuery = q.Encode()
309-
return c.query(u.String())
309+
return c.query(u.String(), headers)
310310
}
311311

312312
// LabelValuesRaw runs a label values request directly against the querier API.
313-
func (c *Client) LabelValuesRaw(label string, matches []string, startTime, endTime time.Time) (*http.Response, []byte, error) {
313+
func (c *Client) LabelValuesRaw(label string, matches []string, startTime, endTime time.Time, headers map[string]string) (*http.Response, []byte, error) {
314314
u := &url.URL{
315315
Scheme: "http",
316316
Path: fmt.Sprintf("%s/api/prom/api/v1/label/%s/values", c.querierAddress, label),
@@ -329,7 +329,7 @@ func (c *Client) LabelValuesRaw(label string, matches []string, startTime, endTi
329329
}
330330

331331
u.RawQuery = q.Encode()
332-
return c.query(u.String())
332+
return c.query(u.String(), headers)
333333
}
334334

335335
// RemoteRead runs a remote read query.
@@ -398,7 +398,7 @@ func (c *Client) RemoteRead(matchers []*labels.Matcher, start, end time.Time, st
398398
return &resp, nil
399399
}
400400

401-
func (c *Client) query(addr string) (*http.Response, []byte, error) {
401+
func (c *Client) query(addr string, headers map[string]string) (*http.Response, []byte, error) {
402402
ctx, cancel := context.WithTimeout(context.Background(), c.timeout)
403403
defer cancel()
404404

@@ -409,6 +409,10 @@ func (c *Client) query(addr string) (*http.Response, []byte, error) {
409409

410410
req.Header.Set("X-Scope-OrgID", c.orgID)
411411

412+
for key, value := range headers {
413+
req.Header.Set(key, value)
414+
}
415+
412416
retries := backoff.New(ctx, backoff.Config{
413417
MinBackoff: 1 * time.Second,
414418
MaxBackoff: 2 * time.Second,

integration/querier_test.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -901,22 +901,22 @@ func TestQuerierWithBlocksStorageLimits(t *testing.T) {
901901
require.NoError(t, err)
902902

903903
// We expect all queries hitting 422 exceeded series limit on store gateway.
904-
resp, body, err := c.QueryRangeRaw(`{job="test"}`, seriesTimestamp.Add(-time.Second), seriesTimestamp, time.Second)
904+
resp, body, err := c.QueryRangeRaw(`{job="test"}`, seriesTimestamp.Add(-time.Second), seriesTimestamp, time.Second, map[string]string{})
905905
require.NoError(t, err)
906906
require.Equal(t, http.StatusUnprocessableEntity, resp.StatusCode)
907907
require.Contains(t, string(body), "exceeded series limit")
908908

909-
resp, body, err = c.SeriesRaw([]string{`{job="test"}`}, seriesTimestamp.Add(-time.Second), seriesTimestamp)
909+
resp, body, err = c.SeriesRaw([]string{`{job="test"}`}, seriesTimestamp.Add(-time.Second), seriesTimestamp, map[string]string{})
910910
require.NoError(t, err)
911911
require.Equal(t, http.StatusUnprocessableEntity, resp.StatusCode)
912912
require.Contains(t, string(body), "exceeded series limit")
913913

914-
resp, body, err = c.LabelNamesRaw([]string{`{job="test"}`}, seriesTimestamp.Add(-time.Second), seriesTimestamp)
914+
resp, body, err = c.LabelNamesRaw([]string{`{job="test"}`}, seriesTimestamp.Add(-time.Second), seriesTimestamp, map[string]string{})
915915
require.NoError(t, err)
916916
require.Equal(t, http.StatusUnprocessableEntity, resp.StatusCode)
917917
require.Contains(t, string(body), "exceeded series limit")
918918

919-
resp, body, err = c.LabelValuesRaw("job", []string{`{job="test"}`}, seriesTimestamp.Add(-time.Second), seriesTimestamp)
919+
resp, body, err = c.LabelValuesRaw("job", []string{`{job="test"}`}, seriesTimestamp.Add(-time.Second), seriesTimestamp, map[string]string{})
920920
require.NoError(t, err)
921921
require.Equal(t, http.StatusUnprocessableEntity, resp.StatusCode)
922922
require.Contains(t, string(body), "exceeded series limit")
@@ -994,7 +994,7 @@ func TestQuerierWithStoreGatewayDataBytesLimits(t *testing.T) {
994994
require.NoError(t, err)
995995

996996
// We expect all queries hitting 422 exceeded bytes limit
997-
resp, body, err := c.QueryRaw(`{job="test"}`, series2Timestamp)
997+
resp, body, err := c.QueryRaw(`{job="test"}`, series2Timestamp, map[string]string{})
998998
require.NoError(t, err)
999999
require.Equal(t, http.StatusUnprocessableEntity, resp.StatusCode)
10001000
require.Contains(t, string(body), "exceeded bytes limit")
@@ -1245,7 +1245,7 @@ func TestQuerierMaxSamplesLimit(t *testing.T) {
12451245
var body []byte
12461246
for retries.Ongoing() {
12471247
// We expect request to hit max samples limit.
1248-
res, body, err = c.QueryRaw(`sum({job="test"})`, series1Timestamp)
1248+
res, body, err = c.QueryRaw(`sum({job="test"})`, series1Timestamp, map[string]string{})
12491249
if err == nil {
12501250
break
12511251
}

0 commit comments

Comments
 (0)