Skip to content

Commit 6ef442d

Browse files
stats: make quantile tolerated error configurable
Make metrics quantile collector tolerated error [1] configurable. Change metrics quantile collector default tolerated error from 1e-2 to 1e-3. The motivation for this patch is a tarantool/metrics bug [2]: sometimes quantile values turn into `-Inf` under high load when observations are small. It was reproduced in the process of developing Grafana dashboard panels for CRUD stats [3]. The quantile tolerated error can be changed with crud.cfg: crud.cfg{stats_quantile_tolerated_error = 1e-4} 1. https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary 2. tarantool/metrics#189 3. https://github.com/tarantool/grafana-dashboard/tree/DifferentialOrange/crud-report
1 parent 6a4d4e9 commit 6ef442d

File tree

8 files changed

+121
-18
lines changed

8 files changed

+121
-18
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
88
## [Unreleased]
99

1010
### Added
11+
* Make metrics quantile collector tolerated error configurable.
1112

1213
### Changed
14+
* Change metrics quantile collector default tolerated error
15+
from 1e-2 to 1e-3.
1316

1417
### Fixed
1518
* Requests no longer fail with "Sharding hash mismatch" error

README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -793,6 +793,13 @@ metrics:collect()
793793
metric_name: tnt_crud_stats
794794
...
795795
```
796+
If you see a `-Inf` value in quantile metrics, try decreasing the tolerated error:
797+
```lua
798+
crud.cfg{stats_quantile_tolerated_error = 1e-4}
799+
```
800+
See [tarantool/metrics#189](https://github.com/tarantool/metrics/issues/189) for
801+
details about the issue.
802+
796803

797804
`select` section additionally contains `details` collectors.
798805
```lua

crud/cfg.lua

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@ local function set_defaults_if_empty(cfg)
2525
cfg.stats_quantiles = false
2626
end
2727

28+
if cfg.stats_quantile_tolerated_error == nil then
29+
cfg.stats_quantile_tolerated_error = stats.DEFAULT_QUANTILE_TOLERATED_ERROR
30+
end
31+
2832
return cfg
2933
end
3034

@@ -33,7 +37,8 @@ local cfg = set_defaults_if_empty(stash.get(stash.name.cfg))
3337
local function configure_stats(cfg, opts)
3438
if (opts.stats == nil)
3539
and (opts.stats_driver == nil)
36-
and (opts.stats_quantiles == nil) then
40+
and (opts.stats_quantiles == nil)
41+
and (opts.stats_quantile_tolerated_error == nil) then
3742
return
3843
end
3944

@@ -49,15 +54,24 @@ local function configure_stats(cfg, opts)
4954
opts.stats_quantiles = cfg.stats_quantiles
5055
end
5156

57+
if opts.stats_quantiles == nil then
58+
opts.stats_quantile_tolerated_error = cfg.stats_quantile_tolerated_error
59+
end
60+
5261
if opts.stats == true then
53-
stats.enable{ driver = opts.stats_driver, quantiles = opts.stats_quantiles }
62+
stats.enable{
63+
driver = opts.stats_driver,
64+
quantiles = opts.stats_quantiles,
65+
quantile_tolerated_error = opts.stats_quantile_tolerated_error,
66+
}
5467
else
5568
stats.disable()
5669
end
5770

5871
rawset(cfg, 'stats', opts.stats)
5972
rawset(cfg, 'stats_driver', opts.stats_driver)
6073
rawset(cfg, 'stats_quantiles', opts.stats_quantiles)
74+
rawset(cfg, 'stats_quantile_tolerated_error', opts.stats_quantile_tolerated_error)
6175
end
6276

6377
--- Configure CRUD module.
@@ -86,13 +100,21 @@ end
86100
-- Enable or disable statistics quantiles (only for metrics driver).
87101
-- Quantile computation increases performance overhead by up to 10%.
88102
--
103+
-- @number[opt=1e-3] opts.stats_quantile_tolerated_error
104+
-- See tarantool/metrics summary API for details:
105+
-- https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary
106+
-- If quantile value is -Inf, try to decrease quantile tolerance.
107+
-- See https://github.com/tarantool/metrics/issues/189 for issue details.
108+
-- Decreasing the value increases computational load.
109+
--
89110
-- @return Configuration table.
90111
--
91112
local function __call(self, opts)
92113
checks('table', {
93114
stats = '?boolean',
94115
stats_driver = '?string',
95-
stats_quantiles = '?boolean'
116+
stats_quantiles = '?boolean',
117+
stats_quantile_tolerated_error = '?number',
96118
})
97119

98120
opts = table.deepcopy(opts) or {}

crud/stats/init.lua

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -84,10 +84,20 @@ end
8484
-- computing requests latency as 0.99 quantile with aging.
8585
-- Performance overhead for enabling is near 10%.
8686
--
87+
-- @number[opt=1e-3] opts.quantile_tolerated_error
88+
-- See tarantool/metrics summary API for details:
89+
-- https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary
90+
-- If quantile value is -Inf, try to decrease quantile tolerance.
91+
-- See https://github.com/tarantool/metrics/issues/189 for issue details.
92+
--
8793
-- @treturn boolean Returns `true`.
8894
--
8995
function stats.enable(opts)
90-
checks({ driver = '?string', quantiles = '?boolean' })
96+
checks({
97+
driver = '?string',
98+
quantiles = '?boolean',
99+
quantile_tolerated_error = '?number',
100+
})
91101

92102
StatsError:assert(
93103
rawget(_G, 'crud') ~= nil,
@@ -108,19 +118,29 @@ function stats.enable(opts)
108118
opts.quantiles = false
109119
end
110120

121+
if opts.quantile_tolerated_error == nil then
122+
opts.quantile_tolerated_error = stats.DEFAULT_QUANTILE_TOLERATED_ERROR
123+
end
124+
111125
-- Do not reinit if called with same options.
112126
if internal.driver == opts.driver
113-
and internal.quantiles == opts.quantiles then
127+
and internal.quantiles == opts.quantiles
128+
and internal.quantile_tolerated_error == opts.quantile_tolerated_error then
114129
return true
115130
end
116131

117132
-- Disable old driver registry, if another one was requested.
118133
stats.disable()
119134

120135
internal.driver = opts.driver
121-
internal.quantiles = opts.quantiles
122136

123-
internal:get_registry().init({ quantiles = internal.quantiles })
137+
internal:get_registry().init{
138+
quantiles = opts.quantiles,
139+
quantile_tolerated_error = opts.quantile_tolerated_error
140+
}
141+
142+
internal.quantiles = opts.quantiles
143+
internal.quantile_tolerated_error = opts.quantile_tolerated_error
124144

125145
return true
126146
end
@@ -140,7 +160,10 @@ function stats.reset()
140160
end
141161

142162
internal:get_registry().destroy()
143-
internal:get_registry().init({ quantiles = internal.quantiles })
163+
internal:get_registry().init{
164+
quantiles = internal.quantiles,
165+
quantile_tolerated_error = internal.quantile_tolerated_error
166+
}
144167

145168
return true
146169
end
@@ -469,4 +492,7 @@ stats.op = op_module
469492
-- @tfield[opt] boolean quantiles Whether quantiles are computed.
470493
stats.internal = internal
471494

495+
--- Default metrics quantile precision.
496+
stats.DEFAULT_QUANTILE_TOLERATED_ERROR = 1e-3
497+
472498
return stats

crud/stats/local_registry.lua

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,16 @@ local StatsLocalError = errors.new_class('StatsLocalError', {capture_stack = fal
2525
-- @bool opts.quantiles
2626
-- Quantiles are not supported for the local registry; only `false` is valid.
2727
--
28+
-- @number opts.quantile_tolerated_error
29+
-- Quantiles are not supported for the local registry, so the value is ignored.
30+
--
2831
-- @treturn boolean Returns `true`.
2932
--
3033
function registry.init(opts)
31-
dev_checks({ quantiles = 'boolean' })
34+
dev_checks({
35+
quantiles = 'boolean',
36+
quantile_tolerated_error = 'number',
37+
})
3238

3339
StatsLocalError:assert(opts.quantiles == false,
3440
"Quantiles are not supported for 'local' statistics registry")

crud/stats/metrics_registry.lua

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,6 @@ local metric_name = {
3131

3232
local LATENCY_QUANTILE = 0.99
3333

34-
-- Increasing tolerance threshold affects performance.
35-
local DEFAULT_QUANTILES = {
36-
[LATENCY_QUANTILE] = 1e-2,
37-
}
38-
3934
local DEFAULT_AGE_PARAMS = {
4035
age_buckets_count = 2,
4136
max_age_time = 60,
@@ -86,17 +81,24 @@ end
8681
-- @bool opts.quantiles
8782
-- If `true`, computes latency as 0.99 quantile with aging.
8883
--
84+
-- @number[opt=1e-3] opts.quantile_tolerated_error
85+
-- See metrics summary API for details:
86+
-- https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary
87+
-- If quantile value is -Inf, try to decrease quantile tolerance.
88+
-- See https://github.com/tarantool/metrics/issues/189 for issue details.
89+
--
8990
-- @treturn boolean Returns `true`.
9091
--
9192
function registry.init(opts)
92-
dev_checks({ quantiles = 'boolean' })
93-
94-
internal.opts = table.deepcopy(opts)
93+
dev_checks({
94+
quantiles = 'boolean',
95+
quantile_tolerated_error = 'number',
96+
})
9597

9698
local quantile_params = nil
9799
local age_params = nil
98100
if opts.quantiles == true then
99-
quantile_params = DEFAULT_QUANTILES
101+
quantile_params = {[LATENCY_QUANTILE] = opts.quantile_tolerated_error}
100102
age_params = DEFAULT_AGE_PARAMS
101103
end
102104

@@ -119,6 +121,8 @@ function registry.init(opts)
119121
metric_name.details.map_reduces,
120122
'Map reduces planned during CRUD select/pairs')
121123

124+
internal.opts = table.deepcopy(opts)
125+
122126
return true
123127
end
124128

test/integration/cfg_test.lua

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ group.test_defaults = function(g)
2626
stats = false,
2727
stats_driver = stats.get_default_driver(),
2828
stats_quantiles = false,
29+
stats_quantile_tolerated_error = 1e-3,
2930
})
3031
end
3132

test/unit/stats_test.lua

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -658,6 +658,40 @@ group_driver.test_default_quantiles = function(g)
658658
end
659659

660660

661+
group_driver.test_default_quantile_tolerated_error = function(g)
662+
enable_stats(g)
663+
664+
local quantile_tolerated_error = g.router:eval(" return stats_module.internal.quantile_tolerated_error ")
665+
t.assert_equals(quantile_tolerated_error, 1e-3)
666+
end
667+
668+
669+
group_driver.before_test(
670+
'test_custom_quantile_tolerated_error',
671+
function(g)
672+
t.skip_if(g.is_metrics_supported == false, 'Metrics registry is unsupported')
673+
end
674+
)
675+
676+
group_driver.test_custom_quantile_tolerated_error = function(g)
677+
g.router:call('crud.cfg', {{
678+
stats = true,
679+
stats_driver = 'metrics',
680+
stats_quantiles = true,
681+
stats_quantile_tolerated_error = 5e-4,
682+
}})
683+
684+
local resp = g.router:eval([[
685+
local metrics = require('metrics')
686+
687+
local summary = metrics.summary('tnt_crud_stats')
688+
return summary.objectives
689+
]])
690+
691+
t.assert_equals(resp, {[0.99] = 5e-4})
692+
end
693+
694+
661695
group_driver.before_test(
662696
'test_stats_reenable_with_different_driver_reset_stats',
663697
function(g)

0 commit comments

Comments
 (0)