Skip to content

Commit ebc730a

Browse files
stats: separate quantile and average fields
Add separate quantile (`latency_quantile_recent`) and average ( `latency_average`) fields to `crud.stats()` output. `latency` field and tarantool/metrics output remains unchanged. Before this patch, `latency` displayed `latency_quantile_recent` or `latency_average` and there wasn't any was to see pre-computed average if quantiles are enabled. But it may be useful if quantile is `nan`. Quantiles may display `-nan` if there were no observations for a several ages. Such behavior is expected [1] and valid: for example, Grafana should ignore such values and they will be displayed as `No data` for a window when there wasn't any requests. 1. tarantool/metrics#303 Closes #286
1 parent 4c9d95e commit ebc730a

File tree

7 files changed

+113
-34
lines changed

7 files changed

+113
-34
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
99

1010
### Added
1111
* Make metrics quantile collector age params configurable (#286).
12+
* Add separate `latency_average` and `latency_quantile_recent`
13+
fields to `crud.stats()` output (#286).
1214

1315
### Changed
1416

README.md

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -721,23 +721,30 @@ crud.stats()
721721
my_space:
722722
insert:
723723
ok:
724-
latency: 0.002
724+
latency: 0.0015
725+
latency_average: 0.002
726+
latency_quantile_recent: 0.0015
725727
count: 19800
726728
time: 39.6
727729
error:
728-
latency: 0.000001
730+
latency: 0.0000008
731+
latency_average: 0.000001
732+
latency_quantile_recent: 0.0000008
729733
count: 4
730734
time: 0.000004
731735
...
732736
crud.stats('my_space')
733737
---
734738
- insert:
735739
ok:
736-
latency: 0.002
740+
latency: 0.0015
741+
latency_average: 0.002
742+
latency_quantile_recent: 0.0015
737743
count: 19800
738744
time: 39.6
739745
error:
740-
latency: 0.000001
746+
average: 0.000001
747+
latency: 0.0000008
741748
count: 4
742749
time: 0.000004
743750
...
@@ -759,10 +766,17 @@ and `borders` (for `min` and `max` calls).
759766
Each operation section consists of different collectors
760767
for success calls and error (both error throw and `nil, err`)
761768
returns. `count` is the total requests count since instance start
762-
or stats restart. `latency` is the 0.99 quantile of request execution
763-
time if `metrics` driver used and quantiles enabled,
764-
otherwise `latency` is the total average.
765-
`time` is the total time of requests execution.
769+
or stats restart. `time` is the total time of requests execution.
770+
`latency_average` is `time` / `count`.
771+
`latency_quantile_recent` is the 0.99 quantile of request execution
772+
time for a recent period (see
773+
[`metrics` summary API](https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary)).
774+
It is computed only if `metrics` driver used and quantiles enabled.
775+
`latency_quantile_recent` value may be `-nan`, if there wasn't any
776+
observations for several ages, see
777+
[tarantool/metrics#303](https://github.com/tarantool/metrics/issues/303).
778+
`latency` is a `latency_quantile_recent`, if it presents,
779+
otherwise it's `latency_average`.
766780

767781
In [`metrics`](https://www.tarantool.io/en/doc/latest/book/monitoring/)
768782
registry statistics are stored as `tnt_crud_stats` metrics

crud/stats/local_registry.lua

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,8 @@ function registry.observe(latency, space_name, op, status)
121121

122122
collectors.count = collectors.count + 1
123123
collectors.time = collectors.time + latency
124-
collectors.latency = collectors.time / collectors.count
124+
collectors.latency_average = collectors.time / collectors.count
125+
collectors.latency = collectors.latency_average
125126

126127
return true
127128
end

crud/stats/metrics_registry.lua

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -162,13 +162,9 @@ function registry.destroy()
162162
return true
163163
end
164164

165-
--- Compute `latency` field of an observation.
165+
--- Compute overall average time of an observation.
166166
--
167-
-- If it is a `{ time = ..., count = ... }` observation,
168-
-- compute latency as overall average and store it
169-
-- inside observation object.
170-
--
171-
-- @function compute_obs_latency
167+
-- @function compute_obs_average
172168
-- @local
173169
--
174170
-- @tab obs
@@ -177,35 +173,41 @@ end
177173
-- If something like `details` collector
178174
-- passed, do nothing.
179175
--
180-
local function compute_obs_latency(obs)
176+
-- @treturn Average value, if possible. 0 otherwise.
177+
--
178+
local function compute_obs_average(obs)
181179
if obs.count == nil or obs.time == nil then
182-
return
180+
return 0
183181
end
184182

185183
if obs.count == 0 then
186-
obs.latency = 0
184+
return 0
187185
else
188-
obs.latency = obs.time / obs.count
186+
return obs.time / obs.count
189187
end
190188
end
191189

192-
--- Compute `latency` field of each observation.
190+
--- Compute `latency_average` and set `latency` fields of each observation.
193191
--
194-
-- If quantiles disabled, we need to compute
195-
-- latency as overall average from `time` and
196-
-- `count` values.
192+
-- `latency` is `latency_average` if quantiles disabled
193+
-- and `latency_quantile` otherwise.
197194
--
198-
-- @function compute_latencies
195+
-- @function compute_aggregates
199196
-- @local
200197
--
201198
-- @tab stats
202199
-- Object from registry_utils stats.
203200
--
204-
local function compute_latencies(stats)
201+
local function compute_aggregates(stats)
205202
for _, space_stats in pairs(stats.spaces) do
206203
for _, op_stats in pairs(space_stats) do
207204
for _, obs in pairs(op_stats) do
208-
compute_obs_latency(obs)
205+
obs.latency_average = compute_obs_average(obs)
206+
if obs.latency_quantile_recent ~= nil then
207+
obs.latency = obs.latency_quantile_recent
208+
else
209+
obs.latency = obs.latency_average
210+
end
209211
end
210212
end
211213
end
@@ -250,7 +252,7 @@ function registry.get(space_name)
250252
-- metric_name.stats presents only if quantiles enabled.
251253
if obs.metric_name == metric_name.stats then
252254
if obs.label_pairs.quantile == LATENCY_QUANTILE then
253-
space_stats[op][status].latency = obs.value
255+
space_stats[op][status].latency_quantile_recent = obs.value
254256
end
255257
elseif obs.metric_name == metric_name.stats_sum then
256258
space_stats[op][status].time = obs.value
@@ -261,9 +263,7 @@ function registry.get(space_name)
261263
:: stats_continue ::
262264
end
263265

264-
if not internal.opts.quantiles then
265-
compute_latencies(stats)
266-
end
266+
compute_aggregates(stats)
267267

268268
-- Fill select/pairs detail statistics values.
269269
for stat_name, metric_name in pairs(metric_name.details) do

crud/stats/registry_utils.lua

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@ local registry_utils = {}
1616
-- Use `require('crud.stats').op` to pick one.
1717
--
1818
-- @treturn table Returns collectors for success and error requests.
19-
-- Collectors store 'count', 'latency' and 'time' values. Also
19+
-- Collectors store 'count', 'latency', 'latency_average',
20+
-- 'latency_quantile_recent' and 'time' values. Also
2021
-- returns additional collectors for select operation.
2122
--
2223
function registry_utils.build_collectors(op)
@@ -26,11 +27,17 @@ function registry_utils.build_collectors(op)
2627
ok = {
2728
count = 0,
2829
latency = 0,
30+
latency_average = 0,
31+
--set up only if driver is 'metrics' and quantiles enabled.
32+
latency_quantile_recent = nil,
2933
time = 0,
3034
},
3135
error = {
3236
count = 0,
3337
latency = 0,
38+
latency_average = 0,
39+
--set up only if driver is 'metrics' and quantiles enabled.
40+
latency_quantile_recent = nil,
3441
time = 0,
3542
},
3643
}

test/integration/stats_test.lua

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,10 @@ local function enable_stats(g, params)
5555
require('crud').cfg{
5656
stats = true,
5757
stats_driver = params.driver,
58-
stats_quantiles = params.quantiles
58+
stats_quantiles = params.quantiles,
59+
stats_quantile_tolerated_error = 1e-3,
60+
stats_quantile_age_buckets_count = 3,
61+
stats_quantile_max_age_time = 60,
5962
}
6063
]], { params })
6164
end
@@ -526,6 +529,24 @@ for name, case in pairs(simple_operation_cases) do
526529
t.assert_le(changed_after.latency, ok_latency_max,
527530
'Changed latency has appropriate value')
528531

532+
local ok_average_max = math.max(changed_before.latency_average, after_finish - before_start)
533+
534+
t.assert_gt(changed_after.latency_average, 0,
535+
'Changed average has appropriate value')
536+
t.assert_le(changed_after.latency_average, ok_average_max,
537+
'Changed average has appropriate value')
538+
539+
if g.params.quantiles == true then
540+
local ok_quantile_max = math.max(
541+
changed_before.latency_quantile_recent or 0,
542+
after_finish - before_start)
543+
544+
t.assert_gt(changed_after.latency_quantile_recent, 0,
545+
'Changed quantile has appropriate value')
546+
t.assert_le(changed_after.latency_quantile_recent, ok_quantile_max,
547+
'Changed quantile has appropriate value')
548+
end
549+
529550
local time_diff = changed_after.time - changed_before.time
530551

531552
t.assert_gt(time_diff, 0, 'Total time increase has appropriate value')

test/unit/stats_test.lua

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -170,9 +170,25 @@ for name, case in pairs(observe_cases) do
170170
t.assert_equals(changed.count, run_count, 'Count incremented by count of runs')
171171

172172
local sleep_time = helpers.simple_functions_params().sleep_time
173+
173174
t.assert_ge(changed.latency, sleep_time, 'Latency has appropriate value')
174175
t.assert_le(changed.latency, time_diffs[#time_diffs], 'Latency has appropriate value')
175176

177+
t.assert_ge(changed.latency_average, sleep_time, 'Average has appropriate value')
178+
t.assert_le(changed.latency_average, time_diffs[#time_diffs], 'Average has appropriate value')
179+
180+
if g.params.quantiles == true then
181+
t.assert_ge(
182+
changed.latency_quantile_recent,
183+
sleep_time,
184+
'Quantile has appropriate value')
185+
186+
t.assert_le(
187+
changed.latency_quantile_recent,
188+
time_diffs[#time_diffs],
189+
'Quantile has appropriate value')
190+
end
191+
176192
t.assert_ge(changed.time, sleep_time * run_count,
177193
'Total time increase has appropriate value')
178194
t.assert_le(changed.time, total_time, 'Total time increase has appropriate value')
@@ -187,6 +203,7 @@ for name, case in pairs(observe_cases) do
187203
{
188204
count = 0,
189205
latency = 0,
206+
latency_average = 0,
190207
time = 0
191208
},
192209
'Other status collectors initialized after observations'
@@ -387,15 +404,31 @@ for name, case in pairs(pairs_cases) do
387404

388405
t.assert_equals(changed.count, 1, 'Count incremented by 1')
389406

390-
t.assert_ge(changed.latency,
407+
t.assert_ge(
408+
changed.latency,
391409
params.sleep_time * (case.build_sleep_multiplier + case.iterations_expected),
392410
'Latency has appropriate value')
393411
t.assert_le(changed.latency, time_diff, 'Latency has appropriate value')
394412

413+
t.assert_ge(
414+
changed.latency_average,
415+
params.sleep_time * (case.build_sleep_multiplier + case.iterations_expected),
416+
'Average has appropriate value')
417+
t.assert_le(changed.latency_average, time_diff, 'Average has appropriate value')
418+
419+
if g.params.quantiles == true then
420+
t.assert_ge(
421+
changed.latency_quantile_recent,
422+
params.sleep_time * (case.build_sleep_multiplier + case.iterations_expected),
423+
'Quantile has appropriate value')
424+
425+
t.assert_le(changed.latency_quantile_recent, time_diff, 'Quantile has appropriate value')
426+
end
427+
395428
t.assert_ge(changed.time,
396429
params.sleep_time * (case.build_sleep_multiplier + case.iterations_expected),
397430
'Total time has appropriate value')
398-
t.assert_le(changed.time, time_diff, 'Total time has appropriate value')
431+
t.assert_le(changed.time, time_diff, 'Total time has appropriate value')
399432

400433
-- Other collectors (unchanged_coll: 'error' or 'ok')
401434
-- have been initialized and have default values.
@@ -407,6 +440,7 @@ for name, case in pairs(pairs_cases) do
407440
{
408441
count = 0,
409442
latency = 0,
443+
latency_average = 0,
410444
time = 0
411445
},
412446
'Other status collectors initialized after observations'

0 commit comments

Comments
 (0)