Skip to content

Commit 9a730ca

Browse files
committed
Provide an API to get storages initialization state
There is an issue with using CRUD functionality if not all storages are up. New function is added to get the information about storages state: initialized or not. So, a user can poll state and wait for storages to be initialized before making CRUD calls. Resolves #229
1 parent 3b4609b commit 9a730ca

File tree

7 files changed

+323
-6
lines changed

7 files changed

+323
-6
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
88
## [Unreleased]
99

1010
### Added
11+
* `crud.storage_info` function to get storages status (#229, PR #299).
1112

1213
### Changed
1314

README.md

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ It also provides the `crud-storage` and `crud-router` roles for
3232
- [Cut extra objects](#cut-extra-objects)
3333
- [Truncate](#truncate)
3434
- [Len](#len)
35+
- [Storage info](#storage-info)
3536
- [Count](#count)
3637
- [Call options for crud methods](#call-options-for-crud-methods)
3738
- [Statistics](#statistics)
@@ -1074,6 +1075,51 @@ crud.len('customers')
10741075
...
10751076
```
10761077

1078+
### Storage info
1079+
1080+
```lua
1081+
-- Get storages status
1082+
local result, err = crud.storage_info(opts)
1083+
```
1084+
1085+
where:
1086+
1087+
* `opts`:
1088+
* `timeout` (`?number`) - maximum time (in seconds, default: 2) to wait for response from
1089+
cluster instances.
1090+
1091+
Returns storages status table by instance UUID or nil with error. Status table fields:
1092+
* `status` contains a string representing the status:
1093+
* `"running"` - storage is initialized and running.
1094+
* `"uninitialized"` - storage is not initialized or disabled.
1095+
* `"error"` - error getting the status from a storage. Connection error, for example.
1096+
* `is_master` is `true` if an instance is a master, `false` - otherwise.
1097+
* `message` is `nil` unless a problem occurs with getting storage status.
1098+
1099+
1100+
**Example:**
1101+
1102+
```lua
1103+
crud.storage_info()
1104+
```
1105+
```
1106+
---
1107+
- fe1b5bd9-42d4-4955-816c-3aa015e0eb81:
1108+
status: running
1109+
is_master: true
1110+
a1eefe51-9869-4c4c-9676-76431b08c97a:
1111+
status: running
1112+
is_master: true
1113+
777415f4-d656-440e-8834-7124b7267b6d:
1114+
status: uninitialized
1115+
is_master: false
1116+
e1b2e202-b0f7-49cd-b0a2-6b3a584f995e:
1117+
status: error
1118+
message: 'connect, called on fd 36, aka 127.0.0.1:49762: Connection refused'
1119+
is_master: false
1120+
...
1121+
```
1122+
10771123
### Count
10781124

10791125
`CRUD` supports multi-conditional count, treating a cluster as a single space.
@@ -1240,7 +1286,7 @@ returns. `count` is the total requests count since instance start
12401286
or stats restart. `time` is the total time of requests execution.
12411287
`latency_average` is `time` / `count`.
12421288
`latency_quantile_recent` is the 0.99 quantile of request execution
1243-
time for a recent period (see
1289+
time for a recent period (see
12441290
[`metrics` summary API](https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary)).
12451291
It is computed only if `metrics` driver is used and quantiles are
12461292
enabled. `latency_quantile_recent` value may be `-nan` if there

crud.lua

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,10 @@ crud.stats = stats.get
138138
-- @function reset_stats
139139
crud.reset_stats = stats.reset
140140

141+
-- @refer utils.storage_info
142+
-- @function storage_info
143+
crud.storage_info = utils.storage_info
144+
141145
--- Initializes crud on node
142146
--
143147
-- Exports all functions that are used for calls
@@ -165,6 +169,8 @@ function crud.init_storage()
165169
count.init()
166170
borders.init()
167171
sharding_metadata.init()
172+
173+
_G._crud.storage_info_on_storage = utils.storage_info_on_storage
168174
end
169175

170176
function crud.init_router()

crud/common/call.lua

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ local dev_checks = require('crud.common.dev_checks')
55
local utils = require('crud.common.utils')
66
local sharding_utils = require('crud.common.sharding.utils')
77
local fiber_clock = require('fiber').clock
8+
local const = require('crud.common.const')
89

910
local BaseIterator = require('crud.common.map_call_cases.base_iter')
1011
local BasePostprocessor = require('crud.common.map_call_cases.base_postprocessor')
@@ -13,8 +14,6 @@ local CallError = errors.new_class('CallError')
1314

1415
local call = {}
1516

16-
call.DEFAULT_VSHARD_CALL_TIMEOUT = 2
17-
1817
function call.get_vshard_call_name(mode, prefer_replica, balance)
1918
dev_checks('string', '?boolean', '?boolean')
2019

@@ -84,7 +83,7 @@ function call.map(func_name, func_args, opts)
8483
return nil, err
8584
end
8685

87-
local timeout = opts.timeout or call.DEFAULT_VSHARD_CALL_TIMEOUT
86+
local timeout = opts.timeout or const.DEFAULT_VSHARD_CALL_TIMEOUT
8887

8988
local iter = opts.iter
9089
if iter == nil then
@@ -149,7 +148,7 @@ function call.single(bucket_id, func_name, func_args, opts)
149148
return nil, err
150149
end
151150

152-
local timeout = opts.timeout or call.DEFAULT_VSHARD_CALL_TIMEOUT
151+
local timeout = opts.timeout or const.DEFAULT_VSHARD_CALL_TIMEOUT
153152

154153
local res, err = vshard.router[vshard_call_name](bucket_id, func_name, func_args, {
155154
timeout = timeout,
@@ -171,7 +170,7 @@ function call.any(func_name, func_args, opts)
171170
timeout = '?number',
172171
})
173172

174-
local timeout = opts.timeout or call.DEFAULT_VSHARD_CALL_TIMEOUT
173+
local timeout = opts.timeout or const.DEFAULT_VSHARD_CALL_TIMEOUT
175174

176175
local replicasets, err = vshard.router.routeall()
177176
if replicasets == nil then

crud/common/const.lua

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,6 @@ const.SHARDING_RELOAD_RETRIES_NUM = 1
88
const.NEED_SCHEMA_RELOAD = 0x0001000
99
const.NEED_SHARDING_RELOAD = 0x0001001
1010

11+
const.DEFAULT_VSHARD_CALL_TIMEOUT = 2
12+
1113
return const

crud/common/utils.lua

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ local ffi = require('ffi')
33
local vshard = require('vshard')
44
local fun = require('fun')
55
local bit = require('bit')
6+
local log = require('log')
67

78
local const = require('crud.common.const')
89
local schema = require('crud.common.schema')
@@ -15,6 +16,8 @@ local ShardingError = errors.new_class('ShardingError', {capture_stack = false})
1516
local GetSpaceFormatError = errors.new_class('GetSpaceFormatError', {capture_stack = false})
1617
local FilterFieldsError = errors.new_class('FilterFieldsError', {capture_stack = false})
1718
local NotInitializedError = errors.new_class('NotInitialized')
19+
local StorageInfoError = errors.new_class('StorageInfoError')
20+
local fiber_clock = require('fiber').clock
1821

1922
local utils = {}
2023

@@ -748,4 +751,87 @@ function utils.list_slice(list, start_index, end_index)
748751
return slice
749752
end
750753

754+
--- Polls replicas for storage state
755+
--
756+
-- @function storage_info
757+
--
758+
-- @tparam ?number opts.timeout
759+
-- Function call timeout
760+
--
761+
-- @return a table of storage states by replica uuid.
762+
function utils.storage_info(opts)
763+
local replicasets, err = vshard.router.routeall()
764+
if replicasets == nil then
765+
return nil, StorageInfoError:new("Failed to get all replicasets: %s", err.err)
766+
end
767+
768+
opts = opts or {}
769+
770+
local futures_by_replicas = {}
771+
local replica_state_by_uuid = {}
772+
local async_opts = {is_async = true}
773+
local timeout = opts.timeout or const.DEFAULT_VSHARD_CALL_TIMEOUT
774+
775+
for _, replicaset in pairs(replicasets) do
776+
for replica_uuid, replica in pairs(replicaset.replicas) do
777+
replica_state_by_uuid[replica_uuid] = {
778+
status = "error",
779+
is_master = replicaset.master == replica
780+
}
781+
local ok, res = pcall(replica.conn.call, replica.conn, "_crud.storage_info_on_storage",
782+
{}, async_opts)
783+
if ok then
784+
futures_by_replicas[replica_uuid] = res
785+
else
786+
local err_msg = string.format("Error getting storage info for %s", replica_uuid)
787+
if res ~= nil then
788+
log.error("%s: %s", err_msg, res)
789+
replica_state_by_uuid[replica_uuid].message = tostring(res)
790+
else
791+
log.error(err_msg)
792+
replica_state_by_uuid[replica_uuid].message = err_msg
793+
end
794+
end
795+
end
796+
end
797+
798+
local deadline = fiber_clock() + timeout
799+
for replica_uuid, future in pairs(futures_by_replicas) do
800+
local wait_timeout = deadline - fiber_clock()
801+
if wait_timeout < 0 then
802+
wait_timeout = 0
803+
end
804+
805+
local result, err = future:wait_result(wait_timeout)
806+
if result == nil then
807+
future:discard()
808+
local err_msg = string.format("Error getting storage info for %s", replica_uuid)
809+
if err ~= nil then
810+
if err.type == 'ClientError' and err.code == box.error.NO_SUCH_PROC then
811+
replica_state_by_uuid[replica_uuid].status = "uninitialized"
812+
else
813+
log.error("%s: %s", err_msg, err)
814+
replica_state_by_uuid[replica_uuid].message = tostring(err)
815+
end
816+
else
817+
log.error(err_msg)
818+
replica_state_by_uuid[replica_uuid].message = err_msg
819+
end
820+
else
821+
replica_state_by_uuid[replica_uuid].status = result[1].status or "uninitialized"
822+
end
823+
end
824+
825+
return replica_state_by_uuid
826+
end
827+
828+
--- Storage status information.
829+
--
830+
-- @function storage_info_on_storage
831+
--
832+
-- @return a table with storage status.
833+
function utils.storage_info_on_storage()
834+
return {status = "running"}
835+
end
836+
751837
return utils

0 commit comments

Comments
 (0)