Skip to content

Commit 8a440ed

Browse files
committed
api: fix INIT state stuck
Sometimes an instance could enter queue initialization while it was still not running (for example, while left in orphan mode). This resulted in a "lazy start". However, Tarantool does not call `box.cfg {}` after leaving orphan mode, so the queue could get stuck in the `INIT` state. Now we wait in the background for instances that are not running yet; this is similar to the lazy init for read-only instances. Note that this fix works only for Tarantool versions >= 2.10.0, because it relies on watchers. Closes #226
1 parent 5f2b145 commit 8a440ed

File tree

3 files changed

+138
-4
lines changed

3 files changed

+138
-4
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
66
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
77

8+
## [Unreleased]
9+
10+
### Fixed
11+
12+
- Stuck in `INIT` state if an instance failed to enter the `running` mode
13+
in time (#226). This fix works only for Tarantool versions >= 2.10.0.
14+
815
## [1.3.3] - 2023-09-13
916

1017
### Fixed

queue/init.lua

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
1+
local fiber = require('fiber')
2+
13
local abstract = require('queue.abstract')
24
local queue_state = require('queue.abstract.queue_state')
5+
local qc = require('queue.compat')
36
local queue = nil
47

58
-- load all core drivers
@@ -11,6 +14,10 @@ local core_drivers = {
1114
limfifottl = require('queue.abstract.driver.limfifottl')
1215
}
1316

17+
-- Watchers (box.watch) are supported since Tarantool 2.10.0:
18+
-- https://github.com/locker/tarantool/commit/8cf5151cb4f05cee3fd0ea831add2b3187a01fe4
19+
local watchers_supported = qc.check_version({2, 10, 0})
20+
1421
local function register_driver(driver_name, tube_ctr)
1522
if type(tube_ctr.create_space) ~= 'function' or
1623
type(tube_ctr.new) ~= 'function' then
@@ -62,6 +69,19 @@ local orig_call = nil
6269

6370
local wrapper_impl
6471

72+
-- Fiber body: block until the instance reports the 'running' status and
-- then initialize the queue via wrapper_impl().
--
-- It is started from wrap_box_cfg() when box is already configured but the
-- instance is not running yet (for example, it is an orphan), so wrapping
-- box.cfg would never trigger the initialization.
--
-- Returns whatever wrapper_impl() returns.
local function running_waiter()
    fiber.name('queue running waiter')
    local wait_cond = fiber.cond()
    -- The watcher records the observed state in a flag instead of relying
    -- on the signal alone: a signal sent before the wait starts would be
    -- lost, and a wakeup that is not caused by the watcher must not start
    -- the initialization while the instance is still not running.
    local is_running = false
    local w = box.watch('box.status', function(_, new_status)
        if new_status.status == 'running' then
            is_running = true
            wait_cond:signal()
        end
    end)
    while not is_running do
        wait_cond:wait()
    end
    -- The watcher is needed only once: drop it before the initialization.
    w:unregister()
    return wrapper_impl()
end
84+
6585
local function cfg_wrapper(...)
6686
box.cfg = orig_cfg
6787
return wrapper_impl(...)
@@ -79,10 +99,15 @@ local function wrap_box_cfg()
7999
orig_cfg = box.cfg
80100
box.cfg = cfg_wrapper
81101
elseif type(box.cfg) == 'table' then
82-
-- box.cfg after the first box.cfg call
83-
local cfg_mt = getmetatable(box.cfg)
84-
orig_call = cfg_mt.__call
85-
cfg_mt.__call = cfg_call_wrapper
102+
if watchers_supported and box.info.status ~= 'running' then
103+
-- Wait for the running state and initialize the queue.
104+
fiber.new(running_waiter)
105+
else
106+
-- box.cfg after the first box.cfg call
107+
local cfg_mt = getmetatable(box.cfg)
108+
orig_call = cfg_mt.__call
109+
cfg_mt.__call = cfg_call_wrapper
110+
end
86111
else
87112
error('The box.cfg type is unexpected: ' .. type(box.cfg))
88113
end

t/230-orphan-not-stalling-init.t

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
#!/usr/bin/env tarantool
2+
3+
local test = require('tap').test('')
4+
local queue = require('queue')
5+
local tnt = require('t.tnt')
6+
local fio = require('fio')
7+
local fiber = require('fiber')
8+
9+
rawset(_G, 'queue', require('queue'))
10+
11+
local qc = require('queue.compat')
12+
if not qc.check_version({2, 10, 0}) then
13+
require('log').info('Tests skipped, tarantool version < 2.10.0')
14+
return
15+
end
16+
17+
local snapdir_optname = qc.snapdir_optname
18+
local logger_optname = qc.logger_optname
19+
20+
test:plan(1)
21+
22+
-- Scenario: start an extra replica, force it into the orphan state, load
-- the queue module there (the queue must stay in INIT), then let the
-- replica leave the orphan state and check that the queue reaches RUNNING
-- without another box.cfg call made by the queue user.
test:test('Check orphan mode not stalling queue', function(test)
    test:plan(4)
    tnt.cluster.cfg{}

    local dir_replica = fio.tempdir()
    -- NOTE(review): the replication URIs below (here and in the eval
    -- further down) look mangled by e-mail obfuscation ("[email protected]");
    -- restore the original "user:password@host" form from the repository
    -- before running this test.
    local cmd_replica = {
        arg[-1],
        '-e',
        [[
        box.cfg {
            replication = {
                'replicator:[email protected]:3399',
                'replicator:[email protected]:3398',
            },
            listen = '127.0.0.1:3396',
            wal_dir = ']] .. dir_replica .. '\'' ..
        ',' .. snapdir_optname() .. ' = \'' .. dir_replica .. '\'' ..
        ',' .. logger_optname() .. ' = \'' ..
        fio.pathjoin(dir_replica, 'tarantool.log') .. '\'' ..
        '}'
    }

    -- Keep the handle local (it used to leak into _G) so that the child
    -- process can be closed explicitly at the end of the test.
    local replica = require('popen').new(cmd_replica, {
        stdin = 'devnull',
        stdout = 'devnull',
        stderr = 'devnull',
    })

    -- True once the new replica (the third entry in box.info.replication)
    -- is being followed by this instance. The downstream field is checked
    -- for nil because it is indexed before the replica has connected.
    local function replica_follows()
        local peer = box.info.replication[3]
        return peer ~= nil and peer.downstream ~= nil and
            peer.downstream.status == 'follow'
    end

    -- Wait for replica to connect.
    local connect_attempts = 0
    while not replica_follows() do
        connect_attempts = connect_attempts + 1
        if connect_attempts == 30 then
            error('wait for replica connection')
        end
        fiber.sleep(0.1)
    end

    local conn = require('net.box').connect('127.0.0.1:3396')

    -- Reconfigure the replica with a connect quorum of 4 while only three
    -- peers are listed; the assertions below expect it to become an orphan.
    conn:eval([[
        box.cfg{
            replication = {
                'replicator:[email protected]:3399',
                'replicator:[email protected]:3398',
                'replicator:[email protected]:3396',
            },
            listen = '127.0.0.1:3397',
            replication_connect_quorum = 4,
        }
    ]])

    conn:eval('rawset(_G, "queue", require("queue"))')

    test:is(conn:call('queue.state'), 'INIT', 'check queue state')
    test:is(conn:call('box.info').ro, true, 'check read only')
    test:is(conn:call('box.info').ro_reason, 'orphan', 'check ro reason')

    -- Lower the quorum so that the replica can leave the orphan state.
    conn:eval('box.cfg{replication_connect_quorum = 2}')

    -- Poll the queue state: up to 10 checks, 0.1 s apart. The single
    -- assertion after the loop reports either the success or the timeout.
    local max_checks = 10
    local state
    for check = 1, max_checks do
        state = conn:call('queue.state')
        if state == 'RUNNING' then
            break
        end
        if check < max_checks then
            fiber.sleep(0.1)
        end
    end
    test:is(state, 'RUNNING', 'check queue state after orphan')

    -- Release the connection and stop the replica process spawned above.
    conn:close()
    replica:close()
end)
98+
99+
rawset(_G, 'queue', nil)
100+
tnt.finish()
101+
os.exit(test:check() and 0 or 1)
102+
-- vim: set ft=lua :

0 commit comments

Comments
 (0)