Skip to content

Commit 28454a0

Browse files
committed
Replica: Handle nodedowns and noproc when starting replica reader.
It is possible that the writer's node could go down in between querying the writer for its log overview and then starting the replica reader. This commit handles that potentially common scenario more gracefully.
1 parent d39da5a commit 28454a0

File tree

2 files changed

+30
-12
lines changed

2 files changed

+30
-12
lines changed

src/osiris_replica_reader.erl

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -67,19 +67,26 @@ stop(Pid) ->
6767
gen_server:cast(Pid, stop).
6868

6969
start(Node, ReplicaReaderConf) when is_map(ReplicaReaderConf) ->
70-
supervisor:start_child({osiris_replica_reader_sup, Node},
71-
#{id => make_ref(),
72-
start =>
70+
try
71+
supervisor:start_child({osiris_replica_reader_sup, Node},
72+
#{id => make_ref(),
73+
start =>
7374
{osiris_replica_reader, start_link,
7475
[ReplicaReaderConf]},
75-
%% replica readers should never be
76-
%% restarted by their sups
77-
%% instead they need to be re-started
78-
%% by their replica
79-
restart => temporary,
80-
shutdown => 5000,
81-
type => worker,
82-
modules => [osiris_replica_reader]}).
76+
%% replica readers should never be
77+
%% restarted by their sups
78+
%% instead they need to be re-started
79+
%% by their replica
80+
restart => temporary,
81+
shutdown => 5000,
82+
type => worker,
83+
modules => [osiris_replica_reader]})
84+
catch
85+
exit:{{nodedown, _} = Res, _Stack} ->
86+
{error, Res};
87+
exit:{noproc = Res, _Stack} ->
88+
{error, Res}
89+
end.
8390

8491
%%%===================================================================
8592
%%% gen_server callbacks

test/osiris_SUITE.erl

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,8 @@ all_tests() ->
8585
single_node_reader_counters,
8686
cluster_reader_counters,
8787
combine_ips_hosts_test,
88-
empty_last_segment].
88+
empty_last_segment,
89+
replica_reader_nodedown_noproc].
8990

9091
%% Isolated to avoid test interference
9192
ipv6_tests() ->
@@ -1895,6 +1896,16 @@ empty_last_segment(Config) ->
18951896
?assert(erlang:is_process_alive(Leader2)),
18961897
ok.
18971898

1899+
replica_reader_nodedown_noproc(_Config) ->
1900+
%% unit test to ensure we handle down nodes gracefully.
1901+
{error, {nodedown, 'banana@fruit'}} =
1902+
osiris_replica_reader:start('banana@fruit', #{}),
1903+
1904+
_ = application:stop(osiris),
1905+
{error, noproc} =
1906+
osiris_replica_reader:start(node(), #{}),
1907+
ok.
1908+
18981909
%% Utility
18991910

19001911
write_n(Pid, N, Written) ->

0 commit comments

Comments
 (0)