Skip to content

Commit 854cbb4

Browse files
committed
rabbit_feature_flags: Sync enabled feature flags differently on virgin node
If a virgin node starts as clustered (thanks to peer discovery), we need to mark feature flags already enabled remotely as enabled locally too. We can't do a regular cluster sync because remote nodes may have required feature flags which are disabled locally on the virgin node. Therefore, those nodes don't have the migration code for these feature flags anymore and the feature flags' state can't be changed. By doing this special sync, we allow a clustered virgin node to join a cluster with remote nodes having required feature flags.
1 parent 5a8530c commit 854cbb4

File tree

4 files changed

+261
-8
lines changed

4 files changed

+261
-8
lines changed

deps/rabbit/src/rabbit_feature_flags.erl

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1872,7 +1872,59 @@ sync_cluster_v1([], NodeIsVirgin, _) ->
18721872
"current state"),
18731873
ok
18741874
end;
1875-
sync_cluster_v1(Nodes, _, Timeout) ->
1875+
sync_cluster_v1(Nodes, true = _NodeIsVirgin, Timeout) ->
1876+
[Node | _] = Nodes,
1877+
rabbit_log_feature_flags:debug(
1878+
"Feature flags: marking feature flags required remotely as enabled "
1879+
"locally because this node is virgin; using ~s as the remote node",
1880+
[Node]),
1881+
?assertNotEqual(node(), Node),
1882+
Ret1 = rpc:call(Node, ?MODULE, list, [enabled], Timeout),
1883+
case Ret1 of
1884+
FeatureFlags when is_map(FeatureFlags) ->
1885+
%% We only consider required feature flags (for which there is no
1886+
%% migration code on nodes where it is required). Remotely enabled
1887+
%% feature flags will be handled by the regular sync.
1888+
RequiredFeatureFlags =
1889+
maps:filter(
1890+
fun(_, FeatureProps) ->
1891+
required =:= get_stability(FeatureProps)
1892+
end, FeatureFlags),
1893+
FeatureNames = lists:sort(maps:keys(RequiredFeatureFlags)),
1894+
rabbit_log_feature_flags:error(
1895+
"Feature flags: list of feature flags to automatically mark as "
1896+
"enabled on this virgin node: ~0p",
1897+
[FeatureNames]),
1898+
1899+
%% We mark them as enabled directly (no migration code is executed)
1900+
%% because this node is virgin (i.e. no files on disk) and will
1901+
%% inherit the database from remote nodes.
1902+
Ret2 =
1903+
lists:foldl(
1904+
fun
1905+
(FeatureName, ok) ->
1906+
case is_supported_locally(FeatureName) of
1907+
true -> mark_as_enabled_locally(FeatureName, true);
1908+
false -> ok
1909+
end;
1910+
(_, Acc) ->
1911+
Acc
1912+
end, ok, FeatureNames),
1913+
1914+
%% We continue with the regular sync process. It will handle other
1915+
%% enabled feature flags and unsupported ones too.
1916+
case Ret2 of
1917+
ok -> sync_feature_flags_with_cluster(Nodes, false, Timeout);
1918+
_ -> Ret2
1919+
end;
1920+
{badrpc, _} = Reason ->
1921+
rabbit_log_feature_flags:error(
1922+
"Feature flags: failed to query remotely enabled feature flags "
1923+
"on node ~s: ~p",
1924+
[Node, Reason]),
1925+
{error, Reason}
1926+
end;
1927+
sync_cluster_v1(Nodes, _NodeIsVirgin, Timeout) ->
18761928
case sync_feature_flags_v2_first(Nodes, Timeout) of
18771929
true ->
18781930
?LOG_DEBUG(

deps/rabbit/src/rabbit_ff_controller.erl

Lines changed: 101 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -577,7 +577,7 @@ enable_with_registry_locked(
577577
[FeatureName],
578578
#{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
579579

580-
case update_feature_state_and_enable(Inventory, FeatureName) of
580+
case check_required_and_enable(Inventory, FeatureName) of
581581
{ok, _Inventory} = Ok ->
582582
?LOG_NOTICE(
583583
"Feature flags: `~ts` enabled",
@@ -593,6 +593,71 @@ enable_with_registry_locked(
593593
end
594594
end.
595595

596+
-spec check_required_and_enable(Inventory, FeatureName) -> Ret when
597+
Inventory :: rabbit_feature_flags:cluster_inventory(),
598+
FeatureName :: rabbit_feature_flags:feature_name(),
599+
Ret :: {ok, Inventory} | {error, Reason},
600+
Reason :: term().
601+
602+
check_required_and_enable(
603+
#{feature_flags := FeatureFlags,
604+
states_per_node := _} = Inventory,
605+
FeatureName) ->
606+
%% Required feature flags vs. virgin nodes.
607+
FeatureProps = maps:get(FeatureName, FeatureFlags),
608+
Stability = rabbit_feature_flags:get_stability(FeatureProps),
609+
NodesWhereDisabled = list_nodes_where_feature_flag_is_disabled(
610+
Inventory, FeatureName),
611+
612+
MarkDirectly = case Stability of
613+
required ->
614+
?LOG_DEBUG(
615+
"Feature flags: `~s`: the feature flag is "
616+
"required on some nodes; list virgin nodes "
617+
"to determine if the feature flag can simply "
618+
"be marked as enabled",
619+
[FeatureName],
620+
#{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
621+
VirginNodesWhereDisabled =
622+
lists:filter(
623+
fun(Node) ->
624+
case is_virgin_node(Node) of
625+
IsVirgin when is_boolean(IsVirgin) ->
626+
IsVirgin;
627+
undefined ->
628+
false
629+
end
630+
end, NodesWhereDisabled),
631+
VirginNodesWhereDisabled =:= NodesWhereDisabled;
632+
_ ->
633+
false
634+
end,
635+
636+
case MarkDirectly of
637+
false ->
638+
case Stability of
639+
required ->
640+
?LOG_DEBUG(
641+
"Feature flags: `~s`: some nodes where the feature "
642+
"flag is disabled are not virgin, we need to perform "
643+
"a regular sync",
644+
[FeatureName],
645+
#{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS});
646+
_ ->
647+
ok
648+
end,
649+
update_feature_state_and_enable(Inventory, FeatureName);
650+
true ->
651+
?LOG_DEBUG(
652+
"Feature flags: `~s`: all nodes where the feature flag is "
653+
"disabled are virgin, we can directly mark it as enabled "
654+
"there",
655+
[FeatureName],
656+
#{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
657+
mark_as_enabled_on_nodes(
658+
NodesWhereDisabled, Inventory, FeatureName, true)
659+
end.
660+
596661
-spec update_feature_state_and_enable(Inventory, FeatureName) -> Ret when
597662
Inventory :: rabbit_feature_flags:cluster_inventory(),
598663
FeatureName :: rabbit_feature_flags:feature_name(),
@@ -646,6 +711,14 @@ update_feature_state_and_enable(
646711
Error
647712
end.
648713

714+
is_virgin_node(Node) ->
715+
case rpc_call(Node, rabbit_mnesia, is_virgin_node, [], ?TIMEOUT) of
716+
IsVirgin when is_boolean(IsVirgin) ->
717+
IsVirgin;
718+
{error, _} ->
719+
undefined
720+
end.
721+
649722
-spec do_enable(Inventory, FeatureName, Nodes) -> Ret when
650723
Inventory :: rabbit_feature_flags:cluster_inventory(),
651724
FeatureName :: rabbit_feature_flags:feature_name(),
@@ -740,7 +813,7 @@ collect_inventory_on_nodes(Nodes, Timeout) ->
740813
{ok, #{feature_flags := FeatureFlags,
741814
applications_per_node := ScannedAppsPerNode,
742815
states_per_node := StatesPerNode} = Inventory}) ->
743-
FeatureFlags2 = maps:merge(FeatureFlags, FeatureFlags1),
816+
FeatureFlags2 = merge_feature_flags(FeatureFlags, FeatureFlags1),
744817
ScannedAppsPerNode1 = ScannedAppsPerNode#{Node => ScannedApps},
745818
StatesPerNode1 = StatesPerNode#{Node => FeatureStates},
746819
Inventory1 = Inventory#{
@@ -756,6 +829,32 @@ collect_inventory_on_nodes(Nodes, Timeout) ->
756829
Error
757830
end, {ok, Inventory0}, Rets).
758831

832+
merge_feature_flags(FeatureFlagsA, FeatureFlagsB) ->
833+
FeatureFlags = maps:merge(FeatureFlagsA, FeatureFlagsB),
834+
maps:map(
835+
fun(FeatureName, FeatureProps) ->
836+
FeaturePropsA = maps:get(FeatureName, FeatureFlagsA, #{}),
837+
FeaturePropsB = maps:get(FeatureName, FeatureFlagsB, #{}),
838+
839+
%% There is a rank between stability levels. If a feature flag
840+
%% is required somewhere, it is required globally. Otherwise if
841+
%% it is stable somewhere, it is stable globally.
842+
StabilityA = rabbit_feature_flags:get_stability(FeaturePropsA),
843+
StabilityB = rabbit_feature_flags:get_stability(FeaturePropsB),
844+
Stability = case {StabilityA, StabilityB} of
845+
{required, _} -> required;
846+
{_, required} -> required;
847+
{stable, _} -> stable;
848+
{_, stable} -> stable;
849+
_ -> experimental
850+
end,
851+
852+
FeatureProps1 = FeatureProps#{stability => Stability},
853+
FeatureProps2 = maps:remove(migration_fun, FeatureProps1),
854+
FeatureProps3 = maps:remove(callbacks, FeatureProps2),
855+
FeatureProps3
856+
end, FeatureFlags).
857+
759858
-spec list_feature_flags_enabled_somewhere(Inventory, HandleStateChanging) ->
760859
Ret when
761860
Inventory :: rabbit_feature_flags:cluster_inventory(),

deps/rabbit/src/rabbit_mnesia.erl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
is_registered_process_alive/1,
2626
cluster_nodes/1,
2727
node_type/0,
28+
is_virgin_node/0,
2829
dir/0,
2930
cluster_status_from_mnesia/0,
3031

deps/rabbit/test/feature_flags_v2_SUITE.erl

Lines changed: 106 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,8 @@
3939
enable_feature_flag_in_cluster_and_add_member_concurrently_mfv2/1,
4040
enable_feature_flag_in_cluster_and_remove_member_concurrently_mfv1/1,
4141
enable_feature_flag_in_cluster_and_remove_member_concurrently_mfv2/1,
42-
enable_feature_flag_with_post_enable/1
42+
enable_feature_flag_with_post_enable/1,
43+
have_required_feature_flag_in_cluster_and_add_member_with_it_disabled/1
4344
]).
4445

4546
suite() ->
@@ -70,7 +71,8 @@ groups() ->
7071
enable_feature_flag_in_cluster_and_add_member_concurrently_mfv2,
7172
enable_feature_flag_in_cluster_and_remove_member_concurrently_mfv1,
7273
enable_feature_flag_in_cluster_and_remove_member_concurrently_mfv2,
73-
enable_feature_flag_with_post_enable
74+
enable_feature_flag_with_post_enable,
75+
have_required_feature_flag_in_cluster_and_add_member_with_it_disabled
7476
]}
7577
],
7678
[
@@ -145,7 +147,7 @@ start_slave_node(Parent, Config, Testcase, N) ->
145147

146148
TestCodePath = filename:dirname(code:which(?MODULE)),
147149
true = rpc:call(Node, code, add_path, [TestCodePath]),
148-
ok = run_on_node(Node, fun setup_slave_node/1, [Config]),
150+
ok = run_on_node(Node, fun setup_slave_node/2, [Config, Testcase]),
149151
ct:pal("- Slave node `~ts` configured", [Node]),
150152
Parent ! {node, self(), Node}.
151153

@@ -177,11 +179,18 @@ run_on_node(Node, Fun, Args) ->
177179
%% Slave node configuration.
178180
%% -------------------------------------------------------------------
179181

180-
setup_slave_node(Config) ->
182+
setup_slave_node(Config, Testcase) ->
181183
ok = setup_logger(),
182184
ok = setup_feature_flags_file(Config),
183185
ok = start_controller(),
184-
ok = maybe_enable_feature_flags_v2(Config),
186+
case Testcase of
187+
have_required_feature_flag_in_cluster_and_add_member_with_it_disabled
188+
->
189+
%% This testcase handles the `feature_flags_v2' state itself.
190+
ok;
191+
_ ->
192+
ok = maybe_enable_feature_flags_v2(Config)
193+
end,
185194
ok.
186195

187196
setup_logger() ->
@@ -1534,3 +1543,95 @@ enable_feature_flag_with_post_enable(Config) ->
15341543
[])
15351544
|| Node <- AllNodes],
15361545
ok.
1546+
1547+
have_required_feature_flag_in_cluster_and_add_member_with_it_disabled(
1548+
Config) ->
1549+
AllNodes = [NewNode | [FirstNode | _] = Nodes] = ?config(nodes, Config),
1550+
connect_nodes(Nodes),
1551+
override_running_nodes([NewNode]),
1552+
override_running_nodes(Nodes),
1553+
1554+
FeatureName = ?FUNCTION_NAME,
1555+
FeatureFlags = #{FeatureName =>
1556+
#{provided_by => ?MODULE,
1557+
stability => stable}},
1558+
RequiredFeatureFlags = #{FeatureName =>
1559+
#{provided_by => ?MODULE,
1560+
stability => required}},
1561+
inject_on_nodes([NewNode], FeatureFlags),
1562+
inject_on_nodes(Nodes, RequiredFeatureFlags),
1563+
1564+
ct:pal(
1565+
"Checking the feature flag is supported but enabled on existing "
1566+
"cluster only"),
1567+
ok = run_on_node(
1568+
NewNode,
1569+
fun() ->
1570+
?assert(rabbit_feature_flags:is_supported(FeatureName)),
1571+
?assertNot(rabbit_feature_flags:is_enabled(FeatureName)),
1572+
1573+
ok = maybe_enable_feature_flags_v2(Config),
1574+
ok
1575+
end,
1576+
[]),
1577+
_ = [ok =
1578+
run_on_node(
1579+
Node,
1580+
fun() ->
1581+
?assert(rabbit_feature_flags:is_supported(FeatureName)),
1582+
?assert(rabbit_feature_flags:is_enabled(FeatureName)),
1583+
1584+
%% We enable `feature_flags_v2' on the cluster, regardless
1585+
%% of the common_test group. The reason is that we want to
1586+
%% test that a bug in the sync between a
1587+
%% feature_flags_v2-based cluster and a
1588+
%% feature_flags_v1-based joining virgin node is fixed.
1589+
ok = rabbit_feature_flags:enable(feature_flags_v2),
1590+
ok
1591+
end,
1592+
[])
1593+
|| Node <- Nodes],
1594+
1595+
%% Check compatibility between NewNodes and Nodes.
1596+
ok = run_on_node(
1597+
NewNode,
1598+
fun() ->
1599+
?assertEqual(
1600+
ok,
1601+
rabbit_feature_flags:check_node_compatibility(
1602+
FirstNode)),
1603+
ok
1604+
end, []),
1605+
1606+
%% Add node to cluster and synchronize feature flags.
1607+
connect_nodes(AllNodes),
1608+
override_running_nodes(AllNodes),
1609+
ct:pal(
1610+
"Synchronizing feature flags in the expanded cluster~n"
1611+
"~n"
1612+
"NOTE: Error messages about crashed migration functions can be "
1613+
"ignored for feature~n"
1614+
" flags other than `~ts`~n"
1615+
" because they assume they run inside RabbitMQ.",
1616+
[FeatureName]),
1617+
ok = run_on_node(
1618+
NewNode,
1619+
fun() ->
1620+
?assertEqual(
1621+
ok,
1622+
rabbit_feature_flags:sync_feature_flags_with_cluster(
1623+
Nodes, true)),
1624+
ok
1625+
end, []),
1626+
1627+
ct:pal("Checking the feature flag is enabled in the expanded cluster"),
1628+
_ = [ok =
1629+
run_on_node(
1630+
Node,
1631+
fun() ->
1632+
?assert(rabbit_feature_flags:is_enabled(FeatureName)),
1633+
ok
1634+
end,
1635+
[])
1636+
|| Node <- AllNodes],
1637+
ok.

0 commit comments

Comments
 (0)