From 2f301b4a38415aaf1f4e33c452764ee56c5d57b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-S=C3=A9bastien=20P=C3=A9dron?= Date: Wed, 4 Jan 2023 16:18:49 +0100 Subject: [PATCH] rabbit_feature_flags: Sync enabled feature flags differently on virgin node If a virgin node starts as clustered (thanks to peer discovery), we need to mark feature flags already enabled remotely as enabled locally too. We can't do a regular cluster sync because remote nodes may have required feature flags which are disabled locally on the virgin node. Therefore, those nodes don't have the migration code for these feature flags anymore and the feature flags' state can't be changed. By doing this special sync, we allow a clustered virgin node to join a cluster with remote nodes having required feature flags. --- deps/rabbit/src/rabbit_feature_flags.erl | 54 ++++- deps/rabbit/src/rabbit_ff_controller.erl | 103 +++++++- deps/rabbit/src/rabbit_mnesia.erl | 1 + deps/rabbit/test/feature_flags_v2_SUITE.erl | 253 +++++++++++++++++++- 4 files changed, 401 insertions(+), 10 deletions(-) diff --git a/deps/rabbit/src/rabbit_feature_flags.erl b/deps/rabbit/src/rabbit_feature_flags.erl index f68bd5e4b4d0..d61a882c4a7e 100644 --- a/deps/rabbit/src/rabbit_feature_flags.erl +++ b/deps/rabbit/src/rabbit_feature_flags.erl @@ -1872,7 +1872,59 @@ sync_cluster_v1([], NodeIsVirgin, _) -> "current state"), ok end; -sync_cluster_v1(Nodes, _, Timeout) -> +sync_cluster_v1(Nodes, true = _NodeIsVirgin, Timeout) -> + [Node | _] = Nodes, + rabbit_log_feature_flags:debug( + "Feature flags: marking feature flags required remotely as enabled " + "locally because this node is virgin; using ~s as the remote node", + [Node]), + ?assertNotEqual(node(), Node), + Ret1 = rpc:call(Node, ?MODULE, list, [enabled], Timeout), + case Ret1 of + FeatureFlags when is_map(FeatureFlags) -> + %% We only consider required feature flags (for which there is no + %% migration code on nodes where it is required). Remotely enabled + %% feature flags will be handled by the regular sync. + RequiredFeatureFlags = + maps:filter( + fun(_, FeatureProps) -> + required =:= get_stability(FeatureProps) + end, FeatureFlags), + FeatureNames = lists:sort(maps:keys(RequiredFeatureFlags)), + rabbit_log_feature_flags:error( + "Feature flags: list of feature flags to automatically mark as " + "enabled on this virgin node: ~0p", + [FeatureNames]), + + %% We mark them as enabled directly (no migration code is executed) + %% because this node is virgin (i.e. no files on disk) and will + %% inherit the database from remote nodes. + Ret2 = + lists:foldl( + fun + (FeatureName, ok) -> + case is_supported_locally(FeatureName) of + true -> mark_as_enabled_locally(FeatureName, true); + false -> ok + end; + (_, Acc) -> + Acc + end, ok, FeatureNames), + + %% We continue with the regular sync process. It will handle other + %% enabled feature flags and unsupported ones too. + case Ret2 of + ok -> sync_feature_flags_with_cluster(Nodes, false, Timeout); + _ -> Ret2 + end; + {badrpc, _} = Reason -> + rabbit_log_feature_flags:error( + "Feature flags: failed to query remotely enabled feature flags " + "on node ~s: ~p", + [Node, Reason]), + {error, Reason} + end; +sync_cluster_v1(Nodes, _NodeIsVirgin, Timeout) -> case sync_feature_flags_v2_first(Nodes, Timeout) of true -> ?LOG_DEBUG( diff --git a/deps/rabbit/src/rabbit_ff_controller.erl b/deps/rabbit/src/rabbit_ff_controller.erl index c315dd34c6b8..a4a796e7edb9 100644 --- a/deps/rabbit/src/rabbit_ff_controller.erl +++ b/deps/rabbit/src/rabbit_ff_controller.erl @@ -577,7 +577,7 @@ enable_with_registry_locked( [FeatureName], #{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}), - case update_feature_state_and_enable(Inventory, FeatureName) of + case check_required_and_enable(Inventory, FeatureName) of {ok, _Inventory} = Ok -> ?LOG_NOTICE( "Feature flags: `~ts` enabled", @@ -593,6 +593,71 @@ enable_with_registry_locked( end end. +-spec check_required_and_enable(Inventory, FeatureName) -> Ret when + Inventory :: rabbit_feature_flags:cluster_inventory(), + FeatureName :: rabbit_feature_flags:feature_name(), + Ret :: {ok, Inventory} | {error, Reason}, + Reason :: term(). + +check_required_and_enable( + #{feature_flags := FeatureFlags, + states_per_node := _} = Inventory, + FeatureName) -> + %% Required feature flags vs. virgin nodes. + FeatureProps = maps:get(FeatureName, FeatureFlags), + Stability = rabbit_feature_flags:get_stability(FeatureProps), + NodesWhereDisabled = list_nodes_where_feature_flag_is_disabled( + Inventory, FeatureName), + + MarkDirectly = case Stability of + required -> + ?LOG_DEBUG( + "Feature flags: `~s`: the feature flag is " + "required on some nodes; list virgin nodes " + "to determine if the feature flag can simply " + "be marked as enabled", + [FeatureName], + #{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}), + VirginNodesWhereDisabled = + lists:filter( + fun(Node) -> + case is_virgin_node(Node) of + IsVirgin when is_boolean(IsVirgin) -> + IsVirgin; + undefined -> + false + end + end, NodesWhereDisabled), + VirginNodesWhereDisabled =:= NodesWhereDisabled; + _ -> + false + end, + + case MarkDirectly of + false -> + case Stability of + required -> + ?LOG_DEBUG( + "Feature flags: `~s`: some nodes where the feature " + "flag is disabled are not virgin, we need to perform " + "a regular sync", + [FeatureName], + #{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}); + _ -> + ok + end, + update_feature_state_and_enable(Inventory, FeatureName); + true -> + ?LOG_DEBUG( + "Feature flags: `~s`: all nodes where the feature flag is " + "disabled are virgin, we can directly mark it as enabled " + "there", + [FeatureName], + #{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}), + mark_as_enabled_on_nodes( + NodesWhereDisabled, Inventory, FeatureName, true) + end. + -spec update_feature_state_and_enable(Inventory, FeatureName) -> Ret when Inventory :: rabbit_feature_flags:cluster_inventory(), FeatureName :: rabbit_feature_flags:feature_name(), @@ -646,6 +711,14 @@ update_feature_state_and_enable( Error end. +is_virgin_node(Node) -> + case rpc_call(Node, rabbit_mnesia, is_virgin_node, [], ?TIMEOUT) of + IsVirgin when is_boolean(IsVirgin) -> + IsVirgin; + {error, _} -> + undefined + end. + -spec do_enable(Inventory, FeatureName, Nodes) -> Ret when Inventory :: rabbit_feature_flags:cluster_inventory(), FeatureName :: rabbit_feature_flags:feature_name(), @@ -740,7 +813,7 @@ collect_inventory_on_nodes(Nodes, Timeout) -> {ok, #{feature_flags := FeatureFlags, applications_per_node := ScannedAppsPerNode, states_per_node := StatesPerNode} = Inventory}) -> - FeatureFlags2 = maps:merge(FeatureFlags, FeatureFlags1), + FeatureFlags2 = merge_feature_flags(FeatureFlags, FeatureFlags1), ScannedAppsPerNode1 = ScannedAppsPerNode#{Node => ScannedApps}, StatesPerNode1 = StatesPerNode#{Node => FeatureStates}, Inventory1 = Inventory#{ @@ -756,6 +829,32 @@ collect_inventory_on_nodes(Nodes, Timeout) -> Error end, {ok, Inventory0}, Rets). +merge_feature_flags(FeatureFlagsA, FeatureFlagsB) -> + FeatureFlags = maps:merge(FeatureFlagsA, FeatureFlagsB), + maps:map( + fun(FeatureName, FeatureProps) -> + FeaturePropsA = maps:get(FeatureName, FeatureFlagsA, #{}), + FeaturePropsB = maps:get(FeatureName, FeatureFlagsB, #{}), + + %% There is a rank between stability levels. If a feature flag + %% is required somewhere, it is required globally. Otherwise if + %% it is stable somewhere, it is stable globally. + StabilityA = rabbit_feature_flags:get_stability(FeaturePropsA), + StabilityB = rabbit_feature_flags:get_stability(FeaturePropsB), + Stability = case {StabilityA, StabilityB} of + {required, _} -> required; + {_, required} -> required; + {stable, _} -> stable; + {_, stable} -> stable; + _ -> experimental + end, + + FeatureProps1 = FeatureProps#{stability => Stability}, + FeatureProps2 = maps:remove(migration_fun, FeatureProps1), + FeatureProps3 = maps:remove(callbacks, FeatureProps2), + FeatureProps3 + end, FeatureFlags). + -spec list_feature_flags_enabled_somewhere(Inventory, HandleStateChanging) -> Ret when Inventory :: rabbit_feature_flags:cluster_inventory(), diff --git a/deps/rabbit/src/rabbit_mnesia.erl b/deps/rabbit/src/rabbit_mnesia.erl index ced7fa14b2ab..f90d8f09edaa 100644 --- a/deps/rabbit/src/rabbit_mnesia.erl +++ b/deps/rabbit/src/rabbit_mnesia.erl @@ -25,6 +25,7 @@ is_registered_process_alive/1, cluster_nodes/1, node_type/0, + is_virgin_node/0, dir/0, cluster_status_from_mnesia/0, diff --git a/deps/rabbit/test/feature_flags_v2_SUITE.erl b/deps/rabbit/test/feature_flags_v2_SUITE.erl index 66e459eb4d2e..e72d9a465fb5 100644 --- a/deps/rabbit/test/feature_flags_v2_SUITE.erl +++ b/deps/rabbit/test/feature_flags_v2_SUITE.erl @@ -39,7 +39,9 @@ enable_feature_flag_in_cluster_and_add_member_concurrently_mfv2/1, enable_feature_flag_in_cluster_and_remove_member_concurrently_mfv1/1, enable_feature_flag_in_cluster_and_remove_member_concurrently_mfv2/1, - enable_feature_flag_with_post_enable/1 + enable_feature_flag_with_post_enable/1, + have_required_feature_flag_in_cluster_and_add_member_with_it_disabled/1, + have_required_feature_flag_in_cluster_and_add_member_without_it/1 ]). suite() -> @@ -70,7 +72,9 @@ groups() -> enable_feature_flag_in_cluster_and_add_member_concurrently_mfv2, enable_feature_flag_in_cluster_and_remove_member_concurrently_mfv1, enable_feature_flag_in_cluster_and_remove_member_concurrently_mfv2, - enable_feature_flag_with_post_enable + enable_feature_flag_with_post_enable, + have_required_feature_flag_in_cluster_and_add_member_with_it_disabled, + have_required_feature_flag_in_cluster_and_add_member_without_it ]} ], [ @@ -145,7 +149,7 @@ start_slave_node(Parent, Config, Testcase, N) -> TestCodePath = filename:dirname(code:which(?MODULE)), true = rpc:call(Node, code, add_path, [TestCodePath]), - ok = run_on_node(Node, fun setup_slave_node/1, [Config]), + ok = run_on_node(Node, fun setup_slave_node/2, [Config, Testcase]), ct:pal("- Slave node `~ts` configured", [Node]), Parent ! {node, self(), Node}. @@ -177,24 +181,53 @@ run_on_node(Node, Fun, Args) -> %% Slave node configuration. %% ------------------------------------------------------------------- -setup_slave_node(Config) -> +setup_slave_node(Config, Testcase) -> ok = setup_logger(), + ok = setup_data_dir(Config), ok = setup_feature_flags_file(Config), ok = start_controller(), - ok = maybe_enable_feature_flags_v2(Config), + case Testcase of + have_required_feature_flag_in_cluster_and_add_member_with_it_disabled + -> + %% This testcase handles the `feature_flags_v2' state itself. + ok; + have_required_feature_flag_in_cluster_and_add_member_without_it + -> + %% This testcase handles the `feature_flags_v2' state itself. + ok; + _ -> + ok = maybe_enable_feature_flags_v2(Config) + end, ok. setup_logger() -> logger:set_primary_config(level, debug), ok. +setup_data_dir(Config) -> + %% The data directory is set to a specific location in the test log + %% directory. + PrivDir = ?config(priv_dir, Config), + DataDir = filename:join( + PrivDir, + rabbit_misc:format("data-~ts", [node()])), + ?LOG_INFO("Setting `data_dir` to \"~ts\"", [DataDir]), + case application:load(rabbit) of + ok -> ok; + {error, {already_loaded, _}} -> ok + end, + ok = application:set_env(rabbit, data_dir, DataDir), + ok = application:set_env(mnesia, dir, DataDir), + ok. + setup_feature_flags_file(Config) -> %% The `feature_flags_file' is set to a specific location in the test log %% directory. + PrivDir = ?config(priv_dir, Config), FeatureFlagsFile = filename:join( - ?config(priv_dir, Config), + PrivDir, rabbit_misc:format("feature_flags-~ts", [node()])), - ?LOG_INFO("Setting `feature_flags_file to \"~ts\"", [FeatureFlagsFile]), + ?LOG_INFO("Setting `feature_flags_file` to \"~ts\"", [FeatureFlagsFile]), case application:load(rabbit) of ok -> ok; {error, {already_loaded, _}} -> ok @@ -1534,3 +1567,209 @@ enable_feature_flag_with_post_enable(Config) -> []) || Node <- AllNodes], ok. + +have_required_feature_flag_in_cluster_and_add_member_with_it_disabled( + Config) -> + AllNodes = [NewNode | [FirstNode | _] = Nodes] = ?config(nodes, Config), + connect_nodes(Nodes), + override_running_nodes([NewNode]), + override_running_nodes(Nodes), + + FeatureName = ?FUNCTION_NAME, + FeatureFlags = #{FeatureName => + #{provided_by => ?MODULE, + stability => stable}}, + RequiredFeatureFlags = #{FeatureName => + #{provided_by => ?MODULE, + stability => required}}, + inject_on_nodes([NewNode], FeatureFlags), + inject_on_nodes(Nodes, RequiredFeatureFlags), + + ct:pal( + "Checking the feature flag is supported everywhere but enabled on the " + "existing cluster only"), + ok = run_on_node( + NewNode, + fun() -> + ?assert(rabbit_feature_flags:is_supported(FeatureName)), + ?assertNot(rabbit_feature_flags:is_enabled(FeatureName)), + + ok = maybe_enable_feature_flags_v2(Config), + ok + end, + []), + _ = [ok = + run_on_node( + Node, + fun() -> + ?assert(rabbit_feature_flags:is_supported(FeatureName)), + ?assert(rabbit_feature_flags:is_enabled(FeatureName)), + + %% We enable `feature_flags_v2' on the cluster, regardless + %% of the common_test group. The reason is that we want to + %% test that a bug in the sync between a + %% feature_flags_v2-based cluster and a + %% feature_flags_v1-based joining virgin node is fixed. + %% + %% See: + %% https://github.com/rabbitmq/rabbitmq-server/pull/6791 + ok = rabbit_feature_flags:enable(feature_flags_v2), + ok + end, + []) + || Node <- Nodes], + + %% Check compatibility between NewNodes and Nodes. + ok = run_on_node( + NewNode, + fun() -> + ?assertEqual( + ok, + rabbit_feature_flags:check_node_compatibility( + FirstNode)), + ok + end, []), + + %% Add node to cluster and synchronize feature flags. + connect_nodes(AllNodes), + override_running_nodes(AllNodes), + ct:pal( + "Synchronizing feature flags in the expanded cluster~n" + "~n" + "NOTE: Error messages about crashed migration functions can be " + "ignored for feature~n" + " flags other than `~ts`~n" + " because they assume they run inside RabbitMQ.", + [FeatureName]), + ok = run_on_node( + NewNode, + fun() -> + ?assertEqual( + ok, + rabbit_feature_flags:sync_feature_flags_with_cluster( + Nodes, true)), + ok + end, []), + + ct:pal("Checking the feature flag is enabled in the expanded cluster"), + _ = [ok = + run_on_node( + Node, + fun() -> + ?assert(rabbit_feature_flags:is_enabled(FeatureName)), + ok + end, + []) + || Node <- AllNodes], + ok. + +have_required_feature_flag_in_cluster_and_add_member_without_it( + Config) -> + AllNodes = [NewNode | [FirstNode | _] = Nodes] = ?config(nodes, Config), + connect_nodes(Nodes), + override_running_nodes([NewNode]), + override_running_nodes(Nodes), + + FeatureName = ?FUNCTION_NAME, + FeatureFlags = #{FeatureName => + #{provided_by => ?MODULE, + stability => stable}}, + RequiredFeatureFlags = #{FeatureName => + #{provided_by => ?MODULE, + stability => required}}, + inject_on_nodes([NewNode], FeatureFlags), + inject_on_nodes(Nodes, RequiredFeatureFlags), + + ct:pal( + "Checking the feature flag is supported and enabled on existing the " + "cluster only"), + ok = run_on_node( + NewNode, + fun() -> + ?assert(rabbit_feature_flags:is_supported(FeatureName)), + ?assertNot(rabbit_feature_flags:is_enabled(FeatureName)), + + ok = maybe_enable_feature_flags_v2(Config), + + MnesiaDir = rabbit_mnesia:dir(), + ok = filelib:ensure_path(MnesiaDir), + SomeFile = filename:join(MnesiaDir, "some-mnesia-file.db"), + ok = file:write_file(SomeFile, <<>>), + ?assertNot(rabbit_mnesia:is_virgin_node()), + ok + end, + []), + _ = [ok = + run_on_node( + Node, + fun() -> + ?assert(rabbit_feature_flags:is_supported(FeatureName)), + ?assert(rabbit_feature_flags:is_enabled(FeatureName)), + + %% We enable `feature_flags_v2' on the cluster, regardless + %% of the common_test group. The reason is that we want to + %% test that a bug in the sync between a + %% feature_flags_v2-based cluster and a + %% feature_flags_v1-based joining virgin node is fixed. + %% + %% See: + %% https://github.com/rabbitmq/rabbitmq-server/pull/6791 + ok = rabbit_feature_flags:enable(feature_flags_v2), + ok + end, + []) + || Node <- Nodes], + + %% Check compatibility between NewNodes and Nodes. + ok = run_on_node( + NewNode, + fun() -> + ?assertEqual( + ok, + rabbit_feature_flags:check_node_compatibility( + FirstNode)), + ok + end, []), + + %% Add node to cluster and synchronize feature flags. + connect_nodes(AllNodes), + override_running_nodes(AllNodes), + ct:pal( + "Synchronizing feature flags in the expanded cluster~n" + "~n" + "NOTE: Error messages about crashed migration functions can be " + "ignored for feature~n" + " flags other than `~ts`~n" + " because they assume they run inside RabbitMQ.", + [FeatureName]), + ok = run_on_node( + NewNode, + fun() -> + ?assertMatch( + {error, + {badrpc, + {'EXIT', + {{assertNotEqual, + [{module, rabbit_ff_registry_factory}, + {line, _}, + {expression, "State"}, + {value, state_changing}]}, + _}}}}, + rabbit_feature_flags:sync_feature_flags_with_cluster( + Nodes, false)), + ok + end, []), + + ct:pal("Checking the feature flag state is unchanged"), + _ = [ok = + run_on_node( + Node, + fun() -> + ?assertEqual( + Node =/= NewNode, + rabbit_feature_flags:is_enabled(FeatureName)), + ok + end, + []) + || Node <- AllNodes], + ok.