diff --git a/deps/rabbit/src/rabbit_feature_flags.erl b/deps/rabbit/src/rabbit_feature_flags.erl index f68bd5e4b4d0..d61a882c4a7e 100644 --- a/deps/rabbit/src/rabbit_feature_flags.erl +++ b/deps/rabbit/src/rabbit_feature_flags.erl @@ -1872,7 +1872,59 @@ sync_cluster_v1([], NodeIsVirgin, _) -> "current state"), ok end; -sync_cluster_v1(Nodes, _, Timeout) -> +sync_cluster_v1(Nodes, true = _NodeIsVirgin, Timeout) -> + [Node | _] = Nodes, + rabbit_log_feature_flags:debug( + "Feature flags: marking feature flags required remotely as enabled " + "locally because this node is virgin; using ~s as the remote node", + [Node]), + ?assertNotEqual(node(), Node), + Ret1 = rpc:call(Node, ?MODULE, list, [enabled], Timeout), + case Ret1 of + FeatureFlags when is_map(FeatureFlags) -> + %% We only consider required feature flags (for which there is no + %% migration code on nodes where it is required). Remotely enabled + %% feature flags will be handled by the regular sync. + RequiredFeatureFlags = + maps:filter( + fun(_, FeatureProps) -> + required =:= get_stability(FeatureProps) + end, FeatureFlags), + FeatureNames = lists:sort(maps:keys(RequiredFeatureFlags)), + rabbit_log_feature_flags:error( + "Feature flags: list of feature flags to automatically mark as " + "enabled on this virgin node: ~0p", + [FeatureNames]), + + %% We mark them as enabled directly (no migration code is executed) + %% because this node is virgin (i.e. no files on disk) and will + %% inherit the database from remote nodes. + Ret2 = + lists:foldl( + fun + (FeatureName, ok) -> + case is_supported_locally(FeatureName) of + true -> mark_as_enabled_locally(FeatureName, true); + false -> ok + end; + (_, Acc) -> + Acc + end, ok, FeatureNames), + + %% We continue with the regular sync process. It will handle other + %% enabled feature flags and unsupported ones too. + case Ret2 of + ok -> sync_feature_flags_with_cluster(Nodes, false, Timeout); + _ -> Ret2 + end; + {badrpc, _} = Reason -> + rabbit_log_feature_flags:error( + "Feature flags: failed to query remotely enabled feature flags " + "on node ~s: ~p", + [Node, Reason]), + {error, Reason} + end; +sync_cluster_v1(Nodes, _NodeIsVirgin, Timeout) -> case sync_feature_flags_v2_first(Nodes, Timeout) of true -> ?LOG_DEBUG( diff --git a/deps/rabbit/src/rabbit_ff_controller.erl b/deps/rabbit/src/rabbit_ff_controller.erl index c315dd34c6b8..a4a796e7edb9 100644 --- a/deps/rabbit/src/rabbit_ff_controller.erl +++ b/deps/rabbit/src/rabbit_ff_controller.erl @@ -577,7 +577,7 @@ enable_with_registry_locked( [FeatureName], #{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}), - case update_feature_state_and_enable(Inventory, FeatureName) of + case check_required_and_enable(Inventory, FeatureName) of {ok, _Inventory} = Ok -> ?LOG_NOTICE( "Feature flags: `~ts` enabled", @@ -593,6 +593,71 @@ enable_with_registry_locked( end end. +-spec check_required_and_enable(Inventory, FeatureName) -> Ret when + Inventory :: rabbit_feature_flags:cluster_inventory(), + FeatureName :: rabbit_feature_flags:feature_name(), + Ret :: {ok, Inventory} | {error, Reason}, + Reason :: term(). + +check_required_and_enable( + #{feature_flags := FeatureFlags, + states_per_node := _} = Inventory, + FeatureName) -> + %% Required feature flags vs. virgin nodes. + FeatureProps = maps:get(FeatureName, FeatureFlags), + Stability = rabbit_feature_flags:get_stability(FeatureProps), + NodesWhereDisabled = list_nodes_where_feature_flag_is_disabled( + Inventory, FeatureName), + + MarkDirectly = case Stability of + required -> + ?LOG_DEBUG( + "Feature flags: `~s`: the feature flag is " + "required on some nodes; list virgin nodes " + "to determine if the feature flag can simply " + "be marked as enabled", + [FeatureName], + #{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}), + VirginNodesWhereDisabled = + lists:filter( + fun(Node) -> + case is_virgin_node(Node) of + IsVirgin when is_boolean(IsVirgin) -> + IsVirgin; + undefined -> + false + end + end, NodesWhereDisabled), + VirginNodesWhereDisabled =:= NodesWhereDisabled; + _ -> + false + end, + + case MarkDirectly of + false -> + case Stability of + required -> + ?LOG_DEBUG( + "Feature flags: `~s`: some nodes where the feature " + "flag is disabled are not virgin, we need to perform " + "a regular sync", + [FeatureName], + #{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}); + _ -> + ok + end, + update_feature_state_and_enable(Inventory, FeatureName); + true -> + ?LOG_DEBUG( + "Feature flags: `~s`: all nodes where the feature flag is " + "disabled are virgin, we can directly mark it as enabled " + "there", + [FeatureName], + #{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}), + mark_as_enabled_on_nodes( + NodesWhereDisabled, Inventory, FeatureName, true) + end. + -spec update_feature_state_and_enable(Inventory, FeatureName) -> Ret when Inventory :: rabbit_feature_flags:cluster_inventory(), FeatureName :: rabbit_feature_flags:feature_name(), @@ -646,6 +711,14 @@ update_feature_state_and_enable( Error end. +is_virgin_node(Node) -> + case rpc_call(Node, rabbit_mnesia, is_virgin_node, [], ?TIMEOUT) of + IsVirgin when is_boolean(IsVirgin) -> + IsVirgin; + {error, _} -> + undefined + end. + -spec do_enable(Inventory, FeatureName, Nodes) -> Ret when Inventory :: rabbit_feature_flags:cluster_inventory(), FeatureName :: rabbit_feature_flags:feature_name(), @@ -740,7 +813,7 @@ collect_inventory_on_nodes(Nodes, Timeout) -> {ok, #{feature_flags := FeatureFlags, applications_per_node := ScannedAppsPerNode, states_per_node := StatesPerNode} = Inventory}) -> - FeatureFlags2 = maps:merge(FeatureFlags, FeatureFlags1), + FeatureFlags2 = merge_feature_flags(FeatureFlags, FeatureFlags1), ScannedAppsPerNode1 = ScannedAppsPerNode#{Node => ScannedApps}, StatesPerNode1 = StatesPerNode#{Node => FeatureStates}, Inventory1 = Inventory#{ @@ -756,6 +829,32 @@ collect_inventory_on_nodes(Nodes, Timeout) -> Error end, {ok, Inventory0}, Rets). +merge_feature_flags(FeatureFlagsA, FeatureFlagsB) -> + FeatureFlags = maps:merge(FeatureFlagsA, FeatureFlagsB), + maps:map( + fun(FeatureName, FeatureProps) -> + FeaturePropsA = maps:get(FeatureName, FeatureFlagsA, #{}), + FeaturePropsB = maps:get(FeatureName, FeatureFlagsB, #{}), + + %% There is a rank between stability levels. If a feature flag + %% is required somewhere, it is required globally. Otherwise if + %% it is stable somewhere, it is stable globally. + StabilityA = rabbit_feature_flags:get_stability(FeaturePropsA), + StabilityB = rabbit_feature_flags:get_stability(FeaturePropsB), + Stability = case {StabilityA, StabilityB} of + {required, _} -> required; + {_, required} -> required; + {stable, _} -> stable; + {_, stable} -> stable; + _ -> experimental + end, + + FeatureProps1 = FeatureProps#{stability => Stability}, + FeatureProps2 = maps:remove(migration_fun, FeatureProps1), + FeatureProps3 = maps:remove(callbacks, FeatureProps2), + FeatureProps3 + end, FeatureFlags). + -spec list_feature_flags_enabled_somewhere(Inventory, HandleStateChanging) -> Ret when Inventory :: rabbit_feature_flags:cluster_inventory(), diff --git a/deps/rabbit/src/rabbit_mnesia.erl b/deps/rabbit/src/rabbit_mnesia.erl index ced7fa14b2ab..f90d8f09edaa 100644 --- a/deps/rabbit/src/rabbit_mnesia.erl +++ b/deps/rabbit/src/rabbit_mnesia.erl @@ -25,6 +25,7 @@ is_registered_process_alive/1, cluster_nodes/1, node_type/0, + is_virgin_node/0, dir/0, cluster_status_from_mnesia/0, diff --git a/deps/rabbit/test/feature_flags_v2_SUITE.erl b/deps/rabbit/test/feature_flags_v2_SUITE.erl index 66e459eb4d2e..e72d9a465fb5 100644 --- a/deps/rabbit/test/feature_flags_v2_SUITE.erl +++ b/deps/rabbit/test/feature_flags_v2_SUITE.erl @@ -39,7 +39,9 @@ enable_feature_flag_in_cluster_and_add_member_concurrently_mfv2/1, enable_feature_flag_in_cluster_and_remove_member_concurrently_mfv1/1, enable_feature_flag_in_cluster_and_remove_member_concurrently_mfv2/1, - enable_feature_flag_with_post_enable/1 + enable_feature_flag_with_post_enable/1, + have_required_feature_flag_in_cluster_and_add_member_with_it_disabled/1, + have_required_feature_flag_in_cluster_and_add_member_without_it/1 ]). suite() -> @@ -70,7 +72,9 @@ groups() -> enable_feature_flag_in_cluster_and_add_member_concurrently_mfv2, enable_feature_flag_in_cluster_and_remove_member_concurrently_mfv1, enable_feature_flag_in_cluster_and_remove_member_concurrently_mfv2, - enable_feature_flag_with_post_enable + enable_feature_flag_with_post_enable, + have_required_feature_flag_in_cluster_and_add_member_with_it_disabled, + have_required_feature_flag_in_cluster_and_add_member_without_it ]} ], [ @@ -145,7 +149,7 @@ start_slave_node(Parent, Config, Testcase, N) -> TestCodePath = filename:dirname(code:which(?MODULE)), true = rpc:call(Node, code, add_path, [TestCodePath]), - ok = run_on_node(Node, fun setup_slave_node/1, [Config]), + ok = run_on_node(Node, fun setup_slave_node/2, [Config, Testcase]), ct:pal("- Slave node `~ts` configured", [Node]), Parent ! {node, self(), Node}. @@ -177,24 +181,53 @@ run_on_node(Node, Fun, Args) -> %% Slave node configuration. %% ------------------------------------------------------------------- -setup_slave_node(Config) -> +setup_slave_node(Config, Testcase) -> ok = setup_logger(), + ok = setup_data_dir(Config), ok = setup_feature_flags_file(Config), ok = start_controller(), - ok = maybe_enable_feature_flags_v2(Config), + case Testcase of + have_required_feature_flag_in_cluster_and_add_member_with_it_disabled + -> + %% This testcase handles the `feature_flags_v2' state itself. + ok; + have_required_feature_flag_in_cluster_and_add_member_without_it + -> + %% This testcase handles the `feature_flags_v2' state itself. + ok; + _ -> + ok = maybe_enable_feature_flags_v2(Config) + end, ok. setup_logger() -> logger:set_primary_config(level, debug), ok. +setup_data_dir(Config) -> + %% The data directory is set to a specific location in the test log + %% directory. + PrivDir = ?config(priv_dir, Config), + DataDir = filename:join( + PrivDir, + rabbit_misc:format("data-~ts", [node()])), + ?LOG_INFO("Setting `data_dir` to \"~ts\"", [DataDir]), + case application:load(rabbit) of + ok -> ok; + {error, {already_loaded, _}} -> ok + end, + ok = application:set_env(rabbit, data_dir, DataDir), + ok = application:set_env(mnesia, dir, DataDir), + ok. + setup_feature_flags_file(Config) -> %% The `feature_flags_file' is set to a specific location in the test log %% directory. + PrivDir = ?config(priv_dir, Config), FeatureFlagsFile = filename:join( - ?config(priv_dir, Config), + PrivDir, rabbit_misc:format("feature_flags-~ts", [node()])), - ?LOG_INFO("Setting `feature_flags_file to \"~ts\"", [FeatureFlagsFile]), + ?LOG_INFO("Setting `feature_flags_file` to \"~ts\"", [FeatureFlagsFile]), case application:load(rabbit) of ok -> ok; {error, {already_loaded, _}} -> ok @@ -1534,3 +1567,209 @@ enable_feature_flag_with_post_enable(Config) -> []) || Node <- AllNodes], ok. + +have_required_feature_flag_in_cluster_and_add_member_with_it_disabled( + Config) -> + AllNodes = [NewNode | [FirstNode | _] = Nodes] = ?config(nodes, Config), + connect_nodes(Nodes), + override_running_nodes([NewNode]), + override_running_nodes(Nodes), + + FeatureName = ?FUNCTION_NAME, + FeatureFlags = #{FeatureName => + #{provided_by => ?MODULE, + stability => stable}}, + RequiredFeatureFlags = #{FeatureName => + #{provided_by => ?MODULE, + stability => required}}, + inject_on_nodes([NewNode], FeatureFlags), + inject_on_nodes(Nodes, RequiredFeatureFlags), + + ct:pal( + "Checking the feature flag is supported everywhere but enabled on the " + "existing cluster only"), + ok = run_on_node( + NewNode, + fun() -> + ?assert(rabbit_feature_flags:is_supported(FeatureName)), + ?assertNot(rabbit_feature_flags:is_enabled(FeatureName)), + + ok = maybe_enable_feature_flags_v2(Config), + ok + end, + []), + _ = [ok = + run_on_node( + Node, + fun() -> + ?assert(rabbit_feature_flags:is_supported(FeatureName)), + ?assert(rabbit_feature_flags:is_enabled(FeatureName)), + + %% We enable `feature_flags_v2' on the cluster, regardless + %% of the common_test group. The reason is that we want to + %% test that a bug in the sync between a + %% feature_flags_v2-based cluster and a + %% feature_flags_v1-based joining virgin node is fixed. + %% + %% See: + %% https://github.com/rabbitmq/rabbitmq-server/pull/6791 + ok = rabbit_feature_flags:enable(feature_flags_v2), + ok + end, + []) + || Node <- Nodes], + + %% Check compatibility between NewNodes and Nodes. + ok = run_on_node( + NewNode, + fun() -> + ?assertEqual( + ok, + rabbit_feature_flags:check_node_compatibility( + FirstNode)), + ok + end, []), + + %% Add node to cluster and synchronize feature flags. + connect_nodes(AllNodes), + override_running_nodes(AllNodes), + ct:pal( + "Synchronizing feature flags in the expanded cluster~n" + "~n" + "NOTE: Error messages about crashed migration functions can be " + "ignored for feature~n" + " flags other than `~ts`~n" + " because they assume they run inside RabbitMQ.", + [FeatureName]), + ok = run_on_node( + NewNode, + fun() -> + ?assertEqual( + ok, + rabbit_feature_flags:sync_feature_flags_with_cluster( + Nodes, true)), + ok + end, []), + + ct:pal("Checking the feature flag is enabled in the expanded cluster"), + _ = [ok = + run_on_node( + Node, + fun() -> + ?assert(rabbit_feature_flags:is_enabled(FeatureName)), + ok + end, + []) + || Node <- AllNodes], + ok. + +have_required_feature_flag_in_cluster_and_add_member_without_it( + Config) -> + AllNodes = [NewNode | [FirstNode | _] = Nodes] = ?config(nodes, Config), + connect_nodes(Nodes), + override_running_nodes([NewNode]), + override_running_nodes(Nodes), + + FeatureName = ?FUNCTION_NAME, + FeatureFlags = #{FeatureName => + #{provided_by => ?MODULE, + stability => stable}}, + RequiredFeatureFlags = #{FeatureName => + #{provided_by => ?MODULE, + stability => required}}, + inject_on_nodes([NewNode], FeatureFlags), + inject_on_nodes(Nodes, RequiredFeatureFlags), + + ct:pal( + "Checking the feature flag is supported and enabled on existing the " + "cluster only"), + ok = run_on_node( + NewNode, + fun() -> + ?assert(rabbit_feature_flags:is_supported(FeatureName)), + ?assertNot(rabbit_feature_flags:is_enabled(FeatureName)), + + ok = maybe_enable_feature_flags_v2(Config), + + MnesiaDir = rabbit_mnesia:dir(), + ok = filelib:ensure_path(MnesiaDir), + SomeFile = filename:join(MnesiaDir, "some-mnesia-file.db"), + ok = file:write_file(SomeFile, <<>>), + ?assertNot(rabbit_mnesia:is_virgin_node()), + ok + end, + []), + _ = [ok = + run_on_node( + Node, + fun() -> + ?assert(rabbit_feature_flags:is_supported(FeatureName)), + ?assert(rabbit_feature_flags:is_enabled(FeatureName)), + + %% We enable `feature_flags_v2' on the cluster, regardless + %% of the common_test group. The reason is that we want to + %% test that a bug in the sync between a + %% feature_flags_v2-based cluster and a + %% feature_flags_v1-based joining virgin node is fixed. + %% + %% See: + %% https://github.com/rabbitmq/rabbitmq-server/pull/6791 + ok = rabbit_feature_flags:enable(feature_flags_v2), + ok + end, + []) + || Node <- Nodes], + + %% Check compatibility between NewNodes and Nodes. + ok = run_on_node( + NewNode, + fun() -> + ?assertEqual( + ok, + rabbit_feature_flags:check_node_compatibility( + FirstNode)), + ok + end, []), + + %% Add node to cluster and synchronize feature flags. + connect_nodes(AllNodes), + override_running_nodes(AllNodes), + ct:pal( + "Synchronizing feature flags in the expanded cluster~n" + "~n" + "NOTE: Error messages about crashed migration functions can be " + "ignored for feature~n" + " flags other than `~ts`~n" + " because they assume they run inside RabbitMQ.", + [FeatureName]), + ok = run_on_node( + NewNode, + fun() -> + ?assertMatch( + {error, + {badrpc, + {'EXIT', + {{assertNotEqual, + [{module, rabbit_ff_registry_factory}, + {line, _}, + {expression, "State"}, + {value, state_changing}]}, + _}}}}, + rabbit_feature_flags:sync_feature_flags_with_cluster( + Nodes, false)), + ok + end, []), + + ct:pal("Checking the feature flag state is unchanged"), + _ = [ok = + run_on_node( + Node, + fun() -> + ?assertEqual( + Node =/= NewNode, + rabbit_feature_flags:is_enabled(FeatureName)), + ok + end, + []) + || Node <- AllNodes], + ok.