Group queue deletions on_node_down into 10 operations per transaction

gerhard · gerhard · commit 4927aeb648be · 2018-03-27T16:07:35.000+01:00
When many queues are being deleted, we believe that it's faster to have fewer Mnesia transactions, so we group 10 queue deletions into a single Mnesia transaction. This number (10) is arbitrary; we didn't try a different number. Creating 1 Mnesia transaction for every queue deletion feels like too many transactions, and having a single Mnesia transaction for all queue deletions is too few. Groups of 10 felt like a sensible middle ground. We cannot determine whether this is a good change because rabbit_core_metrics:queue_deleted/1 takes the most time and obscures all other observations. According to qcachegrind, rabbit_misc:execute_mnesia_transaction/1 takes 1.8s while rabbit_core_metrics:queue_deleted/1 takes 132s, of which ets:select/2 takes 131s. How can we optimise rabbit_core_metrics:queue_deleted/1? We are thinking that rather than calling ets:select/2 twice for every queue, we should call it twice for all queues that need to be deleted. We don't know whether this is possible. Alternatively, we might look into ets:first/1 & ets:next/2 to iterate over the entire table ONCE with all the queues that have been deleted. Thoughts @dcorbacho @michaelklishin? For initial context, see #1513. Partner-in-crime: @essen
diff --git a/src/rabbit_amqqueue.erl b/src/rabbit_amqqueue.erl
@@ -1114,28 +1114,20 @@ maybe_clear_recoverable_node(Node,
     end.
 
 on_node_down(Node) ->
-    % Create 1 transaction per N queues that need to be deleted
-    %   * 1 transaction for all queues might block everything for a really long time
-    %   * ^^^ this is what used to happen before this change
-    %   * 1 transaction per queue will result in too many transaction
-    %   * ^^^ this is what happens now; it's not perfect, but it's a step in the right direction
-    %   * ^^^ OPTIMISE THIS BEFORE MERGING ^^^
-    %   * Maybe 1 transaction for every 10 queues that need to be deleted ?
-    %
     % For each transaction:
     %   * delete all queues in the transaction
     %   * capture the result for every delete queue
     [
         rabbit_misc:execute_mnesia_tx_with_tail(
-          fun () -> Dels = [delete_queue(Q)],
-                    T = rabbit_binding:process_deletions(
+          fun () -> QueueDeletions = [delete_queue(Queue) || Queue <- Queues],
+                    NotifyBindingDeletions = rabbit_binding:process_deletions(
                           lists:foldl(fun rabbit_binding:combine_deletions/2,
-                                      rabbit_binding:new_deletions(), Dels),
+                                      rabbit_binding:new_deletions(), QueueDeletions),
                           ?INTERNAL_USER),
                     fun () ->
-                            T(),
+                            NotifyBindingDeletions(),
                             lists:foreach(
-                              fun(QName) ->
+                              fun(Queue) ->
                                       % When 40k queues are being deleted,
                                       % this results in a rabbit_node_monitor function that recurses for 30 minutes,
                                       % meaning that no information is available for the node (Management Overview doesn't update):
@@ -1154,14 +1146,20 @@ on_node_down(Node) ->
                                       %      [{file,"src/rabbit_node_monitor.erl"},{line,755}]},
                                       %  {rabbit_node_monitor,handle_info,2,
                                       %      [{file,"src/rabbit_node_monitor.erl"},{line,548}]}]}
-                                      rabbit_core_metrics:queue_deleted(QName),
+                                      rabbit_core_metrics:queue_deleted(Queue),
                                       ok = rabbit_event:notify(queue_deleted,
-                                                               [{name, QName},
+                                                               [{name, Queue},
                                                                 {user, ?INTERNAL_USER}])
-                              end, [Q])
+                              end, Queues)
                     end
-          end) || Q <- queues_to_delete_from_node_down(Node)
-    ].
+          end) || Queues <- partition_queues(queues_to_delete_from_node_down(Node))
+    ],
+    ok.
+
+%% Split a list of queues into chunks of at most 10 elements, preserving
+%% order, so that on_node_down/1 can delete each chunk in its own Mnesia
+%% transaction. Returns a list of lists: every chunk except possibly the
+%% last has exactly 10 elements. An empty input yields no chunks at all,
+%% so no empty (no-op) transaction is run when nothing is left to delete.
+partition_queues([]) ->
+    [];
+partition_queues([Q0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9 | T]) ->
+    [[Q0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9] | partition_queues(T)];
+partition_queues(T) ->
+    %% Between 1 and 9 queues remain: they form the final, shorter chunk.
+    [T].
 
 delete_queue(QueueName) ->
     ok = mnesia:delete({rabbit_queue, QueueName}),