Commit 9c6979a

Yi Wang authored and facebook-github-bot committed
[Gradient Compression] Error feedback for PowerSGD (still need to fix the key in error_dict) (#48670)
Summary: Pull Request resolved: #48670

Support an optional error feedback for PowerSGD: store the difference (i.e., the local error caused by compression) between the input gradient (adjusted by the existing error) and the gradient after decompression, and reinsert it at the next iteration.

An index field still needs to be added to GradBucket to serve as the key of error_dict. The current key, the input tensor of the bucket, can change across steps, because the buckets may be rebuilt in the forward pass to save peak memory usage. This commit therefore implements only half of error feedback; the new index field is planned for a separate PR.

Original PR issue: Investigate Applying PowerSGD to Communication Hook for Gradient Compression #47202

ghstack-source-id: 117636492

Test Plan: buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_powerSGD_ddp_comm_hook_nccl

Reviewed By: rohan-varma

Differential Revision: D25240290

fbshipit-source-id: 5b6e11e711caccfb8984ac2767dd107dbf4c9b3b
1 parent 463e5d2 commit 9c6979a
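
To make the mechanism concrete, here is a minimal, single-process sketch of the error-feedback idea described in the summary. The compress/decompress pair and the bucket key are hypothetical placeholders, not the hook's actual API, and the allreduce step is omitted.

import torch

# Hypothetical sketch of error feedback (not the hook implementation):
# the local error left over from compression is stored per bucket and
# folded back into the gradient at the next iteration.
error_dict = {}  # bucket key -> accumulated local compression error

def allreduce_with_error_feedback(key, grad, compress, decompress):
    # Reinsert the error memorized at the previous iteration (zero at first use).
    adjusted = grad + error_dict.get(key, torch.zeros_like(grad))
    # Compress and decompress (a real hook would allreduce the compressed factors).
    approx = decompress(compress(adjusted))
    # The local error is whatever the lossy round trip failed to preserve.
    error_dict[key] = adjusted - approx
    return approx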

File tree

2 files changed: +51 −4 lines changed


torch/distributed/algorithms/ddp_comm_hooks/__init__.py

Lines changed: 7 additions & 1 deletion
@@ -16,7 +16,12 @@ def _ddp_comm_hook_wrapper(comm_hook, model, state):
 
 
 def _powerSGD_comm_hook_wrapper(
-    comm_hook, model, state, matrix_approximation_rank, random_seed=0
+    comm_hook,
+    model,
+    state,
+    matrix_approximation_rank,
+    use_error_feedback=True,
+    random_seed=0,
 ):
     """
     To be consistent with the wrappers of other DDP comm hooks, the input state only needs to be a process group,
@@ -25,6 +30,7 @@ def _powerSGD_comm_hook_wrapper(
     powerSGD_state = powerSGD.PowerSGDState(
         process_group=state,
         matrix_approximation_rank=matrix_approximation_rank,
+        use_error_feedback=use_error_feedback,
         random_seed=random_seed,
     )
     model.register_comm_hook(powerSGD_state, comm_hook)
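
For context, a hedged usage sketch of how a DDP model might register this hook with the new flag. It assumes the process group is already initialized and that model and rank are defined by the training script; passing process_group=None to fall back to the default group is an assumption, not something shown in this diff.

from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed.algorithms.ddp_comm_hooks import powerSGD_hook as powerSGD

# `model` and `rank` are assumed to be defined by the surrounding training script.
ddp_model = DDP(model, device_ids=[rank])
state = powerSGD.PowerSGDState(
    process_group=None,           # assumption: None falls back to the default group
    matrix_approximation_rank=1,
    use_error_feedback=True,      # the flag added by this commit
    random_seed=0,
)
ddp_model.register_comm_hook(state, powerSGD.powerSGD_hook)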

torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py

Lines changed: 44 additions & 3 deletions
@@ -30,17 +30,44 @@ def _orthogonalize(matrix, epsilon=1e-8):
 
 
 class PowerSGDState(object):
-    __slots__ = ["process_group", "matrix_approximation_rank", "rng"]
-
-    def __init__(self, process_group, matrix_approximation_rank=1, random_seed=0):
+    __slots__ = [
+        "process_group",
+        "matrix_approximation_rank",
+        "use_error_feedback",
+        "rng",
+        "error_dict",
+    ]
+
+    def __init__(
+        self,
+        process_group,
+        matrix_approximation_rank=1,
+        use_error_feedback=True,
+        random_seed=0,
+    ):
         self.process_group = process_group
         self.matrix_approximation_rank = matrix_approximation_rank
+        # Error feedback is usually crucial for both convergence and generalization,
+        # because PowerSGD is a biased compressor,
+        # i.e., compressing and decompressing a random gradient does not yield the original in expectation.
+        # This mechanism requires a temporary copy of the input gradients,
+        # so it increases the peak memory consumption by the size of the gradient tensor.
+        # However, if the target matrices are known to be exactly low-rank (instead of just low stable rank),
+        # sometimes it is possible to converge to the optima without error feedback.
+        # See: http://proceedings.mlr.press/v54/yurtsever17a/yurtsever17a.pdf
+        self.use_error_feedback = use_error_feedback
         # The purpose of this RNG is to generate different random seeds for initializing Q across iterations,
         # but in the same order for all the DDP replicas.
         # Different random seeds across iterations indicate different 'projections' of the gradients at different SGD steps.
         # If the same random projection is used,
         # there will be differences between the gradients that are never synchronized.
         self.rng = np.random.RandomState(random_seed)
+        # Since there is only a single state instance for all the input buckets,
+        # need to maintain a dictionary that maps each bucket to the local error.
+        # TODO(wayi): Currently the key is the (hashcode of) input tensor, which may change across steps,
+        # since the bucket can be rebuilt in the forward pass (to save peak memory usage).
+        # Need to add an index field to the input bucket of comm hook.
+        self.error_dict = {}
 
 
 def powerSGD_hook(
@@ -98,6 +125,17 @@ def powerSGD_hook(
     padded_total_length = square_side_length ** 2
     input_tensor.resize_(padded_total_length)
     input_tensor[total_length:padded_total_length].fill_(0)
+
+    # Incorporate the error from the previous state into the gradients.
+    if state.use_error_feedback:
+        if input_tensor in state.error_dict:
+            input_tensor.add_(state.error_dict[input_tensor])
+        else:
+            state.error_dict[input_tensor] = torch.zeros(padded_total_length, device=device)
+        # Keep a copy of the input tensor,
+        # so that we can compute the local error caused by compression later,
+        # by comparing this copy and the input tensor updated after decompression.
+        input_tensor_cp = torch.clone(input_tensor).detach()
     matrix = input_tensor.view(square_side_length, square_side_length)
 
     def create_low_rank_tensor(fill_random_values, rng):
@@ -141,6 +179,9 @@ def decompress(fut):
         q = fut.value()[0].div_(world_size)
         torch.matmul(p, q.t(), out=matrix)
 
+        if state.use_error_feedback:
+            # Memorize the local errors.
+            state.error_dict[input_tensor] = input_tensor_cp - input_tensor
         ret = input_tensor.resize_(total_length)
         return [ret]
 
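
For intuition, the per-bucket arithmetic added above can be mimicked on a single process. This is a hedged sketch under stated assumptions: the allreduce steps are skipped, torch.linalg.qr stands in for the file's _orthogonalize helper, and the function name and signature are made up for illustration.

import math
import torch

def local_powersgd_step(grad, error=None, rank=1):
    # Pad the flattened gradient so it can be viewed as a square matrix,
    # mirroring the resize_/fill_(0) logic in powerSGD_hook.
    total_length = grad.numel()
    side = int(math.ceil(math.sqrt(total_length)))
    padded = torch.zeros(side * side, device=grad.device, dtype=grad.dtype)
    padded[:total_length] = grad.flatten()
    # Error feedback: fold the previous local error back in, then keep a copy
    # so the new error can be measured after decompression.
    if error is not None:
        padded += error
    padded_cp = padded.clone().detach()
    matrix = padded.view(side, side)
    # One power-iteration step to build the low-rank factors P and Q
    # (torch.linalg.qr is a stand-in for the hook's _orthogonalize helper).
    q = torch.randn(side, rank, device=grad.device, dtype=grad.dtype)
    p, _ = torch.linalg.qr(matrix @ q)
    q = matrix.t() @ p
    # Decompress in place (this writes through the view into `padded`)
    # and memorize the local error caused by compression.
    torch.matmul(p, q.t(), out=matrix)
    new_error = padded_cp - padded
    return padded[:total_length].view_as(grad), new_error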
