Commit fa719e8

[syncBN] Replacing new_group with torch.distributed.group.WORLD avoids creating a new process group in every iteration. This should resolve the issue "Training gets stuck when using SyncBN" (pytorch#105).

1 parent 241dd6c commit fa719e8
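
Why the change matters: torch.distributed.new_group() is itself a collective call that every rank must enter, so calling it inside forward() on every iteration is expensive and, as the linked issue reports, can hang training. torch.distributed.group.WORLD is just a handle to the default group created by init_process_group, so reusing it is free. A minimal sketch of the fallback pattern this commit introduces (the helper name resolve_group is hypothetical, not apex code):

    import torch.distributed as dist

    def resolve_group(process_group=None):
        # Fall back to the default (WORLD) group instead of creating a fresh
        # group on every call; new_group() would have to be entered by all ranks.
        if not process_group:
            process_group = dist.group.WORLD
        world_size = dist.get_world_size(process_group)
        return process_group, world_size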

File tree

4 files changed: +9 -22 lines changed

apex/parallel/__init__.py

Lines changed: 0 additions & 9 deletions
@@ -1,14 +1,5 @@
 import torch
 
-# Backward compatibility hack around
-# https://github.com/pytorch/pytorch/pull/14767
-if hasattr(torch.distributed, 'get_default_group'):
-    group_creator = torch.distributed.get_default_group
-elif hasattr(torch.distributed, 'new_group'):
-    group_creator = torch.distributed.new_group
-else:
-    group_creator = torch.distributed.deprecated.new_group
-
 if hasattr(torch.distributed, 'ReduceOp'):
     ReduceOp = torch.distributed.ReduceOp
 elif hasattr(torch.distributed, 'reduce_op'):

apex/parallel/optimized_sync_batchnorm_kernel.py

Lines changed: 4 additions & 6 deletions
@@ -2,7 +2,7 @@
 from torch.autograd.function import Function
 
 import syncbn
-from apex.parallel import group_creator, ReduceOp
+from apex.parallel import ReduceOp
 
 class SyncBatchnormFunction(Function):
 
@@ -16,11 +16,9 @@ def forward(ctx, input, weight, bias, running_mean, running_variance, eps, track
         mean, var, var_biased = syncbn.welford_mean_var(input)
 
         if torch.distributed.is_initialized():
-            if process_group:
-                world_size = torch.distributed.get_world_size(process_group)
-            else:
-                process_group = group_creator()
-                world_size = torch.distributed.get_world_size()
+            if not process_group:
+                process_group = torch.distributed.group.WORLD
+            world_size = torch.distributed.get_world_size(process_group)
             mean_all = torch.empty(world_size, mean.size(0), dtype=mean.dtype, device=mean.device)
             var_all = torch.empty(world_size, var.size(0), dtype=var.dtype, device=var.device)
             mean_l = [mean_all.narrow(0, i, 1) for i in range(world_size)]
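
For context, the preallocated buffer and narrow() views set up at the end of this hunk are the per-rank slots that the local statistics are gathered into; the actual all_gather call lies outside the shown context. A rough sketch of that pattern, assuming torch.distributed.all_gather is used on the resolved group (gather_mean below is a hypothetical helper, not apex code):

    import torch
    import torch.distributed as dist

    def gather_mean(mean, process_group=None):
        # Resolve the group the same way the patched forward() does.
        if not process_group:
            process_group = dist.group.WORLD
        world_size = dist.get_world_size(process_group)
        # One (1, C) slot per rank, all viewing a single (world_size, C) buffer.
        mean_all = torch.empty(world_size, mean.size(0), dtype=mean.dtype, device=mean.device)
        mean_l = [mean_all.narrow(0, i, 1) for i in range(world_size)]
        dist.all_gather(mean_l, mean.unsqueeze(0), group=process_group)
        return mean_all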

apex/parallel/sync_batchnorm.py

Lines changed: 4 additions & 6 deletions
@@ -3,7 +3,7 @@
 from torch.nn import functional as F
 
 from .sync_batchnorm_kernel import SyncBatchnormFunction
-from apex.parallel import group_creator, ReduceOp
+from apex.parallel import ReduceOp
 
 
 class SyncBatchNorm(_BatchNorm):
 
@@ -63,11 +63,9 @@ def forward(self, input):
         else:
             process_group = self.process_group
             world_size = 0
-            if self.process_group:
-                world_size = torch.distributed.get_world_size(process_group)
-            else:
-                process_group = group_creator()
-                world_size = torch.distributed.get_world_size()
+            if not self.process_group:
+                process_group = torch.distributed.group.WORLD
+            world_size = torch.distributed.get_world_size(process_group)
             self.num_batches_tracked += 1
             with torch.no_grad():
                 channel_first_input = input.transpose(0, 1).contiguous()
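
From the user's side nothing changes: constructing the layer without a process_group now falls back to the default WORLD group instead of creating a new group on each forward pass. A usage sketch, assuming the usual one-process-per-GPU NCCL setup and that apex.parallel exports SyncBatchNorm:

    import torch
    import torch.distributed as dist
    from apex.parallel import SyncBatchNorm

    dist.init_process_group(backend="nccl")   # creates the default (WORLD) group
    bn = SyncBatchNorm(64).cuda()             # process_group=None -> torch.distributed.group.WORLD
    out = bn(torch.randn(8, 64, 32, 32, device="cuda"))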

apex/parallel/sync_batchnorm_kernel.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 import torch
 from torch.autograd.function import Function
 
-from apex.parallel import group_creator, ReduceOp
+from apex.parallel import ReduceOp
 
 
 class SyncBatchnormFunction(Function):
