Skip to content

Commit 973d510

Browse files
ssnl authored and facebook-github-bot committed
Add device-specific cuFFT plan caches (pytorch#19300)
Summary: Fixes pytorch#19224 Pull Request resolved: pytorch#19300 Differential Revision: D14986967 Pulled By: soumith fbshipit-source-id: 8c31237db50d6924bba1472434c10326610d9255
1 parent b8fb6ea commit 973d510

File tree

11 files changed

+238
-92
lines changed

11 files changed

+238
-92
lines changed

aten/src/ATen/cuda/detail/CUDAHooks.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -196,33 +196,33 @@ double CUDAHooks::batchnormMinEpsilonCuDNN() const {
196196
#endif
197197
}
198198

199-
int64_t CUDAHooks::cuFFTGetPlanCacheMaxSize() const {
199+
int64_t CUDAHooks::cuFFTGetPlanCacheMaxSize(int64_t device_index) const {
200200
#ifndef __HIP_PLATFORM_HCC__
201-
return at::native::detail::cufft_get_plan_cache_max_size_impl();
201+
return at::native::detail::cufft_get_plan_cache_max_size_impl(device_index);
202202
#else
203203
AT_ERROR("cuFFT with HIP is not supported");
204204
#endif
205205
}
206206

207-
void CUDAHooks::cuFFTSetPlanCacheMaxSize(int64_t max_size) const {
207+
void CUDAHooks::cuFFTSetPlanCacheMaxSize(int64_t device_index, int64_t max_size) const {
208208
#ifndef __HIP_PLATFORM_HCC__
209-
at::native::detail::cufft_set_plan_cache_max_size_impl(max_size);
209+
at::native::detail::cufft_set_plan_cache_max_size_impl(device_index, max_size);
210210
#else
211211
AT_ERROR("cuFFT with HIP is not supported");
212212
#endif
213213
}
214214

215-
int64_t CUDAHooks::cuFFTGetPlanCacheSize() const {
215+
int64_t CUDAHooks::cuFFTGetPlanCacheSize(int64_t device_index) const {
216216
#ifndef __HIP_PLATFORM_HCC__
217-
return at::native::detail::cufft_get_plan_cache_size_impl();
217+
return at::native::detail::cufft_get_plan_cache_size_impl(device_index);
218218
#else
219219
AT_ERROR("cuFFT with HIP is not supported");
220220
#endif
221221
}
222222

223-
void CUDAHooks::cuFFTClearPlanCache() const {
223+
void CUDAHooks::cuFFTClearPlanCache(int64_t device_index) const {
224224
#ifndef __HIP_PLATFORM_HCC__
225-
at::native::detail::cufft_clear_plan_cache_impl();
225+
at::native::detail::cufft_clear_plan_cache_impl(device_index);
226226
#else
227227
AT_ERROR("cuFFT with HIP is not supported");
228228
#endif

aten/src/ATen/cuda/detail/CUDAHooks.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,10 @@ struct CUDAHooks : public at::CUDAHooksInterface {
2424
long versionCuDNN() const override;
2525
std::string showConfig() const override;
2626
double batchnormMinEpsilonCuDNN() const override;
27-
int64_t cuFFTGetPlanCacheMaxSize() const override;
28-
void cuFFTSetPlanCacheMaxSize(int64_t max_size) const override;
29-
int64_t cuFFTGetPlanCacheSize() const override;
30-
void cuFFTClearPlanCache() const override;
27+
int64_t cuFFTGetPlanCacheMaxSize(int64_t device_index) const override;
28+
void cuFFTSetPlanCacheMaxSize(int64_t device_index, int64_t max_size) const override;
29+
int64_t cuFFTGetPlanCacheSize(int64_t device_index) const override;
30+
void cuFFTClearPlanCache(int64_t device_index) const override;
3131
int getNumGPUs() const override;
3232
};
3333

aten/src/ATen/detail/CUDAHooksInterface.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -111,19 +111,19 @@ struct CAFFE2_API CUDAHooksInterface {
111111
"Cannot query batchnormMinEpsilonCuDNN() without ATen_cuda library. ", CUDA_HELP);
112112
}
113113

114-
virtual int64_t cuFFTGetPlanCacheMaxSize() const {
114+
virtual int64_t cuFFTGetPlanCacheMaxSize(int64_t device_index) const {
115115
AT_ERROR("Cannot access cuFFT plan cache without ATen_cuda library. ", CUDA_HELP);
116116
}
117117

118-
virtual void cuFFTSetPlanCacheMaxSize(int64_t max_size) const {
118+
virtual void cuFFTSetPlanCacheMaxSize(int64_t device_index, int64_t max_size) const {
119119
AT_ERROR("Cannot access cuFFT plan cache without ATen_cuda library. ", CUDA_HELP);
120120
}
121121

122-
virtual int64_t cuFFTGetPlanCacheSize() const {
122+
virtual int64_t cuFFTGetPlanCacheSize(int64_t device_index) const {
123123
AT_ERROR("Cannot access cuFFT plan cache without ATen_cuda library. ", CUDA_HELP);
124124
}
125125

126-
virtual void cuFFTClearPlanCache() const {
126+
virtual void cuFFTClearPlanCache(int64_t device_index) const {
127127
AT_ERROR("Cannot access cuFFT plan cache without ATen_cuda library. ", CUDA_HELP);
128128
}
129129

aten/src/ATen/native/SpectralOps.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -130,20 +130,20 @@ static inline Tensor _fft(const Tensor &self, const int64_t signal_ndim,
130130

131131
// We call the following methods via CUDA hooks because they are really only
132132
// valid when CUDA is available. See native/cuda/CuFFTPlanCache.h for more details.
133-
int64_t _cufft_get_plan_cache_max_size() {
134-
return detail::getCUDAHooks().cuFFTGetPlanCacheMaxSize();
133+
int64_t _cufft_get_plan_cache_max_size(int64_t device_index) {
134+
return detail::getCUDAHooks().cuFFTGetPlanCacheMaxSize(device_index);
135135
}
136136

137-
void _cufft_set_plan_cache_max_size(int64_t max_size) {
138-
detail::getCUDAHooks().cuFFTSetPlanCacheMaxSize(max_size);
137+
void _cufft_set_plan_cache_max_size(int64_t device_index, int64_t max_size) {
138+
detail::getCUDAHooks().cuFFTSetPlanCacheMaxSize(device_index, max_size);
139139
}
140140

141-
int64_t _cufft_get_plan_cache_size() {
142-
return detail::getCUDAHooks().cuFFTGetPlanCacheSize();
141+
int64_t _cufft_get_plan_cache_size(int64_t device_index) {
142+
return detail::getCUDAHooks().cuFFTGetPlanCacheSize(device_index);
143143
}
144144

145-
void _cufft_clear_plan_cache() {
146-
detail::getCUDAHooks().cuFFTClearPlanCache();
145+
void _cufft_clear_plan_cache(int64_t device_index) {
146+
detail::getCUDAHooks().cuFFTClearPlanCache(device_index);
147147
}
148148

149149
Tensor fft(const Tensor& self, const int64_t signal_ndim, const bool normalized) {

aten/src/ATen/native/cuda/CuFFTPlanCache.h

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,18 @@ class CuFFTParamsLRUCache {
373373
_set_max_size(max_size);
374374
}
375375

376+
CuFFTParamsLRUCache(CuFFTParamsLRUCache&& other) noexcept :
377+
_usage_list(std::move(other._usage_list)),
378+
_cache_map(std::move(other._cache_map)),
379+
_max_size(other._max_size) {}
380+
381+
CuFFTParamsLRUCache& operator=(CuFFTParamsLRUCache&& other) noexcept {
382+
_usage_list = std::move(other._usage_list);
383+
_cache_map = std::move(other._cache_map);
384+
_max_size = other._max_size;
385+
return *this;
386+
}
387+
376388
// If key is in this cache, return the cached config. Otherwise, emplace the
377389
// config in this cache using value_args and return it.
378390
// Return const reference because CuFFTConfig shouldn't be tampered with once
@@ -431,6 +443,8 @@ class CuFFTParamsLRUCache {
431443

432444
size_t max_size() const noexcept { return _max_size; }
433445

446+
std::mutex mutex;
447+
434448
private:
435449
// Only sets size and does value check. Does not resize the data structures.
436450
void _set_max_size(int64_t new_size) {
@@ -455,9 +469,9 @@ class CuFFTParamsLRUCache {
455469
// native function counterparts (at native/SpectralOps.cpp), i.e.,
456470
// _cufft_get_plan_cache_max_size, _cufft_set_plan_cache_max_size
457471
// _cufft_get_plan_cache_size, and _cufft_clear_plan_cache.
458-
int64_t cufft_get_plan_cache_max_size_impl();
459-
void cufft_set_plan_cache_max_size_impl(int64_t max_size);
460-
int64_t cufft_get_plan_cache_size_impl();
461-
void cufft_clear_plan_cache_impl();
472+
int64_t cufft_get_plan_cache_max_size_impl(int64_t device_index);
473+
void cufft_set_plan_cache_max_size_impl(int64_t device_index, int64_t max_size);
474+
int64_t cufft_get_plan_cache_size_impl(int64_t device_index);
475+
void cufft_clear_plan_cache_impl(int64_t device_index);
462476

463477
}}} // namespace at::native::detail

aten/src/ATen/native/cuda/SpectralOps.cu

Lines changed: 50 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include <ATen/Dispatch.h>
55
#include <ATen/Utils.h>
66
#include <ATen/NativeFunctions.h>
7+
#include <ATen/detail/CUDAHooksInterface.h>
78
#include <ATen/native/SpectralOpsUtils.h>
89
#include <ATen/native/cuda/CuFFTUtils.h>
910
#include <ATen/native/cuda/CuFFTPlanCache.h>
@@ -14,6 +15,7 @@
1415
#include <thrust/unique.h>
1516
#include <cufft.h>
1617
#include <cufftXt.h>
18+
#include <vector>
1719
#include <cmath>
1820

1921
namespace at { namespace native {
@@ -260,29 +262,59 @@ static inline Tensor _run_cufft(
260262
}
261263

262264
// The cuFFT plan cache, defined in CuFFTUtils.h
263-
struct CuFFTParamsLRUCache plan_cache;
264-
std::mutex plan_cache_mutex;
265+
std::vector<optional<CuFFTParamsLRUCache>> plan_caches;
266+
std::mutex plan_caches_mutex;
267+
268+
static inline
269+
CuFFTParamsLRUCache &cufft_get_plan_cache(int64_t device_index) {
270+
std::lock_guard<std::mutex> guard(plan_caches_mutex);
271+
272+
AT_ASSERT(device_index >= 0);
273+
274+
if (device_index >= plan_caches.size()) {
275+
plan_caches.resize(device_index + 1);
276+
}
277+
278+
if (!plan_caches[device_index]) {
279+
plan_caches[device_index].emplace();
280+
}
281+
282+
return *plan_caches[device_index];
283+
}
284+
265285

266286
namespace detail {
267287

268-
int64_t cufft_get_plan_cache_max_size_impl() {
269-
std::lock_guard<std::mutex> guard(plan_cache_mutex);
270-
return plan_cache.max_size();
288+
int64_t cufft_get_plan_cache_max_size_impl(int64_t device_index) {
289+
AT_CHECK(0 <= device_index && device_index < at::detail::getCUDAHooks().getNumGPUs(),
290+
"cufft_get_plan_cache_max_size: expected 0 <= device_index < ",
291+
at::detail::getCUDAHooks().getNumGPUs(), "], but got device_index=",
292+
device_index);
293+
return cufft_get_plan_cache(device_index).max_size();
271294
}
272295

273-
void cufft_set_plan_cache_max_size_impl(int64_t max_size) {
274-
std::lock_guard<std::mutex> guard(plan_cache_mutex);
275-
plan_cache.resize(max_size);
296+
void cufft_set_plan_cache_max_size_impl(int64_t device_index, int64_t max_size) {
297+
AT_CHECK(0 <= device_index && device_index < at::detail::getCUDAHooks().getNumGPUs(),
298+
"cufft_set_plan_cache_max_size: expected 0 <= device_index < ",
299+
at::detail::getCUDAHooks().getNumGPUs(), "], but got device_index=",
300+
device_index);
301+
return cufft_get_plan_cache(device_index).resize(max_size);
276302
}
277303

278-
int64_t cufft_get_plan_cache_size_impl() {
279-
std::lock_guard<std::mutex> guard(plan_cache_mutex);
280-
return plan_cache.size();
304+
int64_t cufft_get_plan_cache_size_impl(int64_t device_index) {
305+
AT_CHECK(0 <= device_index && device_index < at::detail::getCUDAHooks().getNumGPUs(),
306+
"cufft_get_plan_cache_size: expected 0 <= device_index < ",
307+
at::detail::getCUDAHooks().getNumGPUs(), "], but got device_index=",
308+
device_index);
309+
return cufft_get_plan_cache(device_index).size();
281310
}
282311

283-
void cufft_clear_plan_cache_impl() {
284-
std::lock_guard<std::mutex> guard(plan_cache_mutex);
285-
return plan_cache.clear();
312+
void cufft_clear_plan_cache_impl(int64_t device_index) {
313+
AT_CHECK(0 <= device_index && device_index < at::detail::getCUDAHooks().getNumGPUs(),
314+
"cufft_clear_plan_cache: expected 0 <= device_index < ",
315+
at::detail::getCUDAHooks().getNumGPUs(), "], but got device_index=",
316+
device_index);
317+
return cufft_get_plan_cache(device_index).clear();
286318
}
287319

288320
} // namespace at::native::detail
@@ -293,6 +325,9 @@ Tensor _fft_cufft(const Tensor& self, int64_t signal_ndim,
293325
bool complex_input, bool complex_output, bool inverse,
294326
IntArrayRef checked_signal_sizes, bool normalized, bool onesided,
295327
IntArrayRef output_sizes) {
328+
329+
CuFFTParamsLRUCache& plan_cache = cufft_get_plan_cache(self.device().index());
330+
296331
Tensor input = self;
297332
bool input_was_cloned = false;
298333

@@ -334,7 +369,7 @@ Tensor _fft_cufft(const Tensor& self, int64_t signal_ndim,
334369
CuFFTParams params;
335370
setCuFFTParams(&params, input, signal_ndim, complex_input,
336371
complex_output, checked_signal_sizes, onesided);
337-
std::lock_guard<std::mutex> guard(plan_cache_mutex);
372+
std::lock_guard<std::mutex> guard(plan_cache.mutex);
338373
if (plan_cache.max_size() > 0) { // check again after acquiring the lock
339374
const CuFFTConfig &config = plan_cache.try_emplace_value(std::move(params),
340375
input, signal_ndim, complex_input,

aten/src/ATen/native/native_functions.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -917,13 +917,13 @@
917917
CPU: _fft_mkl
918918
CUDA: _fft_cufft
919919

920-
- func: _cufft_get_plan_cache_size() -> int
920+
- func: _cufft_get_plan_cache_size(int device_index) -> int
921921

922-
- func: _cufft_get_plan_cache_max_size() -> int
922+
- func: _cufft_get_plan_cache_max_size(int device_index) -> int
923923

924-
- func: _cufft_set_plan_cache_max_size(int max_size) -> void
924+
- func: _cufft_set_plan_cache_max_size(int device_index, int max_size) -> void
925925

926-
- func: _cufft_clear_plan_cache() -> void
926+
- func: _cufft_clear_plan_cache(int device_index) -> void
927927

928928
- func: index(Tensor self, Tensor?[] indices) -> Tensor
929929
variants: function, method

docs/source/notes/cuda.rst

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,3 +272,31 @@ There are significant caveats to using CUDA models with
272272
:mod:`~torch.multiprocessing`; unless care is taken to meet the data handling
273273
requirements exactly, it is likely that your program will have incorrect or
274274
undefined behavior.
275+
276+
.. _cufft-plan-cache:
277+
278+
cuFFT plan cache
279+
^^^^^^^^^^^^^^^^
280+
281+
For each CUDA device, an LRU cache of cuFFT plans is used to speed up repeatedly
282+
running FFT methods (e.g., :func:`torch.fft`) on CUDA tensors of same geometry
283+
with same configuration. Because some cuFFT plans may allocate GPU memory,
284+
these caches have a maximum capacity.
285+
286+
You may control and query the properties of the cache of current device with
287+
the following APIs:
288+
289+
* ``torch.backends.cuda.cufft_plan_cache.max_size`` gives the capacity of the
290+
cache (default is 4096 on CUDA 10 and newer, and 1023 on older CUDA versions).
291+
Setting this value directly modifies the capacity.
292+
293+
* ``torch.backends.cuda.cufft_plan_cache.size`` gives the number of plans
294+
currently residing in the cache.
295+
296+
* ``torch.backends.cuda.cufft_plan_cache.clear()`` clears the cache.
297+
298+
To control and query plan caches of a non-default device, you can index the
299+
``torch.backends.cuda.cufft_plan_cache`` object with either a :class:`torch.device`
300+
object or a device index, and access one of the above attributes. E.g., to set
301+
the capacity of the cache for device ``1``, one can write
302+
``torch.backends.cuda.cufft_plan_cache[1].max_size = 10``.

test/test_cuda.py

Lines changed: 46 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2192,11 +2192,15 @@ def test_fft_ifft_rfft_irfft(self):
21922192
_TestTorchMixin._test_fft_ifft_rfft_irfft(self, device=torch.device('cuda'))
21932193

21942194
@contextmanager
2195-
def plan_cache_max_size(n):
2196-
original = torch.backends.cuda.cufft_plan_cache.max_size
2197-
torch.backends.cuda.cufft_plan_cache.max_size = n
2195+
def plan_cache_max_size(n, device=None):
2196+
if device is None:
2197+
plan_cache = torch.backends.cuda.cufft_plan_cache
2198+
else:
2199+
plan_cache = torch.backends.cuda.cufft_plan_cache[device]
2200+
original = plan_cache.max_size
2201+
plan_cache.max_size = n
21982202
yield
2199-
torch.backends.cuda.cufft_plan_cache.max_size = original
2203+
plan_cache.max_size = original
22002204

22012205
with plan_cache_max_size(max(1, torch.backends.cuda.cufft_plan_cache.size - 10)):
22022206
_TestTorchMixin._test_fft_ifft_rfft_irfft(self, device=torch.device('cuda'))
@@ -2216,6 +2220,44 @@ def plan_cache_max_size(n):
22162220
with self.assertRaisesRegex(RuntimeError, r"read-only property"):
22172221
torch.backends.cuda.cufft_plan_cache.size = -1
22182222

2223+
with self.assertRaisesRegex(RuntimeError, r"but got device with index"):
2224+
torch.backends.cuda.cufft_plan_cache[torch.cuda.device_count() + 10]
2225+
2226+
if TEST_MULTIGPU:
2227+
# Test that different GPU has different cache
2228+
x0 = torch.randn(2, 3, 3, device='cuda:0')
2229+
x1 = x0.cuda(1)
2230+
self.assertEqual(x0.rfft(2), x1.rfft(2))
2231+
# If a plan is used across different devices, the following line (or
2232+
# the assert above) would trigger illegal memory access. Other ways
2233+
# to trigger the error include
2234+
# (1) setting CUDA_LAUNCH_BLOCKING=1 (pytorch/pytorch#19224) and
2235+
# (2) printing a device 1 tensor.
2236+
x0.copy_(x1)
2237+
2238+
# Test that un-indexed `torch.backends.cuda.cufft_plan_cache` uses current device
2239+
with plan_cache_max_size(10, device='cuda:0'):
2240+
with plan_cache_max_size(11, device='cuda:1'):
2241+
self.assertEqual(torch.backends.cuda.cufft_plan_cache[0].max_size, 10)
2242+
self.assertEqual(torch.backends.cuda.cufft_plan_cache[1].max_size, 11)
2243+
2244+
self.assertEqual(torch.backends.cuda.cufft_plan_cache.max_size, 10) # default is cuda:0
2245+
with torch.cuda.device(1):
2246+
self.assertEqual(torch.backends.cuda.cufft_plan_cache.max_size, 11) # default is cuda:1
2247+
with torch.cuda.device(0):
2248+
self.assertEqual(torch.backends.cuda.cufft_plan_cache.max_size, 10) # default is cuda:0
2249+
2250+
self.assertEqual(torch.backends.cuda.cufft_plan_cache[0].max_size, 10)
2251+
with torch.cuda.device(1):
2252+
with plan_cache_max_size(11): # default is cuda:1
2253+
self.assertEqual(torch.backends.cuda.cufft_plan_cache[0].max_size, 10)
2254+
self.assertEqual(torch.backends.cuda.cufft_plan_cache[1].max_size, 11)
2255+
2256+
self.assertEqual(torch.backends.cuda.cufft_plan_cache.max_size, 11) # default is cuda:1
2257+
with torch.cuda.device(0):
2258+
self.assertEqual(torch.backends.cuda.cufft_plan_cache.max_size, 10) # default is cuda:0
2259+
self.assertEqual(torch.backends.cuda.cufft_plan_cache.max_size, 11) # default is cuda:1
2260+
22192261
def test_stft(self):
22202262
_TestTorchMixin._test_stft(self, device=torch.device('cuda'))
22212263

0 commit comments

Comments
 (0)