Skip to content

Commit 8d8a38e

Browse files
zou3519 authored and facebook-github-bot committed
Error out on in-place (unary) ops on tensors that have internal overlap (#17927)
Summary: Pull Request resolved: pytorch/pytorch#17927 ghimport-source-id: 626d321e430b6b5c0ea3aa1eb9df8c1e2d058bf8 Stack: * #17926 Implement at::has_internal_overlap helper function * **#17927 Error out on in-place (unary) ops on tensors that have internal overlap** On the way to #17935. Works for CPU and CUDA on the following ops: - abs_, acos_, asin_, atan_, ceil_, cos_, erf_, erfc_, exp_, expm1_ - floor_, log_, log10_, log1p_, log2_, round_, rsqrt_, - sin_, sqrt_, tan_, tanh_, trunc_ This PR adds a check to see if the out/result tensor has internal overlap. If it does, then we error out because the result **may** be incorrect. This is overly conservative; there are some cases where if the result is the same as the input, the inplace operation is OK (such as floor_, round_, and trunc_). However, the current code isn't organized in such a way that this is easy to check, so enabling those will come in the future. Reviewed By: ezyang Differential Revision: D14438871 fbshipit-source-id: 15e12bf1fdb2ab7f74bb806e22bc74840bd6abd1
1 parent ad88371 commit 8d8a38e

File tree

4 files changed

+19
-7
lines changed

4 files changed

+19
-7
lines changed

aten/src/ATen/MemoryOverlap.cpp

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,24 +4,30 @@
44
namespace at {
55

66
MemOverlap has_internal_overlap(const Tensor& tensor) {
7-
auto* t = tensor.unsafeGetTensorImpl();
7+
return has_internal_overlap(tensor.unsafeGetTensorImpl());
8+
}
89

9-
AT_ASSERT(tensor.layout() == kStrided);
10+
MemOverlap has_internal_overlap(TensorImpl* t) {
11+
AT_ASSERT(t->layout() == kStrided);
1012

1113
if (t->is_contiguous()) {
1214
return MemOverlap::NO;
1315
}
1416

1517
auto strides = t->strides();
16-
if (std::find_if(
17-
strides.begin(), strides.end(), [](int s) { return s == 0; })) {
18+
if (strides.end() != std::find_if(
19+
strides.begin(), strides.end(), [](int64_t s) { return s == 0; })) {
1820
return MemOverlap::YES;
1921
}
2022

2123
return MemOverlap::TOO_HARD;
2224
}
2325

2426
void assert_no_internal_overlap(const Tensor& t, std::string op) {
27+
assert_no_internal_overlap(t.unsafeGetTensorImpl(), op);
28+
}
29+
30+
void assert_no_internal_overlap(TensorImpl* t, std::string op) {
2531
if (has_internal_overlap(t) == MemOverlap::YES) {
2632
AT_ERROR(
2733
op, ": unsupported operation: more than one element of the written-to "

aten/src/ATen/MemoryOverlap.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,10 @@ namespace at {
1313
// NB: Please update the python test for these if you renumber them.
1414
enum class MemOverlap { NO, YES, TOO_HARD };
1515

16-
MemOverlap has_internal_overlap(const Tensor& t);
16+
CAFFE2_API MemOverlap has_internal_overlap(const Tensor& t);
17+
CAFFE2_API MemOverlap has_internal_overlap(TensorImpl* t);
1718

18-
void assert_no_internal_overlap(const Tensor& t, std::string op);
19+
CAFFE2_API void assert_no_internal_overlap(const Tensor& t, std::string op);
20+
CAFFE2_API void assert_no_internal_overlap(TensorImpl* t, std::string op);
1921

2022
}

aten/src/ATen/native/cpu/UnaryOpsKernel.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include <ATen/CPUGenerator.h>
88
#include <ATen/CheckGenerator.h>
99
#include <ATen/Generator.h>
10+
#include <ATen/MemoryOverlap.h>
1011
#include <ATen/cpu/vml.h>
1112
#include <ATen/CPUApplyUtils.h>
1213
#include <ATen/native/DispatchStub.h>
@@ -183,6 +184,7 @@ void bernoulli_mkl_kernel(Tensor &self, const double p, Generator* gen) {
183184
result.data<scalar_t>(), self.data<scalar_t>(), self.numel()); \
184185
\
185186
} else { \
187+
assert_no_internal_overlap(result, #op); \
186188
static constexpr int64_t WIDTH = 131072 / sizeof(scalar_t); \
187189
CPU_tensor_parallel_kernel_apply2<scalar_t, scalar_t>( \
188190
result, \
@@ -211,7 +213,6 @@ void bernoulli_mkl_kernel(Tensor &self, const double p, Generator* gen) {
211213
}); \
212214
} \
213215
REGISTER_DISPATCH(op##Impl, &op##_kernel)
214-
215216
} // anonymous namespace
216217

217218
REGISTER_DISPATCH(sigmoidImpl, &sigmoid_kernel)

aten/src/THC/generic/THCTensorMathPointwise.cu

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
#define THC_GENERIC_FILE "THC/generic/THCTensorMathPointwise.cu"
33
#else
44

5+
#include <ATen/MemoryOverlap.h>
6+
57
#define IMPLEMENT_CUDA_TENSOR_BASIC_FUNC_(NAME, CFUNC, REAL) \
68
struct Tensor_##NAME##_##REAL##_Op { \
79
__device__ __forceinline__ void operator()(scalar_t* out, scalar_t* in) const { \
@@ -15,6 +17,7 @@
1517
\
1618
void THCTensor_(NAME)(THCState* state, THCTensor* self_, THCTensor* src) { \
1719
THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); \
20+
at::assert_no_internal_overlap(self_, #NAME); \
1821
if (self_ == src) { \
1922
if (!THC_pointwiseApply1<scalar_t>(state, self_, Tensor_##NAME##_##REAL##_Op())) { \
2023
THArgCheck(false, 2, CUTORCH_DIM_WARNING); \

0 commit comments

Comments (0)