
Commit f791fd3

zheng-xq authored and Mikhail Zolotukhin committed
Add end-to-end support and a PyTorch fuser example on CudaCodeGen (pytorch#104)
1 parent 3d9c42b commit f791fd3

12 files changed (+229, -71 lines)

test/test_tensorexpr.py

Lines changed: 20 additions & 0 deletions
@@ -15,6 +15,26 @@ def easy(x, y):
     np.testing.assert_allclose(a.numpy() + b.numpy(), x.numpy())
 
 
+# TODO: combine this with the test_easy
+def test_easy_cuda():
+    if not torch.cuda.is_available():
+        return
+
+    def easy(x, y):
+        aaa = torch.add(x, y)
+        return aaa
+
+    traced = torch.jit.trace(easy, (torch.rand(32, 16, device='cuda'), torch.rand(32, 16, device='cuda')))
+
+    a = torch.rand(32, 16, device='cuda')
+    b = torch.rand(32, 16, device='cuda')
+    x = traced(a, b)
+    a_cpu = a.cpu()
+    b_cpu = b.cpu()
+    x_cpu = x.cpu()
+    np.testing.assert_allclose(a_cpu.numpy() + b_cpu.numpy(), x_cpu.numpy())
+
+
 def test_three_arg():
     def easy(x, y, z):
         aaa = torch.add(x, y)

torch/csrc/jit/passes/tensorexpr_fuser.cpp

Lines changed: 139 additions & 53 deletions
@@ -8,6 +8,7 @@
 #include <torch/csrc/jit/passes/dead_code_elimination.h>
 #include <torch/csrc/jit/passes/utils/subgraph_utils.h>
 #include <torch/csrc/jit/tensorexpr/buffer.h>
+#include <torch/csrc/jit/tensorexpr/cuda_codegen.h>
 #include <torch/csrc/jit/tensorexpr/eval.h>
 #include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
 #include <torch/csrc/jit/tensorexpr/schedule.h>
@@ -303,13 +304,23 @@ std::vector<Expr> computeIndicesToBroadcast(
   return bcast;
 }
 
-struct TensorExprKernel {
-  std::vector<Buffer> buffer_args;
-  std::vector<Tensor> tensor_outputs;
-  std::unordered_map<int64_t, Tensor> tensors;
-  std::unique_ptr<CodeGen> codegen;
-  KernelArena kernel_arena;
-
+class TensorExprKernel {
+ private:
+  enum BackendType {
+    kUninitialized,
+    kSimpleIREval,
+    kLLVMCodeGen,
+    kCudaCodeGen,
+  };
+  std::vector<Buffer> buffer_args_;
+  std::vector<Tensor> tensor_outputs_;
+  std::unordered_map<int64_t, Tensor> tensors_;
+  std::unique_ptr<CodeGen> codegen_;
+  KernelArena kernel_arena_;
+  BackendType backend_type_ = BackendType::kUninitialized;
+  at::Device device_ = at::kCPU;
+
+ private:
   Expr constant(torch::jit::Value* v) {
     if (v->node()->kind() == prim::Constant) {
       const auto val = toIValue(v).value();
@@ -332,8 +343,12 @@ struct TensorExprKernel {
   }
 
   template <typename T>
-  Expr chunk(const T& t, size_t chunk_idx, size_t dim, size_t chunks,
-             const std::vector<Var>& axes) {
+  Expr chunk(
+      const T& t,
+      size_t chunk_idx,
+      size_t dim,
+      size_t chunks,
+      const std::vector<Var>& axes) {
     auto sizes = bufferSizes(t);
     size_t step = sizes[dim] / chunks;
 
@@ -375,8 +390,8 @@ struct TensorExprKernel {
   }
 
   Expr tensorOrConstant(torch::jit::Value* v, const std::vector<Var>& axes) {
-    auto ti = tensors.find(v->unique());
-    if (ti != tensors.end()) {
+    auto ti = tensors_.find(v->unique());
+    if (ti != tensors_.end()) {
       return broadcast(ti->second, axes);
     }
     return constant(v);
@@ -699,22 +714,115 @@ struct TensorExprKernel {
     }
   }
 
+  void LowerToBackend(BackendType backend_type) {
+    torch::jit::tensorexpr::schedule::Schedule sch(tensor_outputs_);
+
+    // Compute non-output tensors_ inline
+    for (auto& p : tensors_) {
+      p.second.ComputeInline();
+    }
+    if (backend_type == kCudaCodeGen) {
+      for (auto& output : tensor_outputs_) {
+        // TODO: implement the universal fused dispatching config.
+        if (output.args().size() < 2) {
+          throw std::runtime_error(
+              "Only tensors with more than 2D is supported in CudaCodeGen");
+        }
+        Var x = output.arg(0);
+        Var y = output.arg(1);
+        output.GPUExecConfig({x}, {y});
+      }
+    }
+
+    Stmt stmt = sch.Lower();
+
+    // Set up formal params (inputs, then outputs) for kernel.
+    std::vector<CodeGen::BufferArg> params(
+        buffer_args_.begin(), buffer_args_.end());
+    for (auto& o : tensor_outputs_) {
+      params.push_back(o);
+    }
+
+    // Generate code.
+    switch (backend_type_) {
+      case kCudaCodeGen:
+        codegen_ = std::make_unique<CudaCodeGen>(stmt, params);
+        break;
+      case kLLVMCodeGen:
+        codegen_ = std::make_unique<LLVMCodeGen>(stmt, params);
+        break;
+      case kSimpleIREval:
+        codegen_ = std::make_unique<SimpleIREvaluator>(stmt, params);
+        break;
+      default:
+        throw std::runtime_error("invalid backend type");
+    }
+  }
+
+  void PickAndCheckBackendType(const at::ArrayRef<IValue>& inputs) {
+    at::Device device = inputs[0].toTensor().device();
+    BackendType backend_type = BackendType::kUninitialized;
+    if (device.type() == at::kCUDA) {
+      backend_type = kCudaCodeGen;
+    } else if (device.type() == at::kCPU) {
+#ifdef ENABLE_LLVM
+      backend_type = kLLVMCodeGen;
+#else
+      backend_type = kSimpleIREval;
+      ;
+#endif
+    } else {
+      throw std::runtime_error("Invalid device type");
+    }
+
+    if (backend_type_ == kUninitialized) {
+      backend_type_ = backend_type;
+      device_ = device;
+      LowerToBackend(backend_type);
+    } else if (backend_type_ != backend_type) {
+      // TODO: if we have to support muliptole backends with the same subgraph,
+      // we need to add kernel caching.
+      throw std::runtime_error(
+          "Inconsistent backend_type: " + std::to_string(backend_type_) +
+          " vs " + std::to_string(backend_type));
+    }
+  }
+
+  void CodeGenRun(const std::vector<CodeGen::CallArg>& run_args) {
+    if (backend_type_ == kCudaCodeGen || backend_type_ == kSimpleIREval) {
+      codegen_->call(run_args);
+    } else if (backend_type_ == kLLVMCodeGen) {
+      for (int i = 0; i < buffer_args_.size(); i++) {
+        codegen_->bind(buffer_args_[i], run_args[i]);
+      }
+      int offset = buffer_args_.size();
+      for (int i = 0; i < tensor_outputs_.size(); i++) {
+        codegen_->bind(tensor_outputs_[i], run_args[i + offset]);
+      }
+      codegen_->run();
+    } else {
+      throw std::runtime_error(
+          "Invalid backend type: " + std::to_string(backend_type_));
+    }
+  }
+
+ public:
   explicit TensorExprKernel(const Node* node) {
-    KernelScope kernel_scope(kernel_arena);
+    KernelScope kernel_scope(kernel_arena_);
     auto subgraph = node->g(attr::Subgraph);
 
     // Bind inputs to buffers.
     for (auto const& input : subgraph->inputs()) {
       Buffer in_buffer = texprBuffer(input);
-      tensors.emplace(
+      tensors_.emplace(
           input->unique(),
           Compute(
              "input",
              texprDims(input),
              [this, in_buffer](const std::vector<Var>& axes) {
                return broadcast(in_buffer, axes);
              }));
-      buffer_args.push_back(std::move(in_buffer));
+      buffer_args_.push_back(std::move(in_buffer));
     }
 
     // Bind nodes to tensor compute expressions.
@@ -730,58 +838,36 @@ struct TensorExprKernel {
      }
    }
 
-    // Move output operands from `tensors` to `tensor_outputs`
+    // Move output operands from `tensors_` to `tensor_outputs_`
    for (const auto& output : subgraph->outputs()) {
-      CHECK(tensors.count(output->unique())) << "Output must be a tensor";
-      tensor_outputs.emplace_back(tensors.at(output->unique()));
-      tensors.erase(output->unique());
+      CHECK(tensors_.count(output->unique())) << "Output must be a tensor";
+      tensor_outputs_.emplace_back(tensors_.at(output->unique()));
+      tensors_.erase(output->unique());
    }
-
-    torch::jit::tensorexpr::schedule::Schedule sch(tensor_outputs);
-
-    // Compute non-output tensors inline
-    for (auto& p : tensors) {
-      p.second.ComputeInline();
-    }
-    Stmt stmt = sch.Lower();
-
-#if TX_DEBUG
-    std::cerr << stmt << "\n";
-#endif
-
-#ifdef ENABLE_LLVM
-    // Set up formal params (inputs, then outputs) for kernel.
-    std::vector<CodeGen::BufferArg> params(
-        buffer_args.begin(), buffer_args.end());
-    for (auto& o : tensor_outputs) {
-      params.push_back(o);
-    }
-
-    // Generate code.
-    codegen = std::make_unique<LLVMCodeGen>(stmt, params);
-#else
-    codegen = std::make_unique<SimpleIREvaluator>(stmt);
-#endif
   }
 
   void run(Stack& stack) {
-    KernelScope kernel_scope(kernel_arena);
+    KernelScope kernel_scope(kernel_arena_);
     // Set up arguments (inputs, then outputs) for kernel call.
-    auto inputs = last(stack, buffer_args.size());
-    for (int i = 0; i < buffer_args.size(); i++) {
-      codegen->bind(buffer_args[i], inputs[i].toTensor().data_ptr());
+    auto inputs = last(stack, buffer_args_.size());
+    PickAndCheckBackendType(inputs);
+
+    std::vector<CodeGen::CallArg> run_args;
+    for (int i = 0; i < buffer_args_.size(); i++) {
+      run_args.push_back(inputs[i].toTensor().data_ptr());
     }
     std::vector<at::Tensor> outputs;
-    for (auto& o : tensor_outputs) {
-      outputs.push_back(at::empty(bufferSizes(o), tensorType(o)));
-      codegen->bind(o, outputs.back().data_ptr());
+    for (auto& o : tensor_outputs_) {
+      outputs.push_back(at::empty(
+          bufferSizes(o), c10::TensorOptions(tensorType(o)).device(device_)));
+      run_args.push_back(outputs.back().data_ptr());
    }
 
    // Call the kernel.
-    codegen->run();
+    CodeGenRun(run_args);
 
    // Update the stack.
-    drop(stack, buffer_args.size());
+    drop(stack, buffer_args_.size());
    for (auto& o : outputs) {
      push_one(stack, std::move(o));
    }
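Taken together, the new run() path is: pick a backend from the device of the first runtime input, lower the fused subgraph once for that backend, then hand the generated kernel a flat list of data pointers. The device-to-backend choice can be summarized by the standalone sketch below; it mirrors PickAndCheckBackendType from the diff above, but the free function pickBackend and the bare enum are illustrative only and not part of the commit.

#include <ATen/ATen.h>
#include <stdexcept>

// Illustrative stand-in for TensorExprKernel::BackendType in the diff above.
enum BackendType { kUninitialized, kSimpleIREval, kLLVMCodeGen, kCudaCodeGen };

// Mirrors PickAndCheckBackendType: CUDA inputs go to the new CudaCodeGen,
// CPU inputs go to LLVMCodeGen when it is compiled in, and otherwise fall
// back to the SimpleIREvaluator interpreter.
BackendType pickBackend(const at::Device& device) {
  if (device.type() == at::kCUDA) {
    return kCudaCodeGen;
  }
  if (device.type() == at::kCPU) {
#ifdef ENABLE_LLVM
    return kLLVMCodeGen;
#else
    return kSimpleIREval;
#endif
  }
  throw std::runtime_error("Invalid device type");
}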

torch/csrc/jit/tensorexpr/codegen.h

Lines changed: 12 additions & 0 deletions
@@ -21,11 +21,19 @@ class CodeGen {
       : ir_node_(const_cast<BaseStmtNode*>(stmt.node())),
         buffer_args_({BufferArg(ts)...}) {}
 
+  CodeGen(const Stmt& stmt, const std::vector<BufferArg>& buffer_args)
+      : ir_node_(const_cast<BaseStmtNode*>(stmt.node())),
+        buffer_args_(buffer_args) {}
+
   template <typename... Ts>
   CodeGen(const Expr& expr, Ts... ts)
       : ir_node_(const_cast<BaseExprNode*>(expr.node())),
         buffer_args_({BufferArg(ts)...}) {}
 
+  CodeGen(const Expr& expr, const std::vector<BufferArg>& buffer_args)
+      : ir_node_(const_cast<BaseExprNode*>(expr.node())),
+        buffer_args_(buffer_args) {}
+
   CodeGen(const IRNode* node) : ir_node_(const_cast<IRNode*>(node)) {}
 
   virtual ~CodeGen() {}
@@ -54,6 +62,10 @@ class CodeGen {
     LOG(FATAL) << "Unimplemented interface";
   }
 
+  TORCH_API virtual void call(const std::vector<CallArg>& args) {
+    LOG(FATAL) << "unimplemented call";
+  }
+
 private:
  IRNode* ir_node_ = nullptr;
  std::vector<BufferArg> buffer_args_;
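The two vector-based constructors and the virtual call() give every backend a uniform contract: a codegen object is constructed from a lowered Stmt plus the ordered BufferArgs (inputs first, then outputs), and invoked with a matching vector of CallArgs carrying the runtime pointers, which is how the fuser drives CudaCodeGen and SimpleIREvaluator above. Below is a minimal sketch of a backend written against this interface; MyBackend is hypothetical, and only the constructor overload and the call() override come from this header.

#include <torch/csrc/jit/tensorexpr/codegen.h>
#include <vector>

using namespace torch::jit::tensorexpr;

// Hypothetical backend built on the new CodeGen interface: take the ordered
// BufferArgs at construction time and receive the runtime data pointers as a
// vector<CallArg> through the virtual call().
class MyBackend : public CodeGen {
 public:
  MyBackend(const Stmt& stmt, const std::vector<BufferArg>& buffer_args)
      : CodeGen(stmt, buffer_args) {}

  void call(const std::vector<CallArg>& args) override {
    // args[i] is the pointer for the i-th BufferArg, inputs followed by
    // outputs in the order the fuser pushed them. A real backend would bind
    // these and launch its generated kernel here.
    (void)args;  // illustrative stub; silences unused-parameter warnings
  }
};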

torch/csrc/jit/tensorexpr/cuda_codegen.cpp

Lines changed: 4 additions & 4 deletions
@@ -90,7 +90,7 @@ void CudaPrinter::visit(const For* v) {
   const LoopOptions& loop_options = v->loop_options();
   if (loop_options.is_gpu_block_index()) {
     ScopedVarName var_name(
-        name_manager_, v->var().node(), loop_options.gpu_block_index_str());
+        name_manager(), v->var().node(), loop_options.gpu_block_index_str());
     v->body().accept(this);
     int gpu_block_index = loop_options.gpu_block_index();
     if (gpu_block_extents_.size() <= gpu_block_index) {
@@ -104,7 +104,7 @@ void CudaPrinter::visit(const For* v) {
     gpu_block_extents_[gpu_block_index] = v->stop();
   } else if (loop_options.is_gpu_thread_index()) {
     ScopedVarName var_name(
-        name_manager_, v->var().node(), loop_options.gpu_thread_index_str());
+        name_manager(), v->var().node(), loop_options.gpu_thread_index_str());
     v->body().accept(this);
     int gpu_thread_index = loop_options.gpu_thread_index();
     if (gpu_thread_extents_.size() <= gpu_thread_index) {
@@ -122,7 +122,7 @@ void CudaPrinter::visit(const For* v) {
 }
 
 void CudaCodeGen::Initialize() {
-  printer_.reset(new CudaPrinter(&oss_, &name_manager_));
+  printer_.reset(new CudaPrinter(&oss_));
   // TODO: handle multiple kernels.
   // TODO: handle dynamic dimension.
   // TODO: call nvrtc.
@@ -135,7 +135,7 @@ void CudaCodeGen::Initialize() {
     const BufferArg& buffer_arg = buffer_args[i];
     const Var& var = buffer_arg.var();
     Dtype dtype = buffer_arg.dtype();
-    oss_ << dtype.ToCppString() << "* " << name_manager_.get_unique_name(var);
+    oss_ << dtype.ToCppString() << "* " << name_manager()->get_unique_name(var);
   }
   oss_ << ") {";
 
