Skip to content

Commit 0c73c32

Browse files
author
Mikhail Zolotukhin
committed
[RFC] Add LoopNest class that implements Schedule's API in a different way.
LoopNest is my attempt to simplify our core abstraction. The main idea behind this change is to merge two classes: `TensorExprNode` and `For` (derived from `Stmt`). Currently they represent basically the same thing, but in a slightly different way. `TensorExprNode` attaches some metadata and provides a different way of traversing through siblings/parents/children. `For` represents the same structure, but without any metadata. Once a kernel is lowered to `For` statements, they are immediately consumed by a codegen, which lowers them to LLVMIR or prints as a CUDA string. This PR adds some functionality to `For` statements (and to other types of statements as well) and implements `SplitWithTail` and `ComputeInline` using only those. The implementation is just a proof of concept: it doesn't cover all corner cases, but they are trivial to add. As a demo, I added a test where we create a simple tensor-expression, then split one of the axes and then lower it to a Stmt. The demo shows that we're producing exactly the same result. 
For the reference, below is the output of the test (Root stmt - produced by the new implementation, Ref stmt - the product of the existing one): ``` [ RUN ] TensorExprTest.LoopNest_LLVM Root stmt: for (int n = 0; n < N; n++) { for (int i = 0; i < 1024; i++) { for (int j_outer = 0; j_outer < ((256 - 0) / 17); j_outer++) { for (int j_inner = 0; j_inner < 17; j_inner++) { g[(((n * (1024 * 256)) + (i * 256)) + (((j_outer * 17) + j_inner) * 1))] = (((A[(((n * ((1 * 256) * 1024)) + (i * (1 * 256))) + ((j_outer * 17) + j_inner))] + B[(((n * ((1 * 256) * 1024)) + (i * (1 * 256))) + ((j_outer * 17) + j_inner))]) + C[(((n * ((1 * 256) * 1024)) + (i * (1 * 256))) + ((j_outer * 17) + j_inner))]) + D[(((n * ((1 * 256) * 1024)) + (i * (1 * 256))) + ((j_outer * 17) + j_inner))]); } } for (int j_tail = 0; j_tail < ((256 - 0) % 17); j_tail++) { g[(((n * (1024 * 256)) + (i * 256)) + ((j_tail + (((256 - 0) / 17) * 17)) * 1))] = (((A[(((n * ((1 * 256) * 1024)) + (i * (1 * 256))) + (j_tail + (((256 - 0) / 17) * 17)))] + B[(((n * ((1 * 256) * 1024)) + (i * (1 * 256))) + (j_tail + (((256 - 0) / 17) * 17)))]) + C[(((n * ((1 * 256) * 1024)) + (i * (1 * 256))) + (j_tail + (((256 - 0) / 17) * 17)))]) + D[(((n * ((1 * 256) * 1024)) + (i * (1 * 256))) + (j_tail + (((256 - 0) / 17) * 17)))]); } } } Ref stmt: for (int n = 0; n < N; n++) { for (int i = 0; i < 1024; i++) { for (int j_outer = 0; j_outer < ((256 - 0) / 17); j_outer++) { for (int j_inner = 0; j_inner < 17; j_inner++) { g[(((n * (1024 * 256)) + (i * 256)) + (((j_outer * 17) + j_inner) * 1))] = (((A[(((n * ((1 * 256) * 1024)) + (i * (1 * 256))) + ((j_outer * 17) + j_inner))] + B[(((n * ((1 * 256) * 1024)) + (i * (1 * 256))) + ((j_outer * 17) + j_inner))]) + C[(((n * ((1 * 256) * 1024)) + (i * (1 * 256))) + ((j_outer * 17) + j_inner))]) + D[(((n * ((1 * 256) * 1024)) + (i * (1 * 256))) + ((j_outer * 17) + j_inner))]); } } for (int j_tail = 0; j_tail < ((256 - 0) % 17); j_tail++) { g[(((n * (1024 * 256)) + (i * 256)) + ((j_tail + (((256 
- 0) / 17) * 17)) * 1))] = (((A[(((n * ((1 * 256) * 1024)) + (i * (1 * 256))) + (j_tail + (((256 - 0) / 17) * 17)))] + B[(((n * ((1 * 256) * 1024)) + (i * (1 * 256))) + (j_tail + (((256 - 0) / 17) * 17)))]) + C[(((n * ((1 * 256) * 1024)) + (i * (1 * 256))) + (j_tail + (((256 - 0) / 17) * 17)))]) + D[(((n * ((1 * 256) * 1024)) + (i * (1 * 256))) + (j_tail + (((256 - 0) / 17) * 17)))]); } } } [ OK ] TensorExprTest.LoopNest_LLVM (3 ms) ```
1 parent af20070 commit 0c73c32

File tree

5 files changed

+223
-4
lines changed

5 files changed

+223
-4
lines changed

test/cpp/tensorexpr/test_schedule.cpp

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -545,5 +545,67 @@ void testScheduleDynamicShape2D() {
545545
testWithSize(37, 11);
546546
}
547547

548+
void testLoopNest() {
549+
550+
KernelScope kernel_scope;
551+
const int kVectorSize = 8;
552+
const int kVectorCount = 128;
553+
const int kSize1 = 1024;
554+
const int kSize2 = 256;
555+
556+
VarHandle n("N", kHandle);
557+
Buffer a(VarHandle("A", kHandle), kFloat, {n, ExprHandle(kSize1), ExprHandle(kSize2)});
558+
Buffer b(VarHandle("B", kHandle), kFloat, {n, ExprHandle(kSize1), ExprHandle(kSize2)});
559+
Buffer c(VarHandle("C", kHandle), kFloat, {n, ExprHandle(kSize1), ExprHandle(kSize2)});
560+
Buffer d(VarHandle("D", kHandle), kFloat, {n, ExprHandle(kSize1), ExprHandle(kSize2)});
561+
562+
Tensor* e = Compute(
563+
"e",
564+
{{n, "n"}, {kSize1, "i"}, {kSize2, "j"}},
565+
[&](const VarHandle& n, const VarHandle& i, const VarHandle& j) {
566+
return a(n, i, j) + b(n, i, j);
567+
});
568+
Tensor* f = Compute(
569+
"f",
570+
{{n, "n"}, {kSize1, "i"}, {kSize2, "j"}},
571+
[&](const VarHandle& n, const VarHandle& i, const VarHandle& j) {
572+
return (*e)(n, i, j) + c(n, i, j);
573+
});
574+
Tensor* g = Compute(
575+
"g",
576+
{{n, "n"}, {kSize1, "i"}, {kSize2, "j"}},
577+
[&](const VarHandle& n, const VarHandle& i, const VarHandle& j) {
578+
return (*f)(n, i, j) + d(n, i, j);
579+
});
580+
581+
582+
// NEW API:
583+
{
584+
LoopNest l({e, f, g});
585+
l.ComputeInline(l.getLoopBodyFor(e));
586+
l.ComputeInline(l.getLoopBodyFor(f));
587+
std::vector<Stmt*> loops =
588+
l.getLoopStmtsFor(g); // gives a list of loops from outer to inner
589+
Stmt *j_outer, *j_inner, *j_tail;
590+
l.SplitWithTail(loops[2], 17, &j_outer, &j_inner, &j_tail);
591+
l.ApplyInlines();
592+
std::cerr << "Root stmt:\n" << *l.root_stmt();
593+
}
594+
595+
// CURRENT API:
596+
{
597+
Schedule sch({g});
598+
e->ComputeInline();
599+
f->ComputeInline();
600+
VarHandle j(g->function()->arg(2));
601+
VarHandle j_outer, j_inner, j_tail;
602+
TensorOperation* tail_op;
603+
g->SplitWithTail(j, 17, true, &j_outer, &j_inner, &j_tail, &tail_op);
604+
Stmt* s = sch.Lower();
605+
std::cerr << "Ref stmt:\n" << *s;
606+
}
607+
// Produced Stmts are identical in both Current and New APIs
608+
}
609+
548610
} // namespace jit
549611
} // namespace torch

test/cpp/tensorexpr/tests.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,8 @@ namespace jit {
160160
_(LLVMBindDynamicShapeAdd) \
161161
_(LLVMTensorDynamicShapeAdd) \
162162
_(LLVMDynamicShape2D) \
163-
_(LLVMIfThenElseTest)
163+
_(LLVMIfThenElseTest) \
164+
_(LoopNest)
164165

165166
#define TH_FORALL_TESTS_CUDA(_) \
166167
_(CudaTestVectorAdd01) \

torch/csrc/jit/tensorexpr/schedule.cpp

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -873,6 +873,109 @@ LoopAxis* LoopAxisTransform::NewAxis(
873873
return axis;
874874
}
875875

876+
// XXX
877+
LoopNest::LoopNest(const std::vector<Tensor*> tensors_to_compute) {
878+
std::vector<Tensor*> output_tensors(tensors_to_compute);
879+
880+
std::vector<Stmt*> loops;
881+
for (Tensor *t : tensors_to_compute) {
882+
Stmt* loop = LowerToStmt(t);
883+
loops.push_back(loop);
884+
}
885+
root_stmt_ = new Block(loops);
886+
}
887+
888+
// Lowers a single tensor to a nest of For statements wrapping its
// element statement, and records the stmt<->tensor correspondence so
// scheduling primitives can map statements back to tensors.
Stmt* LoopNest::LowerToStmt(Tensor* t) {
  Function* f = t->function();
  // TODO: Support multiple-output functions
  Stmt* stmt = f->ElementStmt(0);

  stmt_to_tensor_[stmt] = t;
  tensor_to_stmt_[t] = stmt;

  CHECK(f->ndim() >= 1);
  // Wrap the element statement in loops, innermost dimension first, so
  // that the first axis ends up outermost.
  for (size_t dim = f->ndim(); dim-- > 0;) {
    Range r(0, ExprHandle(f->dim(dim)));
    stmt = For::make(VarHandle(f->arg(dim)), r.start(), r.stop(), stmt);
  }
  return stmt;
}
906+
907+
// Marks the function whose loop body is `s` for inlining.  The actual
// rewrite is deferred until ApplyInlines().
void LoopNest::ComputeInline(Stmt* s) {
  // TODO: check if `s` is a body of a loop
  Tensor* t = stmt_to_tensor_.at(s);
  inlined_functions_.insert(t->function());
}
911+
912+
void LoopNest::ApplyInlines() {
913+
// TODO: check if `s` is a body of a loop
914+
std::vector<Function*> inlined_functions_vec(
915+
inlined_functions_.begin(), inlined_functions_.end());
916+
root_stmt_ = InjectInlines(root_stmt_, inlined_functions_vec);
917+
}
918+
919+
// Splits the For loop `s` by `factor` into an outer/inner pair plus a
// tail loop for the leftover iterations, replacing `s` in its parent
// Block and appending the tail loop after it.
//
// Out-parameters are filled in the order (outer, inner, tail).  This
// matches both the caller in test_schedule.cpp and the existing
// Schedule::SplitWithTail convention; the previous parameter order
// (inner, outer, tail) silently handed the inner loop to callers that
// passed &j_outer first.
void LoopNest::SplitWithTail(
    Stmt* s,
    int factor,
    Stmt** outer,
    Stmt** inner,
    Stmt** tail) {
  Block* p = dynamic_cast<Block*>(s->parent_);
  For* f = dynamic_cast<For*>(s);
  if (!f) {
    std::cerr << "Stmt is not a For loop!\n";
    return;
  }
  if (!p) {
    std::cerr << "Parent is not a Block!\n";
    return;
  }
  auto const& size = ExprHandle(f->stop()) - ExprHandle(f->start());
  auto const& split_count = size / factor;
  auto const& tail_size = size % factor;

  // TODO: handle a special case when the bounds are known and no tail loop is
  // needed.

  const std::string& loop_var_name = f->var()->name_hint();
  Dtype loop_var_dtype = f->var()->dtype();

  VarHandle i_inner(loop_var_name + "_inner", loop_var_dtype);
  VarHandle i_outer(loop_var_name + "_outer", loop_var_dtype);
  VarHandle i_tail(loop_var_name + "_tail", loop_var_dtype);

  // x -> x.outer * inner.size + x.inner
  auto combined_index1 = i_outer * factor + i_inner;
  // x -> x.tail + outer.size * inner.size
  auto combined_index2 = i_tail + split_count * factor;

  // Rewrite two copies of the body in terms of the new loop variables.
  Stmt* body_inner = Substitute(f->body(), {{f->var(), combined_index1}});
  Stmt* body_tail = Substitute(f->body(), {{f->var(), combined_index2}});

  *inner = For::make(i_inner, 0, factor, body_inner);
  *outer = For::make(i_outer, 0, split_count, *inner);
  *tail = For::make(i_tail, 0, tail_size, body_tail);

  // TODO: cleanup API for adding/removing statements
  p->replace_stmt(s, *outer);
  p->append_stmt(*tail);

  // TODO: record history of transformations
}
962+
963+
// Returns the For statements enclosing the computation of `t`, ordered
// from outermost to innermost.  Walks parent_ links from the loop body
// up to the root, then reverses the collected list.
std::vector<Stmt*> LoopNest::getLoopStmtsFor(Tensor* t) const {
  std::vector<Stmt*> loops;
  for (Stmt* cur = tensor_to_stmt_.at(t); cur; cur = cur->parent_) {
    if (dynamic_cast<For*>(cur)) {
      loops.push_back(cur);
    }
  }
  // Collected innermost-first; hand back outermost-first.
  return std::vector<Stmt*>(loops.rbegin(), loops.rend());
}
974+
975+
// Returns the innermost statement computing tensor `t` (the element
// statement recorded by LowerToStmt).  Throws std::out_of_range if `t`
// was not one of the tensors this LoopNest was built from.
Stmt* LoopNest::getLoopBodyFor(Tensor* t) const {
  return tensor_to_stmt_.at(t);
}
978+
876979
} // namespace schedule
877980
} // namespace tensorexpr
878981
} // namespace jit

torch/csrc/jit/tensorexpr/schedule.h

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -671,6 +671,29 @@ class TORCH_API Schedule {
671671
ScheduleNode* node_ = nullptr;
672672
};
673673

674+
class TORCH_API LoopNest {
675+
public:
676+
LoopNest(const std::vector<Tensor*> tensors_to_compute);
677+
Stmt* root_stmt() const {
678+
return root_stmt_;
679+
}
680+
681+
std::vector<Stmt*> getLoopStmtsFor(Tensor*) const;
682+
Stmt* getLoopBodyFor(Tensor*) const;
683+
std::unordered_map<Tensor*, Stmt*> tensor_to_stmt_;
684+
685+
void ComputeInline(Stmt* s);
686+
void ApplyInlines();
687+
void SplitWithTail(Stmt *s, int factor, Stmt** inner, Stmt **outer, Stmt **tail);
688+
689+
private:
690+
Stmt* LowerToStmt(Tensor *t);
691+
692+
std::unordered_set<Function*> inlined_functions_;
693+
std::unordered_map<Stmt*, Tensor*> stmt_to_tensor_;
694+
Stmt* root_stmt_;
695+
};
696+
674697
} // namespace schedule
675698
} // namespace tensorexpr
676699
} // namespace jit

torch/csrc/jit/tensorexpr/stmt.h

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ class Stmt : public KernelScopedObject {
1616
Stmt() {}
1717
TORCH_API virtual void accept(IRVisitor* visitor) const = 0;
1818
virtual Stmt* accept_mutator(IRMutator* mutator) = 0;
19+
20+
Stmt* parent_ = nullptr;
1921
};
2022

2123
template <class Op>
@@ -84,9 +86,26 @@ class Block : public StmtNode<Block> {
8486
Stmt* stmt(int index) const {
8587
return stmts_[index];
8688
}
89+
// Appends `s` to the end of this block and claims its parent link.
// (Previously parent_ was left untouched, so statements appended after
// construction — e.g. the tail loop from SplitWithTail — kept a
// stale/null parent and could not be found by parent walks.)
void append_stmt(Stmt* s) {
  s->parent_ = this;
  stmts_.push_back(s);
}
92+
// Replaces the first occurrence of `old_stmt` with `new_stmt`, fixing
// up both statements' parent links (previously new_stmt->parent_ was
// never set, leaving it stale/null).  Returns false if `old_stmt` is
// not a direct child of this block.
bool replace_stmt(Stmt* old_stmt, Stmt* new_stmt) {
  for (size_t i = 0; i < stmts_.size(); i++) {
    if (stmts_[i] == old_stmt) {
      stmts_[i] = new_stmt;
      new_stmt->parent_ = this;
      old_stmt->parent_ = nullptr;
      return true;
    }
  }
  return false;
}
87101

102+
// Constructs a block from `stmts`, claiming each child's parent link so
// that transforms (e.g. LoopNest) can walk up the statement tree.
explicit Block(const std::vector<Stmt*>& stmts) : stmts_(stmts) {
  for (auto s : stmts) {
    s->parent_ = this;
  }
}
88107
private:
89-
explicit Block(const std::vector<Stmt*>& stmts) : stmts_(stmts) {}
108+
// TODO: change to a list to facilitate insertions and removals
90109
std::vector<Stmt*> stmts_;
91110
};
92111

@@ -358,8 +377,14 @@ class For : public StmtNode<For> {
358377
}
359378

360379
// Constructs a loop over [start, stop) with induction variable `var`.
// The body is normalized to a Block so that scheduling transforms can
// splice statements in and out uniformly, and the block's parent link
// is set to this loop.
For(const Var* var, const Expr* start, const Expr* stop, Stmt* body)
    : var_(var), start_(start), stop_(stop) {
  CHECK(var && start && stop && body);
  // Wrap a bare statement in a single-element Block if needed.
  Block *b = dynamic_cast<Block*>(body);
  if (!b) {
    b = new Block({body});
  }
  body_ = b;
  body_->parent_ = this;
}
364389

365390
For(const Var* var,
@@ -370,9 +395,14 @@ class For : public StmtNode<For> {
370395
: var_(var),
371396
start_(start),
372397
stop_(stop),
373-
body_(body),
374398
loop_options_(loop_options) {
375399
CHECK(var && start && stop && body);
400+
Block *b = dynamic_cast<Block*>(body);
401+
if (!b) {
402+
b = new Block({body});
403+
}
404+
body_ = b;
405+
body_->parent_ = this;
376406
}
377407

378408
private:

0 commit comments

Comments
 (0)