Merge pull request #1222 from bstatcomp/feature/issue-1221-opencl-prim-mdivide-left-tri

rok-cesnovar · web-flow · commit 22072c62a610 · 2019-06-08T08:06:17.000+02:00
Feature/issue 1221 OpenCL implementation of primitive mdivide_left_tri
diff --git a/stan/math/opencl/cholesky_decompose.hpp b/stan/math/opencl/cholesky_decompose.hpp
@@ -6,7 +6,7 @@
 #include <stan/math/opencl/kernels/cholesky_decompose.hpp>
 #include <stan/math/opencl/multiply.hpp>
 #include <stan/math/opencl/multiply_transpose.hpp>
-#include <stan/math/opencl/lower_tri_inverse.hpp>
+#include <stan/math/opencl/tri_inverse.hpp>
 #include <stan/math/opencl/transpose.hpp>
 #include <stan/math/opencl/subtract.hpp>
 #include <stan/math/opencl/err/check_diagonal_zeros.hpp>
@@ -78,7 +78,7 @@ inline void cholesky_decompose(matrix_cl& A) {
   // and copies the resulting submatrix to the lower left hand corner of A
   matrix_cl L_21
       = opencl::multiply<TriangularViewCL::Entire, TriangularViewCL::Upper>(
-          A_21, transpose(lower_triangular_inverse(A_11)));
+          A_21, transpose(tri_inverse<TriangularViewCL::Lower>(A_11)));
   A.sub_block(L_21, 0, 0, block, 0, block_subset, block);
   matrix_cl A_22(block_subset, block_subset);
   A_22.sub_block(A, block, block, 0, 0, block_subset, block_subset);
diff --git a/stan/math/opencl/kernels/diag_inv.hpp b/stan/math/opencl/kernels/diag_inv.hpp
@@ -36,7 +36,7 @@ static const char* diag_inv_kernel_code = STRINGIFY(
      * @param rows The number of rows for A.
      * @note Code is a <code>const char*</code> held in
      * <code>diag_inv_kernel_code.</code>
-     *  Used in math/opencl/lower_tri_inverse.hpp.
+     *  Used in math/opencl/tri_inverse.hpp.
      *  This kernel uses the helper macros available in helpers.cl.
      */
     __kernel void diag_inv(__global double* A, __global double* tmp_inv,
diff --git a/stan/math/opencl/kernels/inv_lower_tri_multiply.hpp b/stan/math/opencl/kernels/inv_lower_tri_multiply.hpp
@@ -39,7 +39,7 @@ static const char* inv_lower_tri_multiply_kernel_code = STRINGIFY(
      * @param rows The number of rows in a single matrix of the batch
      * @note Code is a <code>const char*</code> held in
      * <code>inv_lower_tri_multiply_kernel_code.</code>
-     *  Used in math/opencl/lower_tri_inverse.hpp.
+     *  Used in math/opencl/tri_inverse.hpp.
      *  This kernel uses the helper macros available in helpers.cl.
      */
     __kernel void inv_lower_tri_multiply(__global double* A,
diff --git a/stan/math/opencl/kernels/neg_rect_lower_tri_multiply.hpp b/stan/math/opencl/kernels/neg_rect_lower_tri_multiply.hpp
@@ -33,7 +33,7 @@ static const char* neg_rect_lower_tri_multiply_kernel_code = STRINGIFY(
      * @param rows The number of rows in a single matrix of the batch
      * @note Code is a <code>const char*</code> held in
      *  neg_rect_lower_tri_multiply_kernel_code
-     *  Used in math/opencl/lower_tri_inverse.hpp.
+     *  Used in math/opencl/tri_inverse.hpp.
      *  This kernel uses the helper macros available in helpers.cl.
      */
     __kernel void neg_rect_lower_tri_multiply(
diff --git a/stan/math/opencl/opencl.hpp b/stan/math/opencl/opencl.hpp
@@ -9,7 +9,7 @@
 #include <stan/math/opencl/cholesky_decompose.hpp>
 #include <stan/math/opencl/diagonal_multiply.hpp>
 #include <stan/math/opencl/identity.hpp>
-#include <stan/math/opencl/lower_tri_inverse.hpp>
+#include <stan/math/opencl/tri_inverse.hpp>
 #include <stan/math/opencl/matrix_cl.hpp>
 #include <stan/math/opencl/multiply.hpp>
 #include <stan/math/opencl/multiply_transpose.hpp>
diff --git a/stan/math/opencl/opencl_context.hpp b/stan/math/opencl/opencl_context.hpp
@@ -195,6 +195,9 @@ class opencl_context_base {
     int cholesky_rev_block_partition = 8;
     // used in math/opencl/multiply
     int multiply_split_upper_limit = 2000000;
+    // minimum A.rows() for the OpenCL path; used in
+    // math/prim/mat/fun/mdivide_left_tri and math/rev/mat/fun/mdivide_left_tri
+    int tri_inverse_size_worth_transfer = 100;
   } tuning_opts_;
 
   static opencl_context_base& getInstance() {
diff --git a/stan/math/opencl/tri_inverse.hpp b/stan/math/opencl/tri_inverse.hpp
@@ -1,13 +1,14 @@
-#ifndef STAN_MATH_OPENCL_LOWER_TRI_INVERSE_HPP
-#define STAN_MATH_OPENCL_LOWER_TRI_INVERSE_HPP
+#ifndef STAN_MATH_OPENCL_TRI_INVERSE_HPP
+#define STAN_MATH_OPENCL_TRI_INVERSE_HPP
 
 #ifdef STAN_OPENCL
 #include <stan/math/opencl/matrix_cl.hpp>
+#include <stan/math/opencl/constants.hpp>
 #include <stan/math/opencl/kernels/diag_inv.hpp>
 #include <stan/math/opencl/kernels/inv_lower_tri_multiply.hpp>
 #include <stan/math/opencl/kernels/neg_rect_lower_tri_multiply.hpp>
 #include <stan/math/opencl/err/check_opencl.hpp>
-
+#include <stan/math/opencl/transpose.hpp>
 #include <stan/math/opencl/identity.hpp>
 #include <stan/math/opencl/err/check_square.hpp>
 #include <stan/math/opencl/sub_block.hpp>
@@ -19,22 +20,26 @@
 namespace stan {
 namespace math {
 /**
- * Computes the inverse of the lower triangular matrix
+ * Computes the inverse of a triangular matrix
  *
  * For a full guide to how this works and fits into Cholesky decompositions,
  * see the reference report
  * <a href="https://github.com/SteveBronder/stancon2018/blob/master/report.pdf">
  * here</a> and kernel doc
  * <a href="https://github.com/stan-dev/math/wiki/GPU-Kernels">here</a>.
  *
+ * @tparam triangular_view whether A is Lower or Upper triangular (not Entire)
  * @param A matrix on the OpenCL device
  * @return the inverse of A
  *
  * @throw <code>std::invalid_argument</code> if the matrix
  *    is not square
  */
-inline matrix_cl lower_triangular_inverse(const matrix_cl& A) {
-  check_square("lower_triangular_inverse (OpenCL)", "A", A);
+template <TriangularViewCL triangular_view>
+inline matrix_cl tri_inverse(const matrix_cl& A) {
+  static_assert(triangular_view != TriangularViewCL::Entire,
+                "tri_inverse(OpenCL) only supports triangular input matrices");
+  check_square("tri_inverse (OpenCL)", "A", A);
 
   int thread_block_2D_dim = 32;
   int max_1D_thread_block_size = opencl_context.max_thread_block_size();
@@ -69,7 +74,9 @@ inline matrix_cl lower_triangular_inverse(const matrix_cl& A) {
   zero_mat.zeros<stan::math::TriangularViewCL::Entire>();
   temp.zeros<stan::math::TriangularViewCL::Entire>();
   inv_padded.zeros<stan::math::TriangularViewCL::Entire>();
-
+  if (triangular_view == TriangularViewCL::Upper) {
+    inv_mat = transpose(inv_mat);
+  }
   int work_per_thread
       = opencl_kernels::inv_lower_tri_multiply.make_functor.get_opts().at(
           "WORK_PER_THREAD");
@@ -95,6 +102,9 @@ inline matrix_cl lower_triangular_inverse(const matrix_cl& A) {
   inv_padded.zeros<stan::math::TriangularViewCL::Upper>();
   if (parts == 1) {
     inv_mat.sub_block(inv_padded, 0, 0, 0, 0, inv_mat.rows(), inv_mat.rows());
+    if (triangular_view == TriangularViewCL::Upper) {
+      inv_mat = transpose(inv_mat);
+    }
     return inv_mat;
   }
   parts = ceil(parts / 2.0);
@@ -132,7 +142,10 @@ inline matrix_cl lower_triangular_inverse(const matrix_cl& A) {
     inv_padded.zeros<stan::math::TriangularViewCL::Upper>();
   }
   // un-pad and return
-  inv_mat.sub_block(inv_padded, 0, 0, 0, 0, A.rows(), A.rows());
+  inv_mat.sub_block(inv_padded, 0, 0, 0, 0, inv_mat.rows(), inv_mat.rows());
+  if (triangular_view == TriangularViewCL::Upper) {
+    inv_mat = transpose(inv_mat);
+  }
   return inv_mat;
 }
 }  // namespace math
diff --git a/stan/math/prim/mat/fun/mdivide_left_tri.hpp b/stan/math/prim/mat/fun/mdivide_left_tri.hpp
@@ -6,14 +6,27 @@
 #include <stan/math/prim/mat/fun/promote_common.hpp>
 #include <stan/math/prim/mat/err/check_multiplicable.hpp>
 #include <stan/math/prim/mat/err/check_square.hpp>
-
+#ifdef STAN_OPENCL
+#include <stan/math/opencl/opencl_context.hpp>
+#include <stan/math/opencl/multiply.hpp>
+#include <stan/math/opencl/tri_inverse.hpp>
+#include <stan/math/opencl/transpose.hpp>
+#include <stan/math/opencl/copy.hpp>
+#endif
 namespace stan {
 namespace math {
 
 /**
- * Returns the solution of the system Ax=b when A is triangular
- * @param A Triangular matrix.  Specify upper or lower with TriView
- * being Eigen::Upper or Eigen::Lower.
+ * Returns the solution of the system Ax=b when A is triangular.
+ * @tparam TriView Specifies whether A is upper (Eigen::Upper)
+ * or lower triangular (Eigen::Lower).
+ * @tparam T1 type of elements in A
+ * @tparam T2 type of elements in b
+ * @tparam R1 number of rows in A
+ * @tparam C1 number of columns in A
+ * @tparam R2 number of rows in b
+ * @tparam C2 number of columns in b
+ * @param A Triangular matrix.
  * @param b Right hand side matrix or vector.
  * @return x = A^-1 b, solution of the linear system.
  * @throws std::domain_error if A is not square or the rows of b don't
@@ -36,8 +49,10 @@ mdivide_left_tri(const Eigen::Matrix<T1, R1, C1> &A,
 
 /**
  * Returns the solution of the system Ax=b when A is triangular and b=I.
- * @param A Triangular matrix.  Specify upper or lower with TriView
- * being Eigen::Upper or Eigen::Lower.
+ * @tparam T type of elements in A
+ * @tparam R1 number of rows in A
+ * @tparam C1 number of columns in A
+ * @param A Triangular matrix; TriView selects Eigen::Upper or Eigen::Lower.
  * @return x = A^-1 .
  * @throws std::domain_error if A is not square
  */
@@ -52,6 +67,85 @@ inline Eigen::Matrix<T, R1, C1> mdivide_left_tri(
   return b;
 }
 
+/**
+ * Returns the solution of the system Ax=b for triangular A; A and b hold
+ * doubles. Under STAN_OPENCL a large enough A is solved via tri_inverse.
+ * @tparam TriView Specifies whether A is upper (Eigen::Upper)
+ * or lower triangular (Eigen::Lower).
+ * @tparam R1 number of rows in A
+ * @tparam C1 number of columns in A
+ * @tparam R2 number of rows in b
+ * @tparam C2 number of columns in b
+ * @param A Triangular matrix.
+ * @param b Right hand side matrix or vector.
+ * @return x = A^-1 b, solution of the linear system.
+ * @throws std::domain_error if A is not square or the rows of b don't
+ * match the size of A.
+ */
+template <int TriView, int R1, int C1, int R2, int C2>
+inline Eigen::Matrix<double, R1, C2> mdivide_left_tri(
+    const Eigen::Matrix<double, R1, C1> &A,
+    const Eigen::Matrix<double, R2, C2> &b) {
+  check_square("mdivide_left_tri", "A", A);
+  check_multiplicable("mdivide_left_tri", "A", A, "b", b);
+#ifdef STAN_OPENCL
+  if (A.rows()
+      >= opencl_context.tuning_opts().tri_inverse_size_worth_transfer) {
+    matrix_cl A_cl(A);  // matrix_cl copy of A
+    matrix_cl b_cl(b);  // matrix_cl copy of b
+    matrix_cl A_inv_cl(A.rows(), A.cols());  // placeholder, assigned below
+    if (TriView == Eigen::Lower) {
+      A_inv_cl = tri_inverse<TriangularViewCL::Lower>(A_cl);
+    } else {  // any TriView other than Eigen::Lower is treated as upper
+      A_inv_cl = tri_inverse<TriangularViewCL::Upper>(A_cl);
+    }
+    matrix_cl C_cl = A_inv_cl * b_cl;  // x = A^-1 b
+    return from_matrix_cl(C_cl);
+  } else {  // A.rows() is below the tuning threshold
+#endif  // STAN_OPENCL
+    return A.template triangularView<TriView>().solve(b);  // Eigen solve
+#ifdef STAN_OPENCL
+  }
+#endif  // STAN_OPENCL
+}
+
+/**
+ * Returns the solution of the system Ax=b when A is triangular, b=I and
+ * both hold doubles. Under STAN_OPENCL a large enough A uses tri_inverse.
+ * @tparam TriView Specifies whether A is upper (Eigen::Upper)
+ * or lower triangular (Eigen::Lower).
+ * @tparam R1 number of rows in A
+ * @tparam C1 number of columns in A
+ * @param A Triangular matrix.
+ * @return x = A^-1 .
+ * @throws std::domain_error if A is not square
+ */
+template <int TriView, int R1, int C1>
+inline Eigen::Matrix<double, R1, C1> mdivide_left_tri(
+    const Eigen::Matrix<double, R1, C1> &A) {
+  check_square("mdivide_left_tri", "A", A);
+  const int n = A.rows();  // only read by the Eigen branch below
+#ifdef STAN_OPENCL
+  if (A.rows()
+      >= opencl_context.tuning_opts().tri_inverse_size_worth_transfer) {
+    matrix_cl A_cl(A);  // matrix_cl copy of A
+    if (TriView == Eigen::Lower) {
+      A_cl = tri_inverse<TriangularViewCL::Lower>(A_cl);  // result replaces A_cl
+    } else {  // any TriView other than Eigen::Lower is treated as upper
+      A_cl = tri_inverse<TriangularViewCL::Upper>(A_cl);  // result replaces A_cl
+    }
+    return from_matrix_cl(A_cl);
+  } else {  // A.rows() is below the tuning threshold
+#endif  // STAN_OPENCL
+    Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic> b;
+    b.setIdentity(n, n);  // b = I, so the in-place solve leaves A^-1 in b
+    A.template triangularView<TriView>().solveInPlace(b);
+    return b;
+#ifdef STAN_OPENCL
+  }
+#endif  // STAN_OPENCL
+}
+
 }  // namespace math
 }  // namespace stan
 #endif
diff --git a/stan/math/rev/mat/fun/cholesky_decompose.hpp b/stan/math/rev/mat/fun/cholesky_decompose.hpp
@@ -299,7 +299,7 @@ class cholesky_opencl : public vari {
     L_adj = opencl::multiply<TriangularViewCL::Upper, TriangularViewCL::Entire>(
         transpose(L), L_adj);
     L_adj.triangular_transpose<TriangularMapCL::LowerToUpper>();
-    L = transpose(lower_triangular_inverse(L));
+    L = transpose(tri_inverse<TriangularViewCL::Lower>(L));
     L_adj = L
             * transpose(opencl::multiply<TriangularViewCL::Upper,
                                          TriangularViewCL::Entire>(L, L_adj));
@@ -360,7 +360,7 @@ class cholesky_opencl : public vari {
 
       C_adj
           = opencl::multiply<TriangularViewCL::Entire, TriangularViewCL::Lower>(
-              C_adj, lower_triangular_inverse(D));
+              C_adj, tri_inverse<TriangularViewCL::Lower>(D));
       B_adj = B_adj - C_adj * R;
       D_adj = D_adj - transpose(C_adj) * C;
 
diff --git a/test/unit/math/opencl/lower_tri_inverse_test.cpp b/test/unit/math/opencl/lower_tri_inverse_test.cpp
diff --git a/test/unit/math/opencl/tri_inverse_test.cpp b/test/unit/math/opencl/tri_inverse_test.cpp
diff --git a/test/unit/math/prim/mat/fun/mdivide_left_tri_test.cpp b/test/unit/math/prim/mat/fun/mdivide_left_tri_test.cpp