Skip to content
This repository was archived by the owner on Nov 15, 2022. It is now read-only.

Commit 3300e3b

Browse files
authored
Specialized mask kernel for to_mask(dim=2) (#466)
1 parent 6161ad1 commit 3300e3b

File tree

3 files changed

+25
-9
lines changed

3 files changed

+25
-9
lines changed

nestedtensor/csrc/cuda/mha.cpp

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -48,10 +48,6 @@ at::Tensor bt_min_mha(
4848
// auto start = std::chrono::system_clock::now();
4949
auto options =
5050
torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA);
51-
at::Tensor input_mask = to_mask(query, 2);
52-
input_mask = input_mask.to(options);
53-
int64_t batch_size = input_mask.size(0);
54-
int64_t seq_len = input_mask.size(1);
5551
int64_t embedding_dim = head_dim * num_heads; //*(opt_sizes[2]);
5652
int64_t head_num = num_heads;
5753
int64_t size_per_head = embedding_dim / head_num;
@@ -65,6 +61,8 @@ at::Tensor bt_min_mha(
6561
at::Tensor query_buf = packed_padded_chunks[0];
6662
at::Tensor key_buf = packed_padded_chunks[1];
6763
at::Tensor val_buf = packed_padded_chunks[2];
64+
int64_t batch_size = query_buf.size(0);
65+
int64_t seq_len = query_buf.size(1);
6866

6967
query_buf = query_buf.reshape({batch_size, seq_len, head_num, size_per_head}).transpose(1, 2);
7068
key_buf = key_buf.reshape({batch_size, seq_len, head_num, size_per_head}).transpose(1, 2);
@@ -75,6 +73,8 @@ at::Tensor bt_min_mha(
7573

7674
auto mask_options =
7775
torch::TensorOptions().dtype(query.dtype()).device(torch::kCUDA);
76+
at::Tensor input_mask = to_mask(query, 2);
77+
input_mask = input_mask.to(options);
7878
at::Tensor attr_mask = input_mask.view({-1, 1, 1, seq_len}).to(mask_options);
7979
attr_mask = attr_mask * attr_mask.transpose(2, 3);
8080

nestedtensor/csrc/masking.cpp

Lines changed: 19 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -433,6 +433,22 @@ Tensor to_mask(
433433
for (int64_t i = 1; i < *mask_dim; i++) {
434434
max_size.push_back(tmp_max_size[i - 1]);
435435
}
436+
if (*mask_dim == 2 && get_dim(nt) == 3) {
437+
auto nt_size = get_efficient_nested_size(nt);
438+
auto esizes = nt_size.sizes();
439+
auto options = torch::TensorOptions().dtype(torch::kByte);
440+
auto result = torch::zeros({*opt_sizes[0], tmp_max_size[0]},
441+
options);
442+
uint8_t* result_data = result.data_ptr<uint8_t>();
443+
int64_t* esizes_ptr = esizes.data_ptr<int64_t>();
444+
for (int64_t i = 0; i < esizes.size(0); i++) {
445+
int64_t length = esizes_ptr[i * esizes.size(1)];
446+
for (int64_t j = 0; j < length; j++) {
447+
result_data[i * result.size(1) + j] = 1;
448+
}
449+
}
450+
return result;
451+
}
436452
return _create_nt_mask(get_efficient_nested_size(nt), max_size);
437453
}
438454
max_size = get_max_size(nt);
@@ -525,13 +541,13 @@ Tensor _collapse_two_dims_3(Tensor input, int64_t dim1, int64_t dim2) {
525541
auto input_esizes = get_efficient_nested_size(input);
526542
Tensor nt_sizes = input_esizes.sizes();
527543

528-
Tensor sizes_dim1 = at::native::narrow(nt_sizes, 1, 0, 1).contiguous();
529-
Tensor sizes_dim2 = at::native::narrow(nt_sizes, 1, 1, 1).contiguous();
544+
Tensor sizes_dim1 = at::native::narrow(nt_sizes, 1, 0, 1);
545+
Tensor sizes_dim2 = at::native::narrow(nt_sizes, 1, 1, 1);
530546

531547
Tensor new_nt_sizes;
532548
if (dim1 == 1) {
533549
Tensor collapsed_sizes = sizes_dim1 * sizes_dim2;
534-
new_nt_sizes = collapsed_sizes;
550+
new_nt_sizes = collapsed_sizes.contiguous();
535551
}
536552
auto new_esizes = torch::nested_tensor::EfficientSizeNode(input_esizes.structure(), new_nt_sizes);
537553
Tensor result = wrap_buffer(get_buffer(input), new_esizes);

nestedtensor/version.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
1-
__version__ = '0.1.4+986cfd5'
2-
git_version = '986cfd55e2d0c8139a5e19cfca6efc740ea7ad23'
1+
__version__ = '0.1.4+5b45731'
2+
git_version = '5b457313bfb6578b43d76282b321657bf85ee1b3'
33
from nestedtensor import _C
44
if hasattr(_C, 'CUDA_VERSION'):
55
cuda = _C.CUDA_VERSION

0 commit comments

Comments (0)