Seryilmaz/fused dropout softmax (NVIDIA#985)
* fuse dropout into softmax in fprop for additive mask case
seryilmaz authored Dec 4, 2020
1 parent 6c186b3 commit 3fe10b5
Showing 9 changed files with 1,019 additions and 140 deletions.
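The one-line commit message above is the only description of the change, so here is a rough, hypothetical sketch of the technique for orientation: in the forward pass, the additive-mask softmax and the dropout that follows it run in a single kernel, which also records the byte dropout mask for backward. Everything below is illustrative only (kernel name, parameters, naive one-thread-per-row layout, and the use of cuRAND's Philox rather than the hand-rolled philox.h header this commit adds); it is not the code in the diff.

#include <cfloat>
#include <curand_kernel.h>

// Naive sketch: one thread handles one attention row (the real kernels are warp-parallel).
__global__ void fused_masked_softmax_dropout_sketch(
    float* out,                  // dropout(softmax(x + mask)), fed to the next GEMM
    unsigned char* dropout_mask, // 1 = kept, 0 = dropped; saved for backward
    const float* x,              // bmm1 results (pre-softmax attention scores)
    const float* add_mask,       // additive padding mask (0 / large-negative values)
    int rows, int cols, int rows_per_mask,
    float keep_prob, unsigned long long seed, unsigned long long offset) {
  int row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row >= rows) return;

  const float* xr = x + (size_t)row * cols;
  // One pad-mask row is shared by a group of attention rows
  // (all heads and query positions of one sequence).
  const float* mr = add_mask + (size_t)(row / rows_per_mask) * cols;
  float* yr = out + (size_t)row * cols;
  unsigned char* dr = dropout_mask + (size_t)row * cols;

  // Softmax over the additively masked scores.
  float mx = -FLT_MAX;
  for (int j = 0; j < cols; ++j) mx = fmaxf(mx, xr[j] + mr[j]);
  float sum = 0.f;
  for (int j = 0; j < cols; ++j) { yr[j] = __expf(xr[j] + mr[j] - mx); sum += yr[j]; }
  float inv_sum = 1.f / sum;

  // Dropout fused into the same pass: no separate at::_fused_dropout launch and
  // no extra round trip through memory for the softmax output.
  curandStatePhilox4_32_10_t rng;
  curand_init(seed, row, offset, &rng);
  for (int j = 0; j < cols; ++j) {
    unsigned char keep = curand_uniform(&rng) < keep_prob ? 1 : 0;
    dr[j] = keep;
    yr[j] = keep ? yr[j] * inv_sum / keep_prob : 0.f;
  }
}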
@@ -113,15 +113,15 @@ torch::Tensor bwd_cuda(

// Apply Dropout Mask and Scale by Dropout Probability
// Softmax Grad
dispatch_masked_scale_softmax_backward<half, half, float,false>(
dispatch_masked_scale_softmax_backward_stream<half, half, float,false>(
static_cast<half*>(output_grads.data_ptr()),
static_cast<half*>(output_grads.data_ptr()),
reinterpret_cast<half const*>(softmax_results.data_ptr()),
static_cast<uint8_t const*>(dropout_mask.data_ptr()),
1.0/(1.0-dropout_prob),
k_seq_len,
k_seq_len,
attn_batches*q_seq_len);
attn_batches*q_seq_len, stream);
//backward pass is completely in-place
return output_grads;
}
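The only change in this hunk is that the softmax-backward dispatch now takes an explicit CUDA stream (the *_stream suffix). As a hedged illustration, assuming the standard ATen helper, the caller would obtain that stream roughly as below; this helper is not part of the excerpt shown here.

#include <ATen/cuda/CUDAContext.h>

// Illustrative only: PyTorch's current CUDA stream for the active device,
// passed through to the *_stream dispatch calls so the kernels launch on it.
static cudaStream_t current_stream() {
  return at::cuda::getCurrentCUDAStream().stream();
}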
@@ -115,17 +115,17 @@ torch::Tensor bwd_cuda(
// Apply Dropout Mask and Scale by Dropout Probability
// Softmax Grad
if (padding_mask == nullptr) {
dispatch_masked_scale_softmax_backward<half, half, float,false>(
dispatch_masked_scale_softmax_backward_stream<half, half, float,false>(
static_cast<half*>(output_grads.data_ptr()),
static_cast<half*>(output_grads.data_ptr()),
reinterpret_cast<half const*>(softmax_results.data_ptr()),
static_cast<uint8_t const*>(dropout_mask.data_ptr()),
1.0/(1.0-dropout_prob),
k_seq_len,
k_seq_len,
attn_batches*q_seq_len);
attn_batches*q_seq_len, stream);
} else{
dispatch_masked_scale_softmax_backward_masked_out<half, half, float,false>(
dispatch_masked_scale_softmax_backward_masked_out_stream<half, half, float,false>(
static_cast<half*>(output_grads.data_ptr()),
static_cast<half*>(output_grads.data_ptr()),
reinterpret_cast<half const*>(softmax_results.data_ptr()),
@@ -135,7 +135,7 @@ torch::Tensor bwd_cuda(
k_seq_len,
k_seq_len,
attn_batches*q_seq_len,
heads);
heads, stream);

}
//backward pass is completely in-place
apex/contrib/csrc/multihead_attn/philox.h (new file: 90 additions, 0 deletions)
@@ -0,0 +1,90 @@
#pragma once
//Philox CUDA.

class Philox {
public:
__device__ inline Philox(unsigned long long seed,
unsigned long long subsequence,
unsigned long long offset) {
key.x = (unsigned int)seed;
key.y = (unsigned int)(seed >> 32);
counter = make_uint4(0, 0, 0, 0);
counter.z = (unsigned int)(subsequence);
counter.w = (unsigned int)(subsequence >> 32);
STATE = 0;
incr_n(offset / 4);
}
__device__ inline uint4 operator()() {
if(STATE == 0) {
uint4 counter_ = counter;
uint2 key_ = key;
//7-round philox
for(int i = 0; i < 6; i++) {
counter_ = single_round(counter_, key_);
key_.x += (kPhilox10A); key_.y += (kPhilox10B);
}
output = single_round(counter_, key_);
incr();
}
//return a float4 directly
//unsigned long ret;
//switch(STATE) {
// case 0: ret = output.x; break;
// case 1: ret = output.y; break;
// case 2: ret = output.z; break;
// case 3: ret = output.w; break;
//}
//STATE = (STATE + 1) % 4;
return output;
}
private:
uint4 counter;
uint4 output;
uint2 key;
unsigned int STATE;
__device__ inline void incr_n(unsigned long long n) {
unsigned int nlo = (unsigned int)(n);
unsigned int nhi = (unsigned int)(n >> 32);
counter.x += nlo;
if (counter.x < nlo)
nhi++;
counter.y += nhi;
if (nhi <= counter.y)
return;
if (++counter.z)
return;
++counter.w;
}
__device__ inline void incr() {
if (++counter.x)
return;
if (++counter.y)
return;
if (++counter.z)
return;
++counter.w;
}
__device__ unsigned int mulhilo32(unsigned int a, unsigned int b,
unsigned int *result_high) {
*result_high = __umulhi(a, b);
return a*b;
}
__device__ inline uint4 single_round(uint4 ctr, uint2 key) {
unsigned int hi0;
unsigned int hi1;
unsigned int lo0 = mulhilo32(kPhiloxSA, ctr.x, &hi0);
unsigned int lo1 = mulhilo32(kPhiloxSB, ctr.z, &hi1);
uint4 ret = {hi1 ^ ctr.y ^ key.x, lo1, hi0 ^ ctr.w ^ key.y, lo0};
return ret;
}
static const unsigned long kPhilox10A = 0x9E3779B9;
static const unsigned long kPhilox10B = 0xBB67AE85;
static const unsigned long kPhiloxSA = 0xD2511F53;
static const unsigned long kPhiloxSB = 0xCD9E8D57;
};
// Inverse of 2^32.
#define M_RAN_INVM32 2.3283064e-10f
__device__ __inline__ float4 uniform4(uint4 x) {
return make_float4(x.x * M_RAN_INVM32, x.y * M_RAN_INVM32, x.z * M_RAN_INVM32,x.w * M_RAN_INVM32);

}
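philox.h above provides the raw counter-based generator; the fused kernels (not reproduced in full here) consume its uint4 output through uniform4(). Below is a minimal, hypothetical usage sketch, assuming philox.h is on the include path; the kernel name, keep_prob, and the four-elements-per-thread layout are illustrative assumptions, not code from this commit.

#include <cstdint>
#include "philox.h"  // the header added above

__global__ void philox_dropout_sketch(float* data, uint8_t* mask, int n_quads,
                                      float keep_prob,
                                      unsigned long long seed,
                                      unsigned long long offset) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= n_quads) return;

  // The same (seed, subsequence, offset) triple always reproduces the same
  // random stream, so forward only needs to store the resulting byte mask.
  Philox rng(seed, tid, offset);
  float4 r = uniform4(rng());
  float r4[4] = {r.x, r.y, r.z, r.w};

  int base = 4 * tid;
  for (int k = 0; k < 4; ++k) {
    uint8_t keep = r4[k] < keep_prob ? 1 : 0;
    mask[base + k] = keep;
    data[base + k] = keep ? data[base + k] / keep_prob : 0.f;
  }
}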
@@ -24,7 +24,9 @@ std::vector<torch::Tensor> bwd_cuda(
torch::Tensor const& output_grads,
torch::Tensor const& matmul2_results,
torch::Tensor const& dropout_results,
torch::Tensor const& softmax_results,
// torch::Tensor const& softmax_results,
torch::Tensor const& bmm1_results,
torch::Tensor const& pad_mask,
torch::Tensor const& input_lin_results,
torch::Tensor const& inputs,
torch::Tensor const& input_weights,
@@ -60,6 +62,7 @@ std::vector<torch::Tensor> fwd(
AT_ASSERTM(inputs.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
AT_ASSERTM(input_weights.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
AT_ASSERTM(output_weights.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
AT_ASSERTM(use_mask , "no mask is not supported");

if (use_mask) {
AT_ASSERTM(pad_mask.dim() == 2, "expected 2D tensor");
@@ -85,7 +88,8 @@ std::vector<torch::Tensor> bwd(
torch::Tensor const& output_grads,
torch::Tensor const& matmul2_results,
torch::Tensor const& dropout_results,
torch::Tensor const& softmax_results,
torch::Tensor const& bmm1_results,
torch::Tensor const& pad_mask,
torch::Tensor const& input_lin_results,
torch::Tensor const& inputs,
torch::Tensor const& input_weights,
@@ -97,7 +101,6 @@ std::vector<torch::Tensor> bwd(
AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor");
AT_ASSERTM(matmul2_results.dim() == 3, "expected 3D tensor");
AT_ASSERTM(dropout_results.dim() == 3, "expected 3D tensor");
AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor");
AT_ASSERTM(input_lin_results.dim() == 3, "expected 3D tensor");
AT_ASSERTM(inputs.dim() == 3, "expected 3D tensor");
AT_ASSERTM(input_weights.dim() == 2, "expected 2D tensor");
@@ -107,7 +110,6 @@ std::vector<torch::Tensor> bwd(
AT_ASSERTM(output_grads.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
AT_ASSERTM(matmul2_results.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
AT_ASSERTM(dropout_results.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
AT_ASSERTM(softmax_results.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
AT_ASSERTM(input_lin_results.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
AT_ASSERTM(inputs.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
AT_ASSERTM(input_weights.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
@@ -119,7 +121,8 @@ std::vector<torch::Tensor> bwd(
output_grads,
matmul2_results,
dropout_results,
softmax_results,
bmm1_results,
pad_mask,
input_lin_results,
inputs,
input_weights,
@@ -63,7 +63,7 @@ std::vector<torch::Tensor> fwd_cuda(
auto mask_options = act_options.dtype(torch::kUInt8);

torch::Tensor input_lin_results = torch::empty({q_seq_len, sequences, output_lin_dim}, act_options);
torch::Tensor softmax_results = torch::empty({attn_batches, q_seq_len, k_seq_len}, act_options);
torch::Tensor bmm1_results = torch::empty({attn_batches, q_seq_len, k_seq_len}, act_options);
torch::Tensor dropout_results = torch::empty({attn_batches, q_seq_len, k_seq_len}, act_options);
torch::Tensor dropout_mask = torch::empty({attn_batches, q_seq_len, k_seq_len}, mask_options);
torch::Tensor matmul2_results = torch::empty({q_seq_len, attn_batches, head_dim}, act_options);
@@ -75,7 +75,8 @@ std::vector<torch::Tensor> fwd_cuda(
void* v_lin_results_ptr = static_cast<void*>(static_cast<half*>(input_lin_results.data_ptr()) + 2*head_dim);

// Softmax Intermediate Result Ptr (used by Matmul1 -> Softmax)
void* softmax_results_ptr = static_cast<void*>(softmax_results.data_ptr());
void* bmm1_results_ptr = static_cast<void*>(bmm1_results.data_ptr());
void* dropout_results_ptr = static_cast<void*>(dropout_results.data_ptr());

char a_layout_t{'t'};
char a_layout_n{'n'};
@@ -119,38 +120,36 @@ std::vector<torch::Tensor> fwd_cuda(
lead_dim,
batch_stride,
beta_zero,
static_cast<half*>(softmax_results_ptr),
static_cast<half*>(bmm1_results_ptr),
k_seq_len,
k_seq_len*q_seq_len,
attn_batches);
// Padded Softmax
bool softmax_success = false;
if (pad_mask == nullptr) {
softmax_success = dispatch_softmax<half, half, float>(
reinterpret_cast<half*>(softmax_results_ptr),
reinterpret_cast<const half*>(softmax_results_ptr),
k_seq_len,
k_seq_len,
attn_batches*q_seq_len);
if (is_training) {
softmax_success = dispatch_additive_masked_softmax_dropout<half, half, float>(
reinterpret_cast<half*>(dropout_results_ptr),
(is_training) ? reinterpret_cast<uint8_t*>(dropout_mask.data_ptr<uint8_t>()) : nullptr,
reinterpret_cast<const half*>(bmm1_results_ptr),
pad_mask,
attn_batches*q_seq_len*q_seq_len,
k_seq_len,
k_seq_len,
attn_batches*q_seq_len,
attn_batches*q_seq_len/sequences,
1.0f-dropout_prob,
stream);
} else {
softmax_success = dispatch_additive_masked_softmax<half, half, float>(
reinterpret_cast<half*>(softmax_results_ptr),
reinterpret_cast<const half*>(softmax_results_ptr),
reinterpret_cast<half*>(dropout_results_ptr),//this is actually softmax results, but making it consistent for the next function
reinterpret_cast<const half*>(bmm1_results_ptr),
pad_mask,
k_seq_len,
k_seq_len,
attn_batches*q_seq_len,
attn_batches*q_seq_len/sequences);
}


if (is_training) {
//use at:: function so that C++ version generates the same random mask as python version
auto dropout_tuple = at::_fused_dropout(softmax_results, 1.0f-dropout_prob);
dropout_results = std::get<0>(dropout_tuple);
dropout_mask = std::get<1>(dropout_tuple);
}

// Matmul2
gemm_switch_fp32accum( state,
a_layout_n,
Expand All @@ -162,7 +161,7 @@ std::vector<torch::Tensor> fwd_cuda(
static_cast<const half*>(v_lin_results_ptr),
lead_dim,
batch_stride,
(is_training) ? static_cast<const half*>(dropout_results.data_ptr()) : static_cast<const half*>(softmax_results.data_ptr()) ,
static_cast<const half*>(dropout_results.data_ptr()),
k_seq_len,
k_seq_len*q_seq_len,
beta_zero,
@@ -199,7 +198,7 @@ std::vector<torch::Tensor> fwd_cuda(

return {
input_lin_results,
softmax_results,
bmm1_results,
dropout_results,
dropout_mask,
matmul2_results,
@@ -212,7 +211,8 @@ std::vector<torch::Tensor> bwd_cuda(
torch::Tensor const& output_grads,
torch::Tensor const& matmul2_results,
torch::Tensor const& dropout_results,
torch::Tensor const& softmax_results,
torch::Tensor const& bmm1_results,
torch::Tensor const& pad_mask,
torch::Tensor const& input_lin_results,
torch::Tensor const& inputs,
torch::Tensor const& input_weights,
@@ -350,15 +350,18 @@ std::vector<torch::Tensor> bwd_cuda(

// Apply Dropout Mask and Scale by Dropout Probability
// Softmax Grad
dispatch_masked_scale_softmax_backward<half, half, float,false>(
dispatch_masked_scale_softmax_backward_recompute<half, half, float, false>(
static_cast<half*>(matmul2_grads.data_ptr()),
static_cast<half*>(matmul2_grads.data_ptr()),
reinterpret_cast<half const*>(softmax_results.data_ptr()),
static_cast<half* const>(matmul2_grads.data_ptr()),
reinterpret_cast<half const*>(bmm1_results.data_ptr()),
reinterpret_cast<half const*>(pad_mask.data_ptr()),
static_cast<uint8_t const*>(dropout_mask.data_ptr()),
1.0/(1.0-dropout_prob),
k_seq_len,
k_seq_len,
attn_batches*q_seq_len);
attn_batches*q_seq_len/sequences,
attn_batches*q_seq_len,
stream);

// Matmul1 Dgrad1
gemm_switch_fp32accum( state,
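In this file the backward no longer receives the stored softmax output at all: it is given bmm1_results and pad_mask and calls the *_recompute dispatch, so the softmax is rebuilt from the pre-softmax scores inside the backward kernel while the incoming gradient is simultaneously un-dropped and rescaled. Below is a host-side reference sketch of that math for a single attention row; the function name and signature are illustrative assumptions, not the kernel's API.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// dx = s * (g - dot(g, s)), where s = softmax(bmm1 + pad_mask) is recomputed here
// and g = dy * dropout_mask / (1 - dropout_prob) undoes the dropout scaling.
void masked_scale_softmax_backward_recompute_ref(
    float* dx, const float* dy, const float* bmm1, const float* pad_mask,
    const uint8_t* dropout_mask, float dropout_prob, int cols) {
  const float rp = 1.0f / (1.0f - dropout_prob);

  // Recompute the forward softmax from the saved pre-softmax scores.
  std::vector<float> s(cols);
  float mx = bmm1[0] + pad_mask[0], sum = 0.0f;
  for (int j = 1; j < cols; ++j) mx = std::max(mx, bmm1[j] + pad_mask[j]);
  for (int j = 0; j < cols; ++j) { s[j] = std::exp(bmm1[j] + pad_mask[j] - mx); sum += s[j]; }
  for (int j = 0; j < cols; ++j) s[j] /= sum;

  // Apply the saved dropout mask to the incoming gradient, then softmax backward.
  float dot = 0.0f;
  std::vector<float> g(cols);
  for (int j = 0; j < cols; ++j) { g[j] = dy[j] * dropout_mask[j] * rp; dot += g[j] * s[j]; }
  for (int j = 0; j < cols; ++j) dx[j] = s[j] * (g[j] - dot);
}

The payoff is that softmax_results never has to be kept alive between forward and backward, which is why the interface hunks earlier drop that tensor and pass bmm1_results and pad_mask instead.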
@@ -361,15 +361,15 @@ std::vector<torch::Tensor> bwd_cuda(

// Apply Dropout Mask and Scale by Dropout Probability
// Softmax Grad
dispatch_masked_scale_softmax_backward<half, half, float,false>(
dispatch_masked_scale_softmax_backward_stream<half, half, float,false>(
static_cast<half*>(matmul2_grads.data_ptr()),
static_cast<half*>(matmul2_grads.data_ptr()),
reinterpret_cast<half const*>(softmax_results.data_ptr()),
static_cast<uint8_t const*>(dropout_mask.data_ptr()),
1.0/(1.0-dropout_prob),
k_seq_len,
k_seq_len,
attn_batches*q_seq_len);
attn_batches*q_seq_len, stream);

// Matmul1 Dgrad1
gemm_switch_fp32accum( state,
(The remaining changed files in this commit were not loaded in this view.)