
Commit c4b7311

drisspg authored and pytorchmergebot committed
Meff Attn Bias (pytorch#104310)
# Summary

### Review Points
- Automatically pad tensors to create aligned masks when seqlen_kv is not a multiple of 16. This causes a memory spike of roughly 2 * attn_mask size, which could in theory be large. It appears, though, that padding + mem_eff is faster than no_pad + math, so it seems worth it.
- Use expand to view the attn_mask in 4d. This is a little different from how we enforce q, k, v to be viewed in 4d prior to calling. The (b * n_heads, seq_len_q, seq_len_kv) case is also not supported.
- Should enable pytorch#96099.

### Profiling
I ran a number of comparisons between sdpa.MATH and sdp.MemEffAttention, adding an attn_bias of shape (1, 1, seqlen_q, seqlen_k). For these experiments seqlen_q == seqlen_k. These were all run on an A100 80GB GPU.

Configs:

```
# Run a bunch of experiments
batch_sizes = [8, 16, 32]
num_heads = [16, 32]
max_seq_lens = [15, 64, 128, 512, 555, 1024]
embed_dims = [32, 64, 128]
dtypes = [torch.float16, torch.bfloat16, torch.float32]
pad_percentages = [None]
backends = [SDPBackend.EFFICIENT_ATTENTION, SDPBackend.MATH]
run_backward = True
attn_mask = True
```

The benchmarked function calls `sdpa(input**).sum().backward()`.

I calculated the geomean speedup of the efficient attention path over the math path for all these configs: `Geomean Speedup: 1.977`

An example comparison with batch_size = 8, num_heads = 32, embed_dim = 64, and dtype = torch.float16:

![attn_mask_compare_bsz_8_num_heads_32_embed_dim_64_dtype_fp16](https://github.com/pytorch/pytorch/assets/32754868/0d75bffe-350b-43f2-a37f-514f9158dcff)

This was done using the current state of the branch, where we force alignment of the mask when the last dim is not divisible by 16, which shows up in the seq_len = 15 and 555 cases.

The full data can be found here: [attn_mask_sweep.csv](https://github.com/pytorch/pytorch/files/11962399/attn_mask_sweep.csv)

Pull Request resolved: pytorch#104310
Approved by: https://github.com/cpuhrsch
1 parent 45322fa commit c4b7311
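As a usage sketch (not part of this commit, shapes and device are illustrative assumptions): after this change an explicit `attn_mask` passed to `scaled_dot_product_attention` can be served by the memory-efficient kernel instead of always falling back to the math path. The `torch.backends.cuda.sdp_kernel` context manager is the backend-selection API available around this release.

```python
import torch
import torch.nn.functional as F

# Shapes picked to match one of the profiled configs; 555 is deliberately not a
# multiple of 16 so the new alignment-padding path is exercised internally.
B, n_heads, seq_len, head_dim = 8, 32, 555, 64
q = torch.randn(B, n_heads, seq_len, head_dim, device="cuda", dtype=torch.float16)
k, v = torch.randn_like(q), torch.randn_like(q)
# Additive bias broadcastable to (B, n_heads, seq_len_q, seq_len_kv).
attn_bias = torch.randn(1, 1, seq_len, seq_len, device="cuda", dtype=torch.float16)

# Restrict dispatch to the memory-efficient backend so the new path is taken.
with torch.backends.cuda.sdp_kernel(
    enable_flash=False, enable_math=False, enable_mem_efficient=True
):
    out = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_bias)

print(out.shape)  # torch.Size([8, 32, 555, 64])
```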

13 files changed: +321, -105 lines changed


aten/src/ATen/native/native_functions.yaml

+3 -3

```diff
@@ -14179,13 +14179,13 @@
   dispatch:
     CUDA: _scaled_dot_product_flash_attention_backward_cuda
 
-- func: _scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, bool compute_log_sumexp, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> (Tensor output, Tensor log_sumexp, Tensor philox_seed, Tensor philox_offset)
+- func: _scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, bool compute_log_sumexp, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> (Tensor output, Tensor log_sumexp, Tensor philox_seed, Tensor philox_offset)
   dispatch:
     CUDA: _scaled_dot_product_efficient_attention_cuda
     NestedTensorCUDA: _scaled_dot_product_efficient_attention_nestedtensor_cuda
   tags: nondeterministic_seeded
 
-- func: _scaled_dot_product_efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, float dropout_p, bool is_causal=False, *, float? scale=None) -> (Tensor, Tensor, Tensor)
+- func: _scaled_dot_product_efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor attn_bias, Tensor out, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, float dropout_p, bool[4] grad_input_mask, bool is_causal=False, *, float? scale=None) -> (Tensor, Tensor, Tensor, Tensor)
   device_check: NoCheck
   dispatch:
     CUDA: _scaled_dot_product_efficient_attention_backward_cuda
@@ -14210,7 +14210,7 @@
     CUDA: _efficient_attention_forward
   tags: nondeterministic_seeded
 
-- func: _efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor out, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int max_seqlen_k, int max_seqlen_q, Tensor logsumexp, float dropout_p, Tensor philox_seed, Tensor philox_offset, int custom_mask_type, *, float? scale=None, int? num_splits_key=None) -> (Tensor, Tensor, Tensor, Tensor)
+- func: _efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor out, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int max_seqlen_k, int max_seqlen_q, Tensor logsumexp, float dropout_p, Tensor philox_seed, Tensor philox_offset, int custom_mask_type, bool bias_requires_grad, *, float? scale=None, int? num_splits_key=None) -> (Tensor, Tensor, Tensor, Tensor)
   device_check: NoCheck
   variants: function
   dispatch:
```
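For reference, a minimal sketch of how the updated private op could be exercised directly through `torch.ops.aten` (normally you would go through `F.scaled_dot_product_attention`; the private op's calling convention may change, and the shapes below are assumptions for illustration). Per the checks added later in this commit, the bias must already be a 4-d tensor matching `(B, n_heads, seq_len_q, seq_len_kv)`:

```python
import torch

q = torch.randn(2, 4, 32, 64, device="cuda", dtype=torch.float16)
k, v = torch.randn_like(q), torch.randn_like(q)
# Already expanded to (B, n_heads, seq_len_q, seq_len_kv); 32 is a multiple of
# 16, so no alignment padding is needed here.
bias = torch.randn(2, 4, 32, 32, device="cuda", dtype=torch.float16)

# Returns the four outputs named in the schema above.
out, log_sumexp, philox_seed, philox_offset = (
    torch.ops.aten._scaled_dot_product_efficient_attention(
        q, k, v, bias, compute_log_sumexp=True, dropout_p=0.0, is_causal=False
    )
)
print(out.shape)  # torch.Size([2, 4, 32, 64])
```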

aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp

+1

```diff
@@ -743,6 +743,7 @@ _scaled_dot_product_efficient_attention_nestedtensor_cuda(
     const Tensor& query,
     const Tensor& key,
     const Tensor& value,
+    const c10::optional<at::Tensor>& attn_bias,
     bool compute_log_sumexp,
     double dropout_p,
     bool is_causal,
```

aten/src/ATen/native/transformers/attention.cpp

+66 -16

```diff
@@ -1,4 +1,5 @@
 #include <type_traits>
+#include <limits>
 #include <c10/core/DeviceType.h>
 #include <ATen/ATen.h>
 #include <ATen/AccumulateType.h>
@@ -10,13 +11,14 @@
 #include <ATen/cpu/vec/vec256/vec256.h>
 #include <ATen/native/transformers/attention.h>
 #include <ATen/native/transformers/sdp_utils_cpp.h>
-#include <type_traits>
 #include <utility>
+#include <c10/util/typeid.h>
 #include <c10/core/SymIntArrayRef.h>
 #include <c10/util/Logging.h>
 #include <c10/util/Exception.h>
 #include <c10/core/DispatchKey.h>
 #include <c10/core/DispatchKeySet.h>
+#include <ATen/TensorSubclassLikeUtils.h>
 
 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/NativeFunctions.h>
@@ -509,6 +511,7 @@ int64_t _fused_sdp_choice_meta(
   }
   return static_cast<int64_t>(sdp::SDPBackend::math);
 }
+namespace {
 
 inline void validate_sdpa_input(
     const Tensor& query_,
@@ -535,9 +538,53 @@ inline void validate_sdpa_input(
     TORCH_CHECK(mask_dtype == at::kBool || mask_dtype == query_.dtype(),
         "Expected attn_mask dtype to be bool or to match query dtype, but got attn_mask.dtype: ",
         mask_dtype, " and query.dtype: ", query_.dtype(), " instead.");
+    TORCH_CHECK(
+        !query_.is_nested() && !key.is_nested(),
+        "Scaled_dot_product_attention: Nested tensors for query / key are not supported "
+        "when an explicit attn_mask is set");
   }
   return;
 }
+// This function is used to produce an attn_mask
+// in a standard format that can be consumed by both
+// the math and memory efficient attn_mask implementation
+// Args:
+//    attn_mask: attn_mask of shape (B, L, S) or (L, S) or (B, N_heads, L, S)
+c10::optional<Tensor> convert_boolean_attn_mask(const c10::optional<Tensor>& attn_mask, caffe2::TypeMeta dtype) {
+  // Pass through
+  if(!attn_mask.has_value()){
+    return c10::nullopt;
+  }
+  // Convert boolean mask to additive mask; need to invert mask to indicate what
+  // to mask *out*.
+  if (attn_mask->dtype() == at::kBool) {
+    auto new_attn_mask = at::zeros_like(attn_mask.value(), dtype);
+    // TODO Use the max type of the input and output
+    new_attn_mask.masked_fill_(
+        attn_mask->logical_not(), -std::numeric_limits<double>::infinity());
+    return new_attn_mask;
+  }
+  // Otherwise, attn_mask represents an additive attention tensor
+  return attn_mask;
+}
+// Memory Efficient Attention requires a padded attn mask bias
+// This function pads the attn_mask bias to be a multiple of 16
+// Then slices the padded bias to the original size
+// We apply this function to the top level SDPA so that
+// if padding is done it will be tracked for backward automatically
+at::Tensor pad_bias(const at::Tensor& attn_bias) {
+  int align_to = 16;
+  auto last_dim_size = attn_bias.sym_size(-1);
+  if (last_dim_size % align_to == 0) {
+    return attn_bias;
+  }
+  auto pad_count = align_to - (last_dim_size % align_to);
+  auto padded_bias = at::pad_symint(attn_bias, {c10::SymInt(0), pad_count});
+  return padded_bias.slice_symint(-1, 0, last_dim_size);
+}
+
+} // namespace
+
 // Computes scaled dot product attention on query, key and value tensors, using
 // an optional attention mask if passed, and applying dropout if a probability
 // greater than 0.0 is specified.
@@ -581,6 +628,7 @@ Tensor scaled_dot_product_attention(
         query_, key, value, attn_mask_, dropout_p, is_causal, scale);
   }
   sdp::SDPBackend backend = static_cast<sdp::SDPBackend>(choice_int);
+  c10::optional<Tensor> attn_mask = convert_boolean_attn_mask(attn_mask_, query_.dtype());
   switch (backend) {
     case sdp::SDPBackend::flash_attention: {
       auto out_lse_softmax = at::_scaled_dot_product_flash_attention(
@@ -591,16 +639,25 @@
       bool compute_logsumexp =
           (query_.requires_grad() || key.requires_grad() ||
            value.requires_grad());
+      if (attn_mask.has_value()) {
+        // Expand to 4d case
+        attn_mask = attn_mask.value().expand_symint(
+            {query_.sym_size(0),
+             query_.sym_size(1),
+             query_.sym_size(2),
+             key.sym_size(2)});
+        attn_mask = pad_bias(attn_mask.value());
+      }
       auto out_and_lse = at::_scaled_dot_product_efficient_attention(
-          query_, key, value, compute_logsumexp, dropout_p, is_causal, scale);
+          query_, key, value, attn_mask, compute_logsumexp, dropout_p, is_causal, scale);
       return std::get<0>(out_and_lse);
     }
     case sdp::SDPBackend::math:
       return std::get<0>(at::_scaled_dot_product_attention_math(
           query_,
           key,
           value,
-          attn_mask_,
+          attn_mask,
           dropout_p,
           is_causal,
           c10::nullopt, /*dropout_mask*/
@@ -639,22 +696,15 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_attention_math(
     // Replace attn_mask with causal mask; lower triangular elements take part in attention.
     const auto L = query.sym_size(-2), S = key.sym_size(-2);
     attn_mask = at::ones_symint({L, S}, query.options().dtype(at::kBool)).tril();
+    attn_mask = convert_boolean_attn_mask(attn_mask, query.dtype());
   }
+  auto attn = at::matmul(query, key.transpose(-2, -1) * scaling_factor);
   if (attn_mask.has_value()) {
-    TORCH_CHECK(!query.is_nested() && !key.is_nested(),
-        "_scaled_dot_product_attention: Nested tensors for query / key are not supported "
-        "when an explicit attn_mask is set");
-    // Convert boolean mask to additive mask; need to invert mask to indicate what to mask *out*.
-    if (attn_mask->dtype() == at::kBool){
-      auto new_attn_mask = at::zeros_like(*attn_mask, query.dtype());
-      new_attn_mask.masked_fill_(attn_mask->logical_not(), -std::numeric_limits<double>::infinity());
-      attn_mask = new_attn_mask;
-    }
-    // Otherwise, attn_mask represents an additive attention tensor
-  }
-  auto attn = at::matmul(query, key.transpose(-2, -1)*scaling_factor);
-  if (attn_mask.has_value()) {
+    if (at::areAnyTensorSubclassLike({attn, *attn_mask})) {
+      attn = attn.add(*attn_mask);
+    } else {
       attn.add_(*attn_mask);
+    }
   }
   attn = at::softmax(attn, -1);
   if (dropout_p > 0.0) {
```
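The two new helpers introduced above are small; a rough Python rendering of the same logic, for intuition only (names mirror the C++; this is not the code that ships):

```python
import torch

ALIGN_TO = 16

def convert_boolean_attn_mask(attn_mask, dtype):
    # Pass through when no mask is given.
    if attn_mask is None:
        return None
    # Boolean masks become additive: positions to mask *out* get -inf.
    if attn_mask.dtype == torch.bool:
        new_mask = torch.zeros_like(attn_mask, dtype=dtype)
        return new_mask.masked_fill_(attn_mask.logical_not(), float("-inf"))
    # Otherwise the mask is already an additive bias.
    return attn_mask

def pad_bias(attn_bias):
    # Pad the last dim up to a multiple of 16, then slice back to the original
    # length: the values are unchanged, but the underlying storage (and hence
    # the last-dim stride) is aligned for the memory-efficient kernel.
    last = attn_bias.size(-1)
    if last % ALIGN_TO == 0:
        return attn_bias
    pad = ALIGN_TO - (last % ALIGN_TO)
    return torch.nn.functional.pad(attn_bias, (0, pad))[..., :last]
```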

aten/src/ATen/native/transformers/cuda/attention.cu

+15 -5

```diff
@@ -524,7 +524,11 @@ std::tuple<Tensor, Tensor> native_multi_head_attention_cuda(
   // strides from packed projection for nested tensors when seq_len is 1 will be
   // and will trigger a contiguous call in the kernel, so we prevent this
   bool no_seq_len_1_nested = query.is_nested() ? check_for_seq_len_1_nested_tensor(kernel_params, false) : true;
-  if (no_seq_len_1_nested &&
+  // The API for transfomer_encoder is a mask of shape (Batch_Size, Seq_len_q)
+  // For mem-eff attention this will cause the expand call to error
+  // For now I am going to turn of that path not have to deal with all the annoying
+  // Mask type shape grossness
+  if (!mask.has_value() && no_seq_len_1_nested &&
       (backend == sdp::SDPBackend::flash_attention || backend == sdp::SDPBackend::efficient_attention)) {
     auto x = at::linear(query, qkv_weight, qkv_bias);
     auto chunks = x.chunk(3, -1);
@@ -536,7 +540,6 @@
                     .transpose(1, 2);
     chunks[2] = (chunks[2].view({x_size_0, -1, num_head, dim_per_head}))
                     .transpose(1, 2);
-
     auto y = at::scaled_dot_product_attention(
         chunks[0], chunks[1], chunks[2], mask, 0.0, false, c10::nullopt);
 
@@ -712,6 +715,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> _scaled_dot_product_efficient_attenti
     const Tensor& query,
     const Tensor& key,
     const Tensor& value,
+    const c10::optional<at::Tensor>& attn_bias,
     bool compute_log_sumexp,
     double dropout_p,
     bool is_causal,
@@ -733,7 +737,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> _scaled_dot_product_efficient_attenti
       q_t,
       k_t,
       v_t,
-      c10::nullopt,
+      attn_bias,
       c10::nullopt,
       c10::nullopt,
       c10::nullopt,
@@ -1045,8 +1049,14 @@ std::tuple<at::Tensor, at::Tensor, Tensor, Tensor> _efficient_attention_forward(
 
     // assign strides for bias, viewed as
     // (batch_sz, n_heads, n_queries, n_keys)
-    const at::Tensor bias_4d_view =
-        get_bias_4d_view(*bias, B, num_heads, M, N);
+    // We make sure to expand prior to calling the kernel
+    const at::Tensor& bias_4d_view = *bias;
+    TORCH_CHECK(bias_4d_view.dim()==4);
+    TORCH_CHECK(bias_4d_view.size(0)==B);
+    TORCH_CHECK(bias_4d_view.size(1)==num_heads);
+    TORCH_CHECK(bias_4d_view.size(2)==M);
+    TORCH_CHECK(bias_4d_view.size(3)==N);
+
     ASSIGN_CHECK_OVERFLOW(p.bias_strideB, bias_4d_view.stride(0));
     ASSIGN_CHECK_OVERFLOW(p.bias_strideH, bias_4d_view.stride(1));
     ASSIGN_CHECK_OVERFLOW(p.bias_strideM, bias_4d_view.stride(2));
```
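Since `get_bias_4d_view` is dropped here, the kernel now assumes the bias arrives already expanded to `(B, n_heads, M, N)`, with broadcasting expressed purely through strides. A quick illustration of what the composite op does before dispatch (a sketch of the effect, not the C++ itself):

```python
import torch

B, n_heads, L, S = 8, 32, 128, 128
# A bias that broadcasts over batch and heads.
bias = torch.randn(1, 1, L, S)

# expand produces a 4-d view without copying; the broadcast dims get stride 0,
# which is why the kernel can read bias_strideB / bias_strideH straight off the
# view instead of re-deriving a 4-d shape itself.
bias_4d = bias.expand(B, n_heads, L, S)
print(bias_4d.shape)     # torch.Size([8, 32, 128, 128])
print(bias_4d.stride())  # (0, 0, 128, 1) -- no memory duplicated
```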

aten/src/ATen/native/transformers/cuda/attention_backward.cu

+31 -15

```diff
@@ -115,6 +115,7 @@ _efficient_attention_backward(
     const at::Tensor& philox_seed, // seed using for generating random numbers for dropout
     const at::Tensor& philox_offset, // offset into random number sequence
     int64_t custom_mask_type,
+    const bool bias_requires_grad,
     const c10::optional<double> scale,
     c10::optional <int64_t> num_splits_key) {
 #if defined(USE_FLASH_ATTENTION)
@@ -187,8 +188,6 @@
   int64_t K = query.size(3);
   int64_t Kv = value.size(3);
 
-  const bool bias_requires_grad = bias.has_value() && bias->requires_grad();
-
   at::Tensor grad_q, grad_k, grad_v, grad_bias;
   grad_q = at::empty(query.sizes(), query.options());
   grad_k = at::empty(key.sizes(), key.options());
@@ -344,7 +343,13 @@
 
     // assign strides for bias, viewed as:
     // (batch_sz, n_heads, n_queries, n_keys)
-    const at::Tensor bias_4d_view = get_bias_4d_view(*bias, B, nH, M, N);
+    // We make sure to expand prior to calling the kernel
+    const at::Tensor& bias_4d_view = *bias;
+    TORCH_CHECK(bias_4d_view.dim()==4);
+    TORCH_CHECK(bias_4d_view.size(0)==B);
+    TORCH_CHECK(bias_4d_view.size(1)==nH);
+    TORCH_CHECK(bias_4d_view.size(2)==M);
+    TORCH_CHECK(bias_4d_view.size(3)==N);
     ASSIGN_CHECK_OVERFLOW(p.bias_strideB, bias_4d_view.stride(0));
     ASSIGN_CHECK_OVERFLOW(p.bias_strideH, bias_4d_view.stride(1));
     ASSIGN_CHECK_OVERFLOW(p.bias_strideM, bias_4d_view.stride(2));
@@ -359,8 +364,14 @@
     // different values of Q will point to the same memory
     // locations, meaning bias.stride(1) == 0, while we'd want
     // grad_bias.stride(1) == nK
-    const at::Tensor grad_bias_4d_view =
-        get_bias_4d_view(grad_bias, B, nH, M, N);
+    // We have expanded the input prior to calling the forward kernel
+    const at::Tensor& grad_bias_4d_view = grad_bias;
+    TORCH_CHECK(grad_bias_4d_view.dim()==4);
+    TORCH_CHECK(grad_bias_4d_view.size(0)==B);
+    TORCH_CHECK(grad_bias_4d_view.size(1)==nH);
+    TORCH_CHECK(grad_bias_4d_view.size(2)==M);
+    TORCH_CHECK(grad_bias_4d_view.size(3)==N);
+
     ASSIGN_CHECK_OVERFLOW(p.gB_strideB, grad_bias_4d_view.stride(0));
     ASSIGN_CHECK_OVERFLOW(p.gB_strideH, grad_bias_4d_view.stride(1));
     ASSIGN_CHECK_OVERFLOW(p.gB_strideM, grad_bias_4d_view.stride(2));
@@ -531,20 +542,23 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> _scaled_dot_product_flash_attenti
 }
 
 
-std::tuple<at::Tensor, at::Tensor, at::Tensor> _scaled_dot_product_efficient_attention_backward_cuda(
+std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor> _scaled_dot_product_efficient_attention_backward_cuda(
     const at::Tensor& grad_out_,
     const at::Tensor& query,
     const at::Tensor& key,
     const at::Tensor& value,
+    const at::Tensor& attn_bias,
     const at::Tensor& out,
     const at::Tensor& logsumexp,
     const at::Tensor& philox_seed,
    const at::Tensor& philox_offset,
     double dropout_p,
+    std::array<bool, 4> grad_input_mask,
     bool causal,
-    c10::optional<double> scale){
+    c10::optional<double> scale) {
+
   if (!grad_out_.defined()) {
-    return std::make_tuple(Tensor{}, Tensor{}, Tensor{});
+    return std::make_tuple(Tensor{}, Tensor{}, Tensor{}, Tensor{});
   }
   auto grad_out = grad_out_.transpose(1, 2);
   auto out_t = out.transpose(1, 2);
@@ -554,10 +568,12 @@
 
   Tensor grad_q, grad_k, grad_v, grad_bias;
 
-  // TODO_DRISS
-  // These are place holders unitl we add support for bias
-  auto bias = c10::nullopt;
-
+  // This is needed because SaveVarible automatically converts
+  // c10::optional to undefined tensor
+  c10::optional<Tensor> kernel_bias;
+  if (attn_bias.defined()) {
+    kernel_bias = attn_bias;
+  }
   // Will add with signauter changes for dropout and bias
   // We are only handiling Dense inputs, but this should be passed
   // from forward to backward
@@ -567,14 +583,13 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> _scaled_dot_product_efficient_att
   sdp::CustomMaskType custom_mask_type = causal
       ? sdp::CustomMaskType::CausalFromTopLeft
      : sdp::CustomMaskType::NoCustomMask;
-
   std::tie(grad_q, grad_k, grad_v, grad_bias) =
       at::_efficient_attention_backward(
           grad_out,
           q_t,
          k_t,
          v_t,
-         bias,
+         kernel_bias,
          out_t,
          c10::nullopt,
          c10::nullopt,
@@ -585,10 +600,11 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> _scaled_dot_product_efficient_att
          philox_seed,
          philox_offset,
          static_cast<int64_t>(custom_mask_type),
+         grad_input_mask[3],
          scale,
          c10::nullopt); // num_split_keys
   return std::make_tuple(
-      grad_q.transpose(1, 2), grad_k.transpose(1, 2), grad_v.transpose(1, 2));
+      grad_q.transpose(1, 2), grad_k.transpose(1, 2), grad_v.transpose(1, 2), grad_bias);
 }
 
 } // namespace native
```
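With `grad_input_mask[3]` plumbed through to `bias_requires_grad`, the backward now returns a fourth gradient for the bias. A minimal end-to-end check (a sketch, assuming a CUDA device and that the memory-efficient backend gets selected; shapes are illustrative):

```python
import torch
import torch.nn.functional as F

q = torch.randn(4, 8, 64, 64, device="cuda", dtype=torch.float16, requires_grad=True)
k = torch.randn_like(q, requires_grad=True)
v = torch.randn_like(q, requires_grad=True)
bias = torch.randn(4, 8, 64, 64, device="cuda", dtype=torch.float16, requires_grad=True)

with torch.backends.cuda.sdp_kernel(
    enable_flash=False, enable_math=False, enable_mem_efficient=True
):
    out = F.scaled_dot_product_attention(q, k, v, attn_mask=bias)

out.sum().backward()
# The fused kernel now produces a gradient for the bias on this path, rather
# than relying on the math fallback.
print(bias.grad is not None)  # True
```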
