Implement LogCumSumExp #3640

Open
wants to merge 65 commits into base: develop
Commits
65 commits
af02031
impl cumulative reduction forward (without GTest and Driver)
long10024070 Jun 19, 2024
b290533
impl CumMax, CumMin
long10024070 Jun 28, 2024
702ffd5
impl cumulative reduction forward with GTest
long10024070 Jul 3, 2024
822b133
update doxygen
long10024070 Jul 7, 2024
c197d6d
added error check in problem_description
long10024070 Jul 7, 2024
895d0ba
fixed bug and added Driver
long10024070 Jul 9, 2024
ec0c441
added improvement over ROCM constraint and removed unused code
long10024070 Jul 10, 2024
15c27cf
removed unused lines of code
long10024070 Jul 10, 2024
d431f14
used dynamic Local_size
long10024070 Jul 11, 2024
6b4c5bd
update doxygen comments
long10024070 Jul 11, 2024
b4331ac
changed Tensor Length input format
long10024070 Jul 11, 2024
005af5b
updated test invoke condition
long10024070 Jul 12, 2024
409ca2d
reduce number of GTest testcases
long10024070 Jul 12, 2024
b7c9f2f
fixed for make analyze
long10024070 Jul 15, 2024
2c6e4b1
remove unused constraint
long10024070 Jul 15, 2024
b4050b4
removed GetWorkspaceSize
long10024070 Jul 15, 2024
e4dcc47
added working dim value check in ProblemDescription
long10024070 Jul 15, 2024
fa89c6d
Compile error fixed: packed MIOPEN_THROW message into a string
long10024070 Jul 15, 2024
bd9ba30
bug fixed: checking output or indices tensor is empty or unused in Is…
long10024070 Jul 15, 2024
ac39e2e
more detail in the api descriptor
long10024070 Jul 15, 2024
6dfe5cb
updated network config and added IsAllDimStride1 check
long10024070 Jul 15, 2024
cd62a72
removed excessive assert statement
long10024070 Jul 15, 2024
0e6159f
added more detail to NetworkConfig
long10024070 Jul 17, 2024
b70e17d
fixed: IsAllPacked return false instead of Throw as this is a filter …
long10024070 Jul 19, 2024
51b82cc
packed problem description checks
long10024070 Jul 19, 2024
c7ad580
removed excessive log
long10024070 Jul 19, 2024
27d66aa
removed excessive log
long10024070 Jul 19, 2024
49b83eb
Merge branch 'develop' into impl_cumulative_reduction_improvedOverROCM
long10024070 Jul 24, 2024
dfd5b31
debug: ndims from auto into int, and reduce number of testcase in GTest
long10024070 Jul 24, 2024
3cca721
Merge remote-tracking branch 'rocm/develop' into impl_cumulative_redu…
long10024070 Jul 24, 2024
7f53a94
format code
long10024070 Jul 24, 2024
634224b
add MIOPEN_INTERNALS_EXPORT
long10024070 Jul 24, 2024
93c57ec
undo unnecessary changes when merging with rocm/MIOpen
long10024070 Jul 24, 2024
6f591c3
undo unnecessary changes when merging with rocm/MIOpen
long10024070 Jul 24, 2024
2ac8e11
Merge remote-tracking branch 'rocm/develop' into impl_cumulative_redu…
long10024070 Aug 6, 2024
c9ebf28
Merge remote-tracking branch 'rocm/develop' into impl_cumulative_redu…
long10024070 Aug 6, 2024
68035ff
update GTest code structure
long10024070 Aug 6, 2024
4fb6c3c
using warpSize from context
long10024070 Aug 6, 2024
47610b6
Merge remote-tracking branch 'rocm/develop' into impl_cumulative_redu…
long10024070 Aug 26, 2024
e9f00f6
Merge branch 'develop' into impl_cumulative_reduction_improvedOverROCM
long10024070 Aug 26, 2024
af0f320
updated cumulative_reduction driver code
long10024070 Aug 27, 2024
215d13a
Merge remote-tracking branch 'rocm/impl_cumulative_reduction_improved…
long10024070 Aug 27, 2024
3b940c2
updated: use CRTP instead of virtual functions
long10024070 Aug 27, 2024
b866eaa
Added more comments
long10024070 Aug 30, 2024
c9bb46b
Impl LogCumSumExp Forward based on Cumulative Reduction forward
long10024070 Sep 30, 2024
2918536
impl Backward functionality with GTEST, no Driver
long10024070 Oct 1, 2024
b25b547
fuse 4 kernel calls in Backward and update functionality of LogCumSum…
long10024070 Oct 2, 2024
898baf7
add Backward Driver, handle non-contiguous cases
long10024070 Oct 4, 2024
4fcf4b3
More information to LogCmdLogCumSumExp for logging
long10024070 Oct 4, 2024
a54bcf9
fix make analyze
long10024070 Oct 4, 2024
87dfcc5
wrap header in 'logcumsumexp' namespace
long10024070 Oct 9, 2024
f786ff0
remove env.hpp include
long10024070 Oct 9, 2024
5118ecf
2nd internal review resolving
long10024070 Oct 9, 2024
8858a3f
move logcumsumexp/problem_description definition from .hpp to .cpp
long10024070 Oct 10, 2024
9dd9b1e
removed unused include
long10024070 Oct 10, 2024
126f28b
cast working datatype in kernel from auto, int, float to uint64_t and…
long10024070 Oct 10, 2024
5f6d881
Move Output tensor initialization to AllocateBuffersAndCopy
long10024070 Oct 10, 2024
cc26d61
permute dimension for better performance
long10024070 Oct 11, 2024
d985f56
more testing for non-contiguous cases
long10024070 Oct 11, 2024
ab5be29
resolve internal review comments
long10024070 Oct 11, 2024
e999023
resolve internal review comments
long10024070 Oct 11, 2024
b544d60
Merge branch 'develop' of https://github.com/ROCm/MIOpen into impl_Lo…
anhskrttt Mar 18, 2025
23636dc
refactor, remove not applicable test cases
anhskrttt Mar 18, 2025
9417ebf
Merge branch 'develop' into impl_logcumsumexp
anhskrttt Mar 19, 2025
00392a1
remove redundant tests, add handling for the difference of two infs
anhskrttt Mar 19, 2025
1 change: 1 addition & 0 deletions docs/reference/index.rst
@@ -39,3 +39,4 @@ The MIOpen API library is structured as follows:
* :doc:`ReLU <../doxygen/html/group___re_l_u>` (experimental)
* :doc:`Kthvalue <../doxygen/html/group__kthvalue>` (experimental)
* :doc:`GLU <../doxygen/html/group__glu>` (experimental)
* :doc:`LogCumSumExp <./group___log_cum_sum_exp>` (experimental)
1 change: 1 addition & 0 deletions driver/CMakeLists.txt
@@ -48,6 +48,7 @@ add_executable(MIOpenDriver
dm_groupnorm.cpp
dm_kthvalue.cpp
dm_layernorm.cpp
dm_logcumsumexp.cpp
dm_lrn.cpp
dm_multimarginloss.cpp
dm_pool.cpp
41 changes: 41 additions & 0 deletions driver/dm_logcumsumexp.cpp
@@ -0,0 +1,41 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/

#include "registry_driver_maker.hpp"
#include "logcumsumexp_driver.hpp"

static Driver* makeDriver(const std::string& base_arg)
{
if(base_arg == "logcumsumexp")
return new LogCumSumExpDriver<float, float>();
if(base_arg == "logcumsumexpfp16")
return new LogCumSumExpDriver<float16, float>();
if(base_arg == "logcumsumexpbfp16")
return new LogCumSumExpDriver<bfloat16, float>();
return nullptr;
}

REGISTER_DRIVER_MAKER(makeDriver);
3 changes: 2 additions & 1 deletion driver/driver.hpp
@@ -314,7 +314,7 @@ inline void PadBufferSize(size_t& sz, int datatype_sz)
"adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, "
"getitem[bfp16|fp16], reducecalculation[bfp16|fp16], rope[bfp16|fp16], "
"prelu[bfp16|fp16], kthvalue[bfp16|fp16], glu[bfp16|fp16], softmarginloss[bfp16|fp16], "
"multimarginloss[bfp16|fp16]\n");
"multimarginloss[bfp16|fp16], logcumsumexp[bfp16|fp16]\n");
exit(0); // NOLINT (concurrency-mt-unsafe)
}

@@ -352,6 +352,7 @@ inline std::string ParseBaseArg(int argc, char* argv[])
arg != "kthvaluebfp16" && arg != "glu" && arg != "glufp16" && arg != "glubfp16" &&
arg != "softmarginloss" && arg != "softmarginlossfp16" && arg != "softmarginlossbfp16" &&
arg != "multimarginloss" && arg != "multimarginlossfp16" && arg != "multimarginlossbfp16" &&
arg != "logcumsumexp" && arg != "logcumsumexpfp16" && arg != "logcumsumexpbfp16" &&
arg != "--version")
{
printf("FAILED: Invalid Base Input Argument\n");
449 changes: 449 additions & 0 deletions driver/logcumsumexp_driver.hpp

Large diffs are not rendered by default.

183 changes: 183 additions & 0 deletions driver/mloLogCumSumExpHost.hpp
@@ -0,0 +1,183 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/

#pragma once

#include <limits>

#include <miopen/miopen.h>
#include <miopen/tensor.hpp>
#include <miopen/tensor_view_utils.hpp>

#include <../test/ford.hpp>

template <typename Tgpu, typename Tcheck>
int mloLogCumSumExpForwardRunHost(const miopenTensorDescriptor_t inputDesc,
const miopenTensorDescriptor_t outputDesc,
const Tgpu* input,
Tcheck* output_host,
const int dim,
const bool exclusive,
const bool reverse)
{
const int ndims = miopen::deref(inputDesc).GetNumDims();
const auto exec_dim = ((dim % ndims) + ndims) % ndims;

auto input_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc));
auto output_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputDesc));

auto size = miopen::deref(inputDesc).GetElementSize();
auto inner_size = miopen::deref(inputDesc).GetLengths()[exec_dim];
auto outer_size = size / inner_size;

tensor_view_t<5> ignore_dim_input_tv = input_tv;
ignore_dim_input_tv.size[exec_dim] = 1;

par_ford(outer_size)([&](int gid) {
auto tensor_layout = tensor_layout_t<5>(ignore_dim_input_tv, gid);
float cumsum = 0;

ford(inner_size)([&](int idx) {
int tmp_idx =
(reverse ? input_tv.size[exec_dim] - (idx - exclusive) - 1 : (idx - exclusive));
float tmp_val = 0;
if(0 <= tmp_idx && tmp_idx < inner_size)
{
tensor_layout.layout[exec_dim] = tmp_idx;
tmp_val = std::exp(
static_cast<float>(input[input_tv.get_tensor_view_idx(tensor_layout)]));
}

cumsum += tmp_val;

tensor_layout.layout[exec_dim] = (reverse ? input_tv.size[exec_dim] - idx - 1 : idx);
output_host[output_tv.get_tensor_view_idx(tensor_layout)] =
static_cast<Tcheck>(std::log(cumsum));
});
});

return miopenStatusSuccess;
}
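
As a reading aid, the loop above implements the standard LogCumSumExp recurrence along the chosen dim; in formula form, the variants selected by the exclusive and reverse flags are:

y_i = \log\Big(\sum_{j \le i} e^{x_j}\Big) \quad \text{(inclusive, forward order)}
y_i = \log\Big(\sum_{j < i} e^{x_j}\Big) \quad \text{(exclusive: the first output is } \log 0 = -\infty\text{)}
y_i = \log\Big(\sum_{j \ge i} e^{x_j}\Big) \quad \text{(reverse: accumulation starts from the end of the dim)}

The cumsum variable accumulates the exponentials in float and the log is taken once per output element, which matches the incremental update in the loop.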

template <typename Tgpu, typename Tcheck>
int mloLogCumSumExpBackwardRunHost(const miopenTensorDescriptor_t inputDesc,
const miopenTensorDescriptor_t outputDesc,
const miopenTensorDescriptor_t doutputDesc,
const miopenTensorDescriptor_t dinputDesc,
const Tgpu* input,
const Tgpu* output,
const Tgpu* doutput,
Tcheck* dinput_host,
const int dim,
const bool exclusive,
const bool reverse)
{
const auto input_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc));
const auto output_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputDesc));
const auto doutput_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(doutputDesc));
const auto dinput_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(dinputDesc));

const auto size = miopen::deref(inputDesc).GetElementSize();
const int ndims = miopen::deref(inputDesc).GetNumDims();
const auto true_dim = ((dim % ndims) + ndims) % ndims;
const auto dim_size = miopen::deref(inputDesc).GetLengths()[true_dim];

auto baseDesc = miopen::TensorDescriptor(miopenFloat, miopen::deref(inputDesc).GetLengths());
auto base_tv = miopen::get_inner_expanded_tv<5>(baseDesc);
auto log_grad_positive = std::vector<float>(baseDesc.GetElementSize());
auto log_grad_negative = std::vector<float>(baseDesc.GetElementSize());
auto pos_reverse_logcumsumexp = std::vector<float>(baseDesc.GetElementSize());
auto neg_reverse_logcumsumexp = std::vector<float>(baseDesc.GetElementSize());

// InitLogGrad
par_ford(size)([&](int idx) {
auto tensor_layout = tensor_layout_t<5>(base_tv, idx);

auto doutput_v = static_cast<float>(doutput[doutput_tv.get_tensor_view_idx(tensor_layout)]);
auto output_v = static_cast<float>(output[output_tv.get_tensor_view_idx(tensor_layout)]);

if(!reverse ? tensor_layout.layout[true_dim] < exclusive
: tensor_layout.layout[true_dim] + exclusive >= dim_size)
log_grad_positive[idx] = log_grad_negative[idx] =
-std::numeric_limits<float>::infinity();
else
{
log_grad_positive[idx] = (doutput_v > 0 ? std::log(doutput_v) - output_v
: -std::numeric_limits<float>::infinity());
log_grad_negative[idx] = (doutput_v < 0 ? std::log(-doutput_v) - output_v
: -std::numeric_limits<float>::infinity());
}
});

par_ford(2)([&](int T) {
if(T == 0)
{
// LogCumSumExpForward pos_reverse_logcumsumexp
mloLogCumSumExpForwardRunHost(/*inputDesc=*/&baseDesc,
/*outputDesc=*/&baseDesc,
/*input=*/log_grad_positive.data(),
/*output_host=*/pos_reverse_logcumsumexp.data(),
dim,
/*exclusive=*/false,
reverse);
}
if(T == 1)
{
// LogCumSumExpForward neg_reverse_logcumsumexp
mloLogCumSumExpForwardRunHost(/*inputDesc=*/&baseDesc,
/*outputDesc=*/&baseDesc,
/*input=*/log_grad_negative.data(),
/*output_host=*/neg_reverse_logcumsumexp.data(),
dim,
/*exclusive=*/false,
reverse);
}
});

// LogCumSumExpBackwardStep2
par_ford(size)([&](int idx) {
auto tensor_layout = tensor_layout_t<5>(base_tv, idx);

if(reverse ? tensor_layout.layout[true_dim] < exclusive
: tensor_layout.layout[true_dim] + exclusive >= dim_size)
{
dinput_host[dinput_tv.get_tensor_view_idx(tensor_layout)] = static_cast<Tcheck>(0.0f);
return;
}
else
idx += (!reverse ? exclusive : -exclusive) * base_tv.stride[true_dim];

auto input_v = static_cast<float>(input[input_tv.get_tensor_view_idx(tensor_layout)]);

auto output_pos = std::exp(pos_reverse_logcumsumexp[idx] + input_v);
auto output_neg = std::exp(neg_reverse_logcumsumexp[idx] + input_v);

dinput_host[dinput_tv.get_tensor_view_idx(tensor_layout)] =
static_cast<Tcheck>(output_pos - output_neg);
});

return miopenStatusSuccess;
}
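
The positive/negative split above follows the usual derivation. Writing (t)_+ = max(t, 0), (t)_- = max(-t, 0), and LCSE_i for a LogCumSumExp accumulation over the same window as the forward pass:

\frac{\partial L}{\partial x_i} = \sum_{j\,:\, i \in \mathrm{window}(j)} \frac{\partial L}{\partial y_j}\, e^{x_i - y_j}
  = \exp\big(x_i + \mathrm{LCSE}_i\big[\log(\partial L/\partial y)_+ - y\big]\big) - \exp\big(x_i + \mathrm{LCSE}_i\big[\log(\partial L/\partial y)_- - y\big]\big)

which corresponds to the output_pos - output_neg expression computed in the final loop; keeping the two terms separate in log space is what lets the accumulation stay numerically stable even when dL/dy changes sign.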
62 changes: 62 additions & 0 deletions include/miopen/miopen.h
@@ -72,6 +72,7 @@
* @defgroup ReduceCalculation
* @defgroup RotaryPositionalEmbeddings
* @defgroup ReLU
* @defgroup LogCumSumExp
*
*/

@@ -8226,6 +8227,67 @@ MIOPEN_EXPORT miopenStatus_t miopenMultiMarginLossForward(miopenHandle_t handle,
// CLOSEOUT LossFunction DOXYGEN GROUP
#endif // MIOPEN_BETA_API

#ifdef MIOPEN_BETA_API
// LogCumSumExp APIs
/** @addtogroup LogCumSumExp
*
* @{
*/

/*! @brief Execute a LogCumSumExp forward layer
*
* @param handle MIOpen handle (input)
* @param inputDesc Tensor descriptor for input tensor (input)
* @param input Data tensor input (input)
* @param outputDesc Tensor descriptor for output tensor (input)
* @param output Data tensor output (output)
* @param dim The dimension to do the operation over (input)
* @param exclusive Exclusive operation (input)
* @param reverse Reverse the operation order (input)
* @return miopenStatus_t
*/
MIOPEN_EXPORT miopenStatus_t miopenLogCumSumExpForward(miopenHandle_t handle,
const miopenTensorDescriptor_t inputDesc,
const void* input,
const miopenTensorDescriptor_t outputDesc,
void* output,
const int dim,
const bool exclusive,
const bool reverse);

/*! @brief Execute a LogCumSumExp backward layer
*
* @param handle MIOpen handle (input)
* @param inputDesc Tensor descriptor for input tensor (input)
* @param input Data tensor input (input)
* @param outputDesc Tensor descriptor for output tensor (input)
* @param output Data tensor output (input)
* @param doutputDesc Tensor descriptor for output tensor gradient (input)
* @param doutput Data tensor output gradient (input)
* @param dinputDesc Tensor descriptor for input gradient tensor (input)
* @param dinput Data tensor input gradient (output)
* @param dim The dimension to do the operation over (input)
* @param exclusive Exclusive operation (input)
* @param reverse Reverse the operation order (input)
* @return miopenStatus_t
*/
MIOPEN_EXPORT miopenStatus_t miopenLogCumSumExpBackward(miopenHandle_t handle,
const miopenTensorDescriptor_t inputDesc,
const void* input,
const miopenTensorDescriptor_t outputDesc,
const void* output,
const miopenTensorDescriptor_t doutputDesc,
const void* doutput,
const miopenTensorDescriptor_t dinputDesc,
void* dinput,
const int dim,
const bool exclusive,
const bool reverse);

/** @} */
// CLOSEOUT LogCumSumExp DOXYGEN GROUP
#endif // MIOPEN_BETA_API

#ifdef __cplusplus
}
#endif
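
A minimal host-side sketch of driving the new forward entry point (not taken from the PR: the 2x8 shape, the packed strides, the skipped data upload, and the absence of error checking are illustrative assumptions):

#include <hip/hip_runtime.h>
#include <miopen/miopen.h>
#include <vector>

int main()
{
    miopenHandle_t handle;
    miopenCreate(&handle);

    // Packed 2x8 float tensor; LogCumSumExp accumulates along dim = 1.
    miopenTensorDescriptor_t desc;
    miopenCreateTensorDescriptor(&desc);
    std::vector<int> lens    = {2, 8};
    std::vector<int> strides = {8, 1};
    miopenSetTensorDescriptor(desc, miopenFloat, 2, lens.data(), strides.data());

    float* in_dev  = nullptr;
    float* out_dev = nullptr;
    hipMalloc(&in_dev, 16 * sizeof(float));
    hipMalloc(&out_dev, 16 * sizeof(float));
    // ... fill in_dev via hipMemcpy before calling the API ...

    // Inclusive, forward-order accumulation over dim 1.
    miopenLogCumSumExpForward(handle, desc, in_dev, desc, out_dev, /*dim=*/1,
                              /*exclusive=*/false, /*reverse=*/false);

    hipFree(in_dev);
    hipFree(out_dev);
    miopenDestroyTensorDescriptor(desc);
    miopenDestroy(handle);
    return 0;
}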
8 changes: 8 additions & 0 deletions src/CMakeLists.txt
@@ -158,6 +158,8 @@ set( MIOpen_Source
layernorm/problem_description.cpp
load_file.cpp
lock_file.cpp
logcumsumexp/problem_description.cpp
logcumsumexp_api.cpp
logger.cpp
lrn_api.cpp
mha/mha_descriptor.cpp
@@ -310,6 +312,10 @@ set( MIOpen_Source
solver/layernorm/forward_layernorm2d_ck.cpp
solver/layernorm/forward_layernorm4d_ck.cpp
solver/layernorm/forward_t5layernorm.cpp
solver/logcumsumexp/backward_logcumsumexp_contiguous_smallcumdim_stride1.cpp
solver/logcumsumexp/backward_logcumsumexp_smallcumdim.cpp
solver/logcumsumexp/forward_logcumsumexp_contiguous_smallcumdim_stride1.cpp
solver/logcumsumexp/forward_logcumsumexp_smallcumdim.cpp
solver/mha/mha_ck_fa_v2_solver_forward.cpp
solver/mha/mha_solver_backward.cpp
solver/mha/mha_solver_forward.cpp
@@ -543,6 +549,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
kernels/MIOpenGetitem.cpp
kernels/MIOpenKthvalue.cpp
kernels/MIOpenLayerNorm.cpp
kernels/MIOpenLogCumSumExp.cpp
kernels/MIOpenLRNBwd.cl
kernels/MIOpenLRNFwd.cl
kernels/MIOpenMultiMarginLoss.cpp
@@ -683,6 +690,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
kernel_cache.cpp
kthvalue.cpp
layernorm.cpp
logcumsumexp.cpp
lrn.cpp
mlo_dir_conv.cpp
multimarginloss.cpp
61 changes: 61 additions & 0 deletions src/include/miopen/logcumsumexp.hpp
@@ -0,0 +1,61 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/

#pragma once

#include <miopen/common.hpp>

namespace miopen {

struct Handle;
struct TensorDescriptor;

namespace logcumsumexp {

MIOPEN_INTERNALS_EXPORT miopenStatus_t LogCumSumExpForward(Handle& handle,
const TensorDescriptor& inputDesc,
ConstData_t input,
const TensorDescriptor& outputDesc,
Data_t output,
int dim,
bool exclusive,
bool reverse);

MIOPEN_INTERNALS_EXPORT miopenStatus_t LogCumSumExpBackward(Handle& handle,
const TensorDescriptor& inputDesc,
ConstData_t input,
const TensorDescriptor& outputDesc,
ConstData_t output,
const TensorDescriptor& doutputDesc,
ConstData_t doutput,
const TensorDescriptor& dinputDesc,
Data_t dinput,
int dim,
bool exclusive,
bool reverse);

} // namespace logcumsumexp
} // namespace miopen
73 changes: 73 additions & 0 deletions src/include/miopen/logcumsumexp/invoke_params.hpp
@@ -0,0 +1,73 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/

#pragma once

#include <miopen/common.hpp>
#include <miopen/invoke_params.hpp>
#include <miopen/tensor.hpp>

namespace miopen {
namespace logcumsumexp {

struct InvokeParams : public miopen::InvokeParams
{
InvokeParams() = default;

const TensorDescriptor* inputDesc = nullptr;
const TensorDescriptor* outputDesc = nullptr;

ConstData_t input = nullptr;

int dim = 0;
bool exclusive = false;
bool reverse = false;

std::size_t GetWorkspaceSize() const { return 0; }
Data_t GetWorkspace() const { return nullptr; }
};

struct InvokeParamsForward : public InvokeParams
{
InvokeParamsForward() = default;

Data_t output = nullptr;
};

struct InvokeParamsBackward : public InvokeParams
{
InvokeParamsBackward() = default;

const TensorDescriptor* dinputDesc = nullptr;
const TensorDescriptor* doutputDesc = nullptr;

ConstData_t output = nullptr;
ConstData_t doutput = nullptr;
Data_t dinput = nullptr;
};

} // namespace logcumsumexp
} // namespace miopen
88 changes: 88 additions & 0 deletions src/include/miopen/logcumsumexp/problem_description.hpp
@@ -0,0 +1,88 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
*all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/

#pragma once

#include <miopen/problem_description_base.hpp>
#include <miopen/tensor.hpp>

namespace miopen {

struct NetworkConfig;

namespace logcumsumexp {

struct ForwardProblemDescription : ProblemDescriptionBase
{
ForwardProblemDescription(const TensorDescriptor& inputDesc_,
const TensorDescriptor& outputDesc_,
int dim_);

const TensorDescriptor& GetInputDesc() const { return inputDesc; }
const TensorDescriptor& GetOutputDesc() const { return outputDesc; }
const int& GetDim() const { return dim; }

bool IsValidDim() const;
bool IsSameLength() const;
bool IsSameType() const;
bool IsSameStride() const;
bool IsAllPacked() const;
bool IsAllDimStride1() const;

NetworkConfig MakeNetworkConfig() const override;

protected:
TensorDescriptor inputDesc;
TensorDescriptor outputDesc;
int dim;
};

struct BackwardProblemDescription : ForwardProblemDescription
{
BackwardProblemDescription(const TensorDescriptor& inputDesc_,
const TensorDescriptor& outputDesc_,
const TensorDescriptor& doutputDesc_,
const TensorDescriptor& dinputDesc_,
const int& dim_);

const TensorDescriptor& GetDInputDesc() const { return dinputDesc; }
const TensorDescriptor& GetDOutputDesc() const { return doutputDesc; }

bool IsSameLength() const;
bool IsSameType() const;
bool IsSameStride() const;
bool IsAllPacked() const;
bool IsAllDimStride1() const;

NetworkConfig MakeNetworkConfig() const override;

private:
TensorDescriptor doutputDesc;
TensorDescriptor dinputDesc;
};

} // namespace logcumsumexp
} // namespace miopen
98 changes: 98 additions & 0 deletions src/include/miopen/logcumsumexp/solvers.hpp
@@ -0,0 +1,98 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/

#pragma once

#include <miopen/logcumsumexp/problem_description.hpp>
#include <miopen/solver.hpp>

namespace miopen {
namespace solver {
namespace logcumsumexp {

using ForwardSolverBase =
NonTunableSolverBase<ExecutionContext, miopen::logcumsumexp::ForwardProblemDescription>;

struct ForwardContiguousSmallCumDimStride1 final : ForwardSolverBase
{
const std::string& SolverDbId() const override
{
return GetSolverDbId<ForwardContiguousSmallCumDimStride1>();
}

bool
IsApplicable(const ExecutionContext& context,
const miopen::logcumsumexp::ForwardProblemDescription& problem) const override;
ConvSolution
GetSolution(const ExecutionContext& context,
const miopen::logcumsumexp::ForwardProblemDescription& problem) const override;
};

struct ForwardSmallCumDim final : ForwardSolverBase
{
const std::string& SolverDbId() const override { return GetSolverDbId<ForwardSmallCumDim>(); }

bool
IsApplicable(const ExecutionContext& context,
const miopen::logcumsumexp::ForwardProblemDescription& problem) const override;
ConvSolution
GetSolution(const ExecutionContext& context,
const miopen::logcumsumexp::ForwardProblemDescription& problem) const override;
};

using BackwardSolverBase =
NonTunableSolverBase<ExecutionContext, miopen::logcumsumexp::BackwardProblemDescription>;

struct BackwardContiguousSmallCumDimStride1 final : BackwardSolverBase
{
const std::string& SolverDbId() const override
{
return GetSolverDbId<BackwardContiguousSmallCumDimStride1>();
}

bool
IsApplicable(const ExecutionContext& context,
const miopen::logcumsumexp::BackwardProblemDescription& problem) const override;
ConvSolution
GetSolution(const ExecutionContext& context,
const miopen::logcumsumexp::BackwardProblemDescription& problem) const override;
};

struct BackwardSmallCumDim final : BackwardSolverBase
{
const std::string& SolverDbId() const override { return GetSolverDbId<BackwardSmallCumDim>(); }

bool
IsApplicable(const ExecutionContext& context,
const miopen::logcumsumexp::BackwardProblemDescription& problem) const override;
ConvSolution
GetSolution(const ExecutionContext& context,
const miopen::logcumsumexp::BackwardProblemDescription& problem) const override;
};

} // namespace logcumsumexp
} // namespace solver
} // namespace miopen
43 changes: 42 additions & 1 deletion src/include/miopen/tensor_view_utils.hpp
@@ -27,9 +27,16 @@
#ifndef MIOPEN_TENSOR_VIEW_UTIL_HPP_
#define MIOPEN_TENSOR_VIEW_UTIL_HPP_

#include "../../kernels/tensor_view.hpp"
#include <miopen/common.hpp>
#include <miopen/errors.hpp>
#include <miopen/tensor.hpp>

#include "../../kernels/tensor_view.hpp"

#include <algorithm>
#include <sstream>
#include <vector>

namespace miopen {

template <int N>
@@ -102,6 +109,40 @@ inline tensor_view_t<N - 1> get_tv_without_dim(const tensor_view_t<N>& origin_tv
return res;
}

template <int N, typename T>
inline void permute_tv(tensor_view_t<N>& tensor_view, std::vector<T> permute)
{
// Validate permutation
{
MIOPEN_THROW_IF(
permute.size() != N,
(std::stringstream() << "Tensor view permute: Permutation size must be " << N).str());
std::vector<bool> exist(N, false);
for(auto idx : permute)
{
MIOPEN_THROW_IF(idx < 0 || N <= idx,
(std::stringstream()
<< "Tensor view permute: Permutation value must be in range [" << 0
<< "," << N - 1 << "], while it is " << idx)
.str());
MIOPEN_THROW_IF(exist[idx],
(std::stringstream()
<< "Tensor view permute: Permutation value " << idx << " duplicate.")
.str());
exist[idx] = true;
}
}

uint64_t new_stride[N], new_size[N];
for(auto i = 0; i < N; ++i)
{
new_stride[i] = tensor_view.stride[permute[i]];
new_size[i] = tensor_view.size[permute[i]];
}
std::copy(new_stride, new_stride + N, tensor_view.stride);
std::copy(new_size, new_size + N, tensor_view.size);
}

} // namespace miopen

#endif // MIOPEN_TENSOR_VIEW_UTIL_HPP_
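
A small usage sketch of the new permute_tv helper (not from the PR; the 3-D sizes and strides are made up): rotating the cumulated dimension to the innermost position, which is how the SmallCumDim invokers later in this diff use it.

#include <miopen/tensor_view_utils.hpp>
#include <vector>

void rotate_cum_dim_example()
{
    tensor_view_t<3> tv{};
    tv.size[0]   = 4;  tv.size[1]   = 6; tv.size[2]   = 8;
    tv.stride[0] = 48; tv.stride[1] = 8; tv.stride[2] = 1;

    std::vector<int> perm = {0, 2, 1}; // move dim 1 to the innermost position
    miopen::permute_tv(tv, perm);

    // tv.size is now {4, 8, 6} and tv.stride is {48, 1, 8};
    // only the view metadata is permuted, the underlying buffer is untouched.
}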
375 changes: 375 additions & 0 deletions src/kernels/MIOpenLogCumSumExp.cpp

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion src/kernels/tensor_view.hpp
@@ -36,7 +36,7 @@ template <int N>
struct tensor_view_t
{
// Get index in tensor view at tensor layout
constexpr uint64_t get_tensor_view_idx(const tensor_layout_t<N>& tensor_layout)
constexpr uint64_t get_tensor_view_idx(const tensor_layout_t<N>& tensor_layout) const
{
static_assert(N > 0);
uint64_t idx = 0;
@@ -53,6 +53,8 @@ struct tensor_view_t
template <int N>
struct tensor_layout_t
{
tensor_layout_t() = default;

// Make tensor layout at index using tensor view
constexpr tensor_layout_t(const tensor_view_t<N>& tensor_view, uint64_t idx)
{
118 changes: 118 additions & 0 deletions src/logcumsumexp.cpp
@@ -0,0 +1,118 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/

#include <miopen/find_solution.hpp>
#include <miopen/logcumsumexp.hpp>
#include <miopen/logcumsumexp/invoke_params.hpp>
#include <miopen/logcumsumexp/solvers.hpp>
#include <miopen/tensor.hpp>

namespace miopen {
namespace logcumsumexp {

miopenStatus_t LogCumSumExpForward(Handle& handle,
const TensorDescriptor& inputDesc,
ConstData_t input,
const TensorDescriptor& outputDesc,
Data_t output,
const int dim,
const bool exclusive,
const bool reverse)
{
const auto problem = logcumsumexp::ForwardProblemDescription{inputDesc, outputDesc, dim};

const auto invoke_params = [&]() {
auto tmp = logcumsumexp::InvokeParamsForward{};
tmp.type = InvokeType::Run;
tmp.inputDesc = &inputDesc;
tmp.outputDesc = &outputDesc;
tmp.input = input;
tmp.output = output;

tmp.dim = dim;
tmp.exclusive = exclusive;
tmp.reverse = reverse;

return tmp;
}();

const auto algo = AlgorithmName{"LogCumSumExpForward"};
const auto solvers =
solver::SolverContainer<solver::logcumsumexp::ForwardContiguousSmallCumDimStride1,
solver::logcumsumexp::ForwardSmallCumDim>{};

solvers.ExecutePrimitive(handle, problem, algo, invoke_params);

return miopenStatusSuccess;
}

miopenStatus_t LogCumSumExpBackward(Handle& handle,
const TensorDescriptor& inputDesc,
ConstData_t input,
const TensorDescriptor& outputDesc,
ConstData_t output,
const TensorDescriptor& doutputDesc,
ConstData_t doutput,
const TensorDescriptor& dinputDesc,
Data_t dinput,
const int dim,
const bool exclusive,
const bool reverse)
{
const auto problem = logcumsumexp::BackwardProblemDescription{
inputDesc, outputDesc, doutputDesc, dinputDesc, dim};

const auto invoke_params = [&]() {
auto tmp = logcumsumexp::InvokeParamsBackward{};
tmp.type = InvokeType::Run;
tmp.inputDesc = &inputDesc;
tmp.outputDesc = &outputDesc;
tmp.doutputDesc = &doutputDesc;
tmp.dinputDesc = &dinputDesc;
tmp.input = input;
tmp.output = output;
tmp.doutput = doutput;
tmp.dinput = dinput;

tmp.dim = dim;
tmp.exclusive = exclusive;
tmp.reverse = reverse;

return tmp;
}();

const auto algo = AlgorithmName{"LogCumSumExpBackward"};
const auto solvers =
solver::SolverContainer<solver::logcumsumexp::BackwardContiguousSmallCumDimStride1,
solver::logcumsumexp::BackwardSmallCumDim>{};

solvers.ExecutePrimitive(handle, problem, algo, invoke_params);

return miopenStatusSuccess;
}

} // namespace logcumsumexp
} // namespace miopen
213 changes: 213 additions & 0 deletions src/logcumsumexp/problem_description.cpp
@@ -0,0 +1,213 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/

#include <miopen/logcumsumexp/problem_description.hpp>
#include <miopen/names.hpp>

#include <sstream>

namespace miopen {
namespace logcumsumexp {

ForwardProblemDescription::ForwardProblemDescription(const TensorDescriptor& inputDesc_,
const TensorDescriptor& outputDesc_,
const int dim_)
: inputDesc(inputDesc_), outputDesc(outputDesc_), dim(dim_)
{
if(IsValidDim())
dim = (dim < 0 ? dim + inputDesc.GetNumDims() : dim);
IsSameLength();
IsSameType();
}

bool ForwardProblemDescription::IsValidDim() const
{
const int ndims = inputDesc.GetNumDims();
if(dim < -ndims || ndims - 1 < dim)
{
MIOPEN_THROW(miopenStatusBadParm,
(std::stringstream() << "LogCumSumExp: Operating dim value must be in range ["
<< -ndims << "," << ndims - 1 << "].")
.str());
}
return true;
}

bool ForwardProblemDescription::IsSameLength() const
{
if(inputDesc.GetLengths() != outputDesc.GetLengths())
MIOPEN_THROW(miopenStatusBadParm,
"LogCumSumExp: Input and Output tensor sizes do not match.");
return true;
}

bool ForwardProblemDescription::IsSameType() const
{
if(inputDesc.GetType() != outputDesc.GetType())
MIOPEN_THROW(miopenStatusBadParm,
"LogCumSumExp: Input and Output tensor type do not match.");
return true;
}

bool ForwardProblemDescription::IsSameStride() const
{
if(inputDesc.GetStrides() != outputDesc.GetStrides())
return false;
return true;
}

bool ForwardProblemDescription::IsAllPacked() const
{
if(!inputDesc.IsPacked() || !outputDesc.IsPacked())
return false;
return true;
}

bool ForwardProblemDescription::IsAllDimStride1() const
{
if(inputDesc.GetStrides()[dim] != 1)
return false;
if(outputDesc.GetStrides()[dim] != 1)
return false;
return true;
}

BackwardProblemDescription::BackwardProblemDescription(const TensorDescriptor& inputDesc_,
const TensorDescriptor& outputDesc_,
const TensorDescriptor& doutputDesc_,
const TensorDescriptor& dinputDesc_,
const int& dim_)
: ForwardProblemDescription(inputDesc_, outputDesc_, dim_),
doutputDesc(doutputDesc_),
dinputDesc(dinputDesc_)
{
IsSameLength();
IsSameType();
}

bool BackwardProblemDescription::IsSameLength() const
{
if(!ForwardProblemDescription::IsSameLength())
return false;
if(inputDesc.GetLengths() != dinputDesc.GetLengths())
MIOPEN_THROW(miopenStatusBadParm,
"LogCumSumExp: Input and its Gradient tensor sizes do not match.");
if(outputDesc.GetLengths() != doutputDesc.GetLengths())
MIOPEN_THROW(miopenStatusBadParm,
"LogCumSumExp: Output and its Gradient tensor sizes do not match.");
return true;
}

bool BackwardProblemDescription::IsSameType() const
{
if(!ForwardProblemDescription::IsSameType())
return false;
if(inputDesc.GetType() != dinputDesc.GetType())
MIOPEN_THROW(miopenStatusBadParm,
"LogCumSumExp: Input and its Gradient tensor type do not match.");
if(outputDesc.GetType() != doutputDesc.GetType())
MIOPEN_THROW(miopenStatusBadParm,
"LogCumSumExp: Output and its Gradient tensor type do not match.");
return true;
}

bool BackwardProblemDescription::IsSameStride() const
{
if(!ForwardProblemDescription::IsSameStride())
return false;
if(inputDesc.GetStrides() != dinputDesc.GetStrides())
return false;
if(outputDesc.GetStrides() != doutputDesc.GetStrides())
return false;
return true;
}

bool BackwardProblemDescription::IsAllPacked() const
{
if(!ForwardProblemDescription::IsAllPacked())
return false;
if(!dinputDesc.IsPacked() || !doutputDesc.IsPacked())
return false;
return true;
}

bool BackwardProblemDescription::IsAllDimStride1() const
{
if(!ForwardProblemDescription::IsAllDimStride1())
return false;
if(dinputDesc.GetStrides()[dim] != 1)
return false;
if(doutputDesc.GetStrides()[dim] != 1)
return false;
return true;
}

NetworkConfig ForwardProblemDescription::MakeNetworkConfig() const
{
auto dtype = inputDesc.GetType();
auto size = inputDesc.GetElementSize();
auto ndims = inputDesc.GetNumDims();
auto inner_size = inputDesc.GetLengths()[dim];
auto outer_size = size / inner_size;

std::ostringstream ss;

ss << "logcumsumexp_fwd";
ss << "dtype" << dtype;
ss << "outer" << outer_size;
ss << "inner" << inner_size;
ss << "ndims" << ndims;
ss << "sameStride" << IsSameStride();
ss << "packed" << IsAllPacked();
ss << "dimstride1" << IsAllDimStride1();

return NetworkConfig{ss.str()};
}

NetworkConfig BackwardProblemDescription::MakeNetworkConfig() const
{
auto dtype = inputDesc.GetType();
auto size = inputDesc.GetElementSize();
auto ndims = inputDesc.GetNumDims();
auto inner_size = inputDesc.GetLengths()[dim];
auto outer_size = size / inner_size;

std::ostringstream ss;

ss << "logcumsumexp_bwd";
ss << "dtype" << dtype;
ss << "outer" << outer_size;
ss << "inner" << inner_size;
ss << "ndims" << ndims;
ss << "sameStride" << IsSameStride();
ss << "packed" << IsAllPacked();
ss << "dimstride1" << IsAllDimStride1();

return NetworkConfig{ss.str()};
}

} // namespace logcumsumexp
} // namespace miopen
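
As a worked example of the dim normalization in the forward constructor: for a 3-D input (ndims = 3) and dim = -1, C++ evaluates -1 % 3 to -1, so ((dim % ndims) + ndims) % ndims = (-1 + 3) % 3 = 2; negative dims therefore count back from the last dimension.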
140 changes: 140 additions & 0 deletions src/logcumsumexp_api.cpp
@@ -0,0 +1,140 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/

#include <miopen/handle.hpp>
#include <miopen/logcumsumexp.hpp>
#include <miopen/tensor_ops.hpp>

static void LogCmdLogCumSumExp(const miopenTensorDescriptor_t inputDesc,
const miopenTensorDescriptor_t outputDesc,
const miopenTensorDescriptor_t doutputDesc,
const miopenTensorDescriptor_t dinputDesc,
const int dim,
const bool exclusive,
const bool reverse,
const bool is_fwd)
{
if(miopen::IsLoggingCmd())
{
std::stringstream ss;
auto dtype = miopen::deref(inputDesc).GetType();
if(dtype == miopenHalf)
{
ss << "logcumsumexpfp16";
}
else if(dtype == miopenFloat)
{
ss << "logcumsumexpfp32";
}
else if(dtype == miopenBFloat16)
{
ss << "logcumsumexpbfp16";
}

if(is_fwd)
{
MIOPEN_LOG_FUNCTION(inputDesc, outputDesc);
}
else
{
MIOPEN_LOG_FUNCTION(inputDesc, outputDesc, doutputDesc, dinputDesc);
}
ss << " -d " << dim;
ss << " --excl " << exclusive;
ss << " --rev " << reverse;
ss << " -F " << ((is_fwd) ? "1" : "2");

MIOPEN_LOG_DRIVER_CMD(ss.str());
}
}

extern "C" miopenStatus_t miopenLogCumSumExpForward(miopenHandle_t handle,
const miopenTensorDescriptor_t inputDesc,
const void* input,
const miopenTensorDescriptor_t outputDesc,
void* output,
const int dim,
const bool exclusive,
const bool reverse)
{
MIOPEN_LOG_FUNCTION(handle, inputDesc, input, outputDesc, output, dim, exclusive, reverse);

LogCmdLogCumSumExp(inputDesc, outputDesc, nullptr, nullptr, dim, exclusive, reverse, true);
return miopen::try_([&] {
miopen::logcumsumexp::LogCumSumExpForward(miopen::deref(handle),
miopen::deref(inputDesc),
DataCast(input),
miopen::deref(outputDesc),
DataCast(output),
dim,
exclusive,
reverse);
});
}

extern "C" miopenStatus_t miopenLogCumSumExpBackward(miopenHandle_t handle,
const miopenTensorDescriptor_t inputDesc,
const void* input,
const miopenTensorDescriptor_t outputDesc,
const void* output,
const miopenTensorDescriptor_t doutputDesc,
const void* doutput,
const miopenTensorDescriptor_t dinputDesc,
void* dinput,
const int dim,
const bool exclusive,
const bool reverse)
{
MIOPEN_LOG_FUNCTION(handle,
inputDesc,
input,
outputDesc,
output,
doutputDesc,
doutput,
dinputDesc,
dinput,
dim,
exclusive,
reverse);

LogCmdLogCumSumExp(
inputDesc, outputDesc, doutputDesc, dinputDesc, dim, exclusive, reverse, false);
return miopen::try_([&] {
miopen::logcumsumexp::LogCumSumExpBackward(miopen::deref(handle),
miopen::deref(inputDesc),
DataCast(input),
miopen::deref(outputDesc),
DataCast(output),
miopen::deref(doutputDesc),
DataCast(doutput),
miopen::deref(dinputDesc),
DataCast(dinput),
dim,
exclusive,
reverse);
});
}
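
Based on the flags emitted by LogCmdLogCumSumExp above, a forward float32 run would be logged as a driver command roughly like the line below; the tensor-shape option is defined in logcumsumexp_driver.hpp, whose diff is not rendered in this view, so it is left out of this sketch.

MIOpenDriver logcumsumexp -d 1 --excl 0 --rev 0 -F 1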
12 changes: 12 additions & 0 deletions src/solver.cpp
@@ -35,6 +35,7 @@
#include <miopen/getitem/solvers.hpp>
#include <miopen/kthvalue/solvers.hpp>
#include <miopen/layernorm/solvers.hpp>
#include <miopen/logcumsumexp/solvers.hpp>
#include <miopen/pooling/solvers.hpp>
#include <miopen/prelu/solvers.hpp>
#include <miopen/reduce/solvers.hpp>
@@ -707,6 +708,17 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry)
multimarginloss::MultiMarginLossForward{}.SolverDbId());

Register(registry, ++id, Primitive::Mha, mha::MhaCKFlashAttentionV2Forward{}.SolverDbId());

Register(registry,
++id,
Primitive::Reduce,
logcumsumexp::ForwardContiguousSmallCumDimStride1{}.SolverDbId());
Register(registry, ++id, Primitive::Reduce, logcumsumexp::ForwardSmallCumDim{}.SolverDbId());
Register(registry,
++id,
Primitive::Reduce,
logcumsumexp::BackwardContiguousSmallCumDimStride1{}.SolverDbId());
Register(registry, ++id, Primitive::Reduce, logcumsumexp::BackwardSmallCumDim{}.SolverDbId());
// IMPORTANT: New solvers should be added to the end of the function, and don't leave a white
// space between this comment and the newly registered solver(s)!
}
119 changes: 119 additions & 0 deletions src/solver/logcumsumexp/backward_logcumsumexp_contiguous_smallcumdim_stride1.cpp
@@ -0,0 +1,119 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/

#include <miopen/datatype.hpp>
#include <miopen/kernel_build_params.hpp>
#include <miopen/logcumsumexp/invoke_params.hpp>
#include <miopen/logcumsumexp/solvers.hpp>
#include <miopen/mlo_internal.hpp>

#define warpSizeCTX (context.GetStream().GetWavefrontWidth())
#define LOCAL_SIZE_MAX 1024
#define LOCAL_SIZE_MIN warpSizeCTX

namespace miopen {
namespace solver {
namespace logcumsumexp {

bool BackwardContiguousSmallCumDimStride1::IsApplicable(
const ExecutionContext& /*context*/,
const miopen::logcumsumexp::BackwardProblemDescription& problem) const
{
if(problem.GetInputDesc().GetLengths()[problem.GetDim()] > LOCAL_SIZE_MAX)
return false;
if(!problem.IsAllPacked())
return false;
if(!problem.IsAllDimStride1())
return false;
if(!problem.IsSameStride())
return false;
if(!(problem.GetInputDesc().GetType() == miopenFloat ||
problem.GetInputDesc().GetType() == miopenHalf ||
problem.GetInputDesc().GetType() == miopenBFloat16))
return false;
return true;
}

ConvSolution BackwardContiguousSmallCumDimStride1::GetSolution(
const ExecutionContext& context,
const miopen::logcumsumexp::BackwardProblemDescription& problem) const
{
std::ignore = context;

auto result = ConvSolution{miopenStatusSuccess};

auto dtype = problem.GetInputDesc().GetType();

auto size = problem.GetInputDesc().GetElementSize();
auto inner_size = problem.GetInputDesc().GetLengths()[problem.GetDim()];
auto outer_size = size / inner_size;

// local_size is the smallest power of 2 that is at least inner_size and at least warpSize
auto local_size = LOCAL_SIZE_MIN;
while(local_size < inner_size)
local_size *= 2;

auto build_params = KernelBuildParameters{
{"MIOPEN_USE_FP16", static_cast<int>(dtype == miopenHalf)},
{"MIOPEN_USE_FP32", static_cast<int>(dtype == miopenFloat)},
{"MIOPEN_USE_FP64", static_cast<int>(dtype == miopenDouble)},
{"MIOPEN_USE_BFP16", static_cast<int>(dtype == miopenBFloat16)},
{"REDUCE_SIZE", local_size},
};

{
result.construction_params.push_back(KernelInfo{
build_params.GenerateFor(kbp::HIP{}),
{1, local_size},
{outer_size, AlignUp(inner_size, local_size)},
"MIOpenLogCumSumExp.cpp",
"LogCumSumExpBackwardContiguousSmallCumDimStride1",
});
}

result.invoker_factory = [](const std::vector<Kernel>& kernels) {
return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
auto params = raw_params.CastTo<miopen::logcumsumexp::InvokeParamsBackward>();

const int ndims = deref(params.inputDesc).GetNumDims();
const unsigned int true_dim = ((params.dim % ndims) + ndims) % ndims;
auto kernel = handle_.Run(kernels[0]);
kernel(params.input,
params.output,
params.doutput,
params.dinput,
deref(params.inputDesc).GetLengths()[true_dim],
params.exclusive,
params.reverse);
};
};

return result;
}

} // namespace logcumsumexp
} // namespace solver
} // namespace miopen
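
As a worked example of the launch configuration above, assuming a 64-wide wavefront and inner_size = 100: local_size starts at 64 and doubles once to 128, so the kernel runs with a {1, 128} workgroup and a global size of {outer_size, AlignUp(100, 128)} = {outer_size, 128}, i.e. one workgroup per outer slice.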
148 changes: 148 additions & 0 deletions src/solver/logcumsumexp/backward_logcumsumexp_smallcumdim.cpp
@@ -0,0 +1,148 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/

#include <miopen/datatype.hpp>
#include <miopen/kernel_build_params.hpp>
#include <miopen/logcumsumexp/invoke_params.hpp>
#include <miopen/logcumsumexp/solvers.hpp>
#include <miopen/mlo_internal.hpp>
#include <miopen/tensor_view_utils.hpp>

#define warpSizeCTX (context.GetStream().GetWavefrontWidth())
#define LOCAL_SIZE_MAX 1024
#define LOCAL_SIZE_MIN warpSizeCTX

#define VIEW_DIMS 5

namespace miopen {
namespace solver {
namespace logcumsumexp {

namespace {
bool IsImprovementOverROCm(const ExecutionContext& /*context*/,
const miopen::logcumsumexp::BackwardProblemDescription& problem)
{
if(!problem.IsAllDimStride1())
return false;
return true;
}
} // namespace

bool BackwardSmallCumDim::IsApplicable(
const ExecutionContext& context,
const miopen::logcumsumexp::BackwardProblemDescription& problem) const
{
if(!IsImprovementOverROCm(context, problem))
return false;
if(problem.GetInputDesc().GetLengths()[problem.GetDim()] > LOCAL_SIZE_MAX)
return false;
if(problem.GetInputDesc().GetNumDims() > VIEW_DIMS)
return false;
if(!(problem.GetInputDesc().GetType() == miopenFloat ||
problem.GetInputDesc().GetType() == miopenHalf ||
problem.GetInputDesc().GetType() == miopenBFloat16))
return false;
return true;
}

ConvSolution BackwardSmallCumDim::GetSolution(
const ExecutionContext& context,
const miopen::logcumsumexp::BackwardProblemDescription& problem) const
{
std::ignore = context;

auto result = ConvSolution{miopenStatusSuccess};

auto dtype = problem.GetInputDesc().GetType();

auto size = problem.GetInputDesc().GetElementSize();
auto inner_size = problem.GetInputDesc().GetLengths()[problem.GetDim()];
auto outer_size = size / inner_size;

// local_size is the smallest power of 2 that is at least inner_size and at least warpSize
auto local_size = LOCAL_SIZE_MIN;
while(local_size < inner_size)
local_size *= 2;

auto build_params = KernelBuildParameters{
{"MIOPEN_USE_FP16", static_cast<int>(dtype == miopenHalf)},
{"MIOPEN_USE_FP32", static_cast<int>(dtype == miopenFloat)},
{"MIOPEN_USE_FP64", static_cast<int>(dtype == miopenDouble)},
{"MIOPEN_USE_BFP16", static_cast<int>(dtype == miopenBFloat16)},
{"REDUCE_SIZE", local_size},
{"VIEW_DIMS", VIEW_DIMS},
};

{
result.construction_params.push_back(KernelInfo{
build_params.GenerateFor(kbp::HIP{}),
{1, local_size},
{outer_size, AlignUp(inner_size, local_size)},
"MIOpenLogCumSumExp.cpp",
"LogCumSumExpBackwardSmallCumDim",
});
}

result.invoker_factory = [](const std::vector<Kernel>& kernels) {
return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
auto params = raw_params.CastTo<miopen::logcumsumexp::InvokeParamsBackward>();

const int ndims = deref(params.inputDesc).GetNumDims();
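// Normalize dim so that negative values index from the last dimension.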
const unsigned int true_dim = ((params.dim % ndims) + ndims) % ndims;
auto input_tv = get_inner_expanded_tv<VIEW_DIMS>(deref(params.inputDesc));
auto output_tv = get_inner_expanded_tv<VIEW_DIMS>(deref(params.outputDesc));
auto doutput_tv = get_inner_expanded_tv<VIEW_DIMS>(deref(params.doutputDesc));
auto dinput_tv = get_inner_expanded_tv<VIEW_DIMS>(deref(params.dinputDesc));

std::vector<int> permute(VIEW_DIMS);
std::iota(permute.begin(), permute.end(), 0);
std::rotate(permute.begin() + true_dim, permute.begin() + true_dim + 1, permute.end());
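// The rotation above moves the cumulative dimension to the innermost position of
// every permuted view, so the kernel always scans along the last axis.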

permute_tv(input_tv, permute);
permute_tv(output_tv, permute);
permute_tv(doutput_tv, permute);
permute_tv(dinput_tv, permute);

auto kernel = handle_.Run(kernels[0]);
kernel(params.input,
params.output,
params.doutput,
params.dinput,
params.exclusive,
params.reverse,
input_tv,
output_tv,
doutput_tv,
dinput_tv);
};
};

return result;
}

} // namespace logcumsumexp
} // namespace solver
} // namespace miopen
@@ -0,0 +1,119 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/

#include <miopen/datatype.hpp>
#include <miopen/kernel_build_params.hpp>
#include <miopen/logcumsumexp/invoke_params.hpp>
#include <miopen/logcumsumexp/solvers.hpp>
#include <miopen/mlo_internal.hpp>

#define warpSizeCTX (context.GetStream().GetWavefrontWidth())
#define LOCAL_SIZE_MAX 1024
#define LOCAL_SIZE_MIN warpSizeCTX

namespace miopen {
namespace solver {
namespace logcumsumexp {

bool ForwardContiguousSmallCumDimStride1::IsApplicable(
const ExecutionContext& /*context*/,
const miopen::logcumsumexp::ForwardProblemDescription& problem) const
{
if(problem.GetInputDesc().GetLengths()[problem.GetDim()] > LOCAL_SIZE_MAX)
return false;
if(!problem.IsAllPacked())
return false;
if(!problem.IsAllDimStride1())
return false;
if(!problem.IsSameStride())
return false;
if(!(problem.GetInputDesc().GetType() == miopenFloat ||
problem.GetInputDesc().GetType() == miopenHalf ||
problem.GetInputDesc().GetType() == miopenBFloat16))
return false;
return true;
}

ConvSolution ForwardContiguousSmallCumDimStride1::GetSolution(
const ExecutionContext& context,
const miopen::logcumsumexp::ForwardProblemDescription& problem) const
{
std::ignore = context;

auto result = ConvSolution{miopenStatusSuccess};

auto dtype = problem.GetInputDesc().GetType();

auto size = problem.GetInputDesc().GetElementSize();
auto inner_size = problem.GetInputDesc().GetLengths()[problem.GetDim()];
auto outer_size = size / inner_size;

// local_size is the smallest power of 2 that is not smaller than both inner_size and the wavefront size
auto local_size = LOCAL_SIZE_MIN;
while(local_size < inner_size)
local_size *= 2;

auto build_params = KernelBuildParameters{
{"MIOPEN_USE_FP16", static_cast<int>(dtype == miopenHalf)},
{"MIOPEN_USE_FP32", static_cast<int>(dtype == miopenFloat)},
{"MIOPEN_USE_FP64", static_cast<int>(dtype == miopenDouble)},
{"MIOPEN_USE_BFP16", static_cast<int>(dtype == miopenBFloat16)},
{"REDUCE_SIZE", local_size},
};

{
result.construction_params.push_back(KernelInfo{
build_params.GenerateFor(kbp::HIP{}),
{1, local_size},
{outer_size, AlignUp(inner_size, local_size)},
"MIOpenLogCumSumExp.cpp",
"LogCumSumExpForwardContiguousSmallCumDimStride1",
});
}

result.invoker_factory = [](const std::vector<Kernel>& kernels) {
return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
auto params = raw_params.CastTo<miopen::logcumsumexp::InvokeParamsForward>();

const int ndims = deref(params.inputDesc).GetNumDims();
const unsigned int true_dim = ((params.dim % ndims) + ndims) % ndims;
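// All tensors are packed with stride 1 along the cumulative dimension, so the
// kernel only needs the raw pointers plus the length of that dimension.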
auto kernel = handle_.Run(kernels[0]);
kernel(params.input,
params.output,
deref(params.inputDesc).GetLengths()[true_dim],
params.exclusive,
params.reverse);
};
};

return result;
}

} // namespace logcumsumexp
} // namespace solver
} // namespace miopen
136 changes: 136 additions & 0 deletions src/solver/logcumsumexp/forward_logcumsumexp_smallcumdim.cpp
@@ -0,0 +1,136 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/

#include <miopen/datatype.hpp>
#include <miopen/kernel_build_params.hpp>
#include <miopen/logcumsumexp/invoke_params.hpp>
#include <miopen/logcumsumexp/solvers.hpp>
#include <miopen/mlo_internal.hpp>
#include <miopen/tensor_view_utils.hpp>

#include <algorithm>
#include <numeric>

#define warpSizeCTX (context.GetStream().GetWavefrontWidth())
#define LOCAL_SIZE_MAX 1024
#define LOCAL_SIZE_MIN warpSizeCTX

#define VIEW_DIMS 5

namespace miopen {
namespace solver {
namespace logcumsumexp {

namespace {
bool IsImprovementOverROCm(const ExecutionContext& /*context*/,
const miopen::logcumsumexp::ForwardProblemDescription& problem)
{
if(!problem.IsAllDimStride1())
return false;
return true;
}
} // namespace

bool ForwardSmallCumDim::IsApplicable(
const ExecutionContext& context,
const miopen::logcumsumexp::ForwardProblemDescription& problem) const
{
if(!IsImprovementOverROCm(context, problem))
return false;
if(problem.GetInputDesc().GetLengths()[problem.GetDim()] > LOCAL_SIZE_MAX)
return false;
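// Kernel tensor views are fixed at VIEW_DIMS dimensions, so higher-rank tensors
// are not handled by this solver.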
if(problem.GetInputDesc().GetNumDims() > VIEW_DIMS)
return false;
if(!(problem.GetInputDesc().GetType() == miopenFloat ||
problem.GetInputDesc().GetType() == miopenHalf ||
problem.GetInputDesc().GetType() == miopenBFloat16))
return false;
return true;
}

ConvSolution ForwardSmallCumDim::GetSolution(
const ExecutionContext& context,
const miopen::logcumsumexp::ForwardProblemDescription& problem) const
{
auto result = ConvSolution{miopenStatusSuccess};

auto dtype = problem.GetInputDesc().GetType();

auto size = problem.GetInputDesc().GetElementSize();
auto inner_size = problem.GetInputDesc().GetLengths()[problem.GetDim()];
auto outer_size = size / inner_size;

// local_size is the smallest power of 2 that is not smaller than both inner_size and the wavefront size
auto local_size = LOCAL_SIZE_MIN;
while(local_size < inner_size)
local_size *= 2;

auto build_params = KernelBuildParameters{
{"MIOPEN_USE_FP16", static_cast<int>(dtype == miopenHalf)},
{"MIOPEN_USE_FP32", static_cast<int>(dtype == miopenFloat)},
{"MIOPEN_USE_FP64", static_cast<int>(dtype == miopenDouble)},
{"MIOPEN_USE_BFP16", static_cast<int>(dtype == miopenBFloat16)},
{"REDUCE_SIZE", local_size},
{"VIEW_DIMS", VIEW_DIMS},
};

{
result.construction_params.push_back(KernelInfo{
build_params.GenerateFor(kbp::HIP{}),
{1, local_size},
{outer_size, AlignUp(inner_size, local_size)},
"MIOpenLogCumSumExp.cpp",
"LogCumSumExpForwardSmallCumDim",
});
}

result.invoker_factory = [](const std::vector<Kernel>& kernels) {
return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
auto params = raw_params.CastTo<miopen::logcumsumexp::InvokeParamsForward>();

const int ndims = deref(params.inputDesc).GetNumDims();
const unsigned int true_dim = ((params.dim % ndims) + ndims) % ndims;
auto input_tv = get_inner_expanded_tv<VIEW_DIMS>(deref(params.inputDesc));
auto output_tv = get_inner_expanded_tv<VIEW_DIMS>(deref(params.outputDesc));

std::vector<int> permute(VIEW_DIMS);
std::iota(permute.begin(), permute.end(), 0);
std::rotate(permute.begin() + true_dim, permute.begin() + true_dim + 1, permute.end());

permute_tv(input_tv, permute);
permute_tv(output_tv, permute);

auto kernel = handle_.Run(kernels[0]);
kernel(
params.input, params.output, params.exclusive, params.reverse, input_tv, output_tv);
};
};

return result;
}

} // namespace logcumsumexp
} // namespace solver
} // namespace miopen
152 changes: 152 additions & 0 deletions test/cpu_logcumsumexp.hpp
@@ -0,0 +1,152 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/

#pragma once

#include "tensor_holder.hpp"
#include <miopen/tensor_view_utils.hpp>

#include <cmath>
#include <limits>

template <class T>
void cpu_logcumsumexp_forward(const tensor<T> input,
tensor<T>& ref_output,
const int dim,
const bool exclusive,
const bool reverse)
{
auto input_tv = miopen::get_inner_expanded_tv<5>(input.desc);
auto output_tv = miopen::get_inner_expanded_tv<5>(ref_output.desc);

const int ndims = input.desc.GetNumDims();
const auto true_dim = ((dim % ndims) + ndims) % ndims;

auto size = input.desc.GetElementSize();
auto inner_size = input.desc.GetLengths()[true_dim];
auto outer_size = size / inner_size;

tensor_view_t<5> ignore_dim_input_tv = input_tv;
ignore_dim_input_tv.size[true_dim] = 1;
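// With the cumulative dimension collapsed to 1, each gid below maps to one outer
// slice and tensor_layout_t recovers that slice's starting coordinates.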

par_ford(outer_size)([&](int gid) {
auto tensor_layout = tensor_layout_t<5>(ignore_dim_input_tv, gid);
float cumsum = 0;

ford(inner_size)([&](int idx) {
int tmp_idx =
(reverse ? input_tv.size[true_dim] - (idx - exclusive) - 1 : (idx - exclusive));
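// Exclusive mode shifts the read index back by one; out-of-range reads contribute
// exp(-inf) = 0, so the first exclusive output becomes log(0) = -inf.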
float tmp_val = 0;
if(0 <= tmp_idx && tmp_idx < inner_size)
{
tensor_layout.layout[true_dim] = tmp_idx;
tmp_val = std::exp(
static_cast<float>(input[input_tv.get_tensor_view_idx(tensor_layout)]));
}

cumsum += tmp_val;

tensor_layout.layout[true_dim] = (reverse ? input_tv.size[true_dim] - idx - 1 : idx);
ref_output[output_tv.get_tensor_view_idx(tensor_layout)] =
static_cast<T>(std::log(cumsum));
});
});
}

template <class T>
void cpu_logcumsumexp_backward(const tensor<T> input,
const tensor<T> output,
const tensor<T> doutput,
tensor<T>& ref_dinput,
const int dim,
const bool exclusive,
const bool reverse)
{
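// Reference backward pass: dL/dy is split into positive and negative parts in log
// space (shifted by -y), each part is run through logcumsumexp, and the two results
// are recombined with exp(x) to form dL/dx.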
const auto input_tv = miopen::get_inner_expanded_tv<5>(input.desc);
const auto output_tv = miopen::get_inner_expanded_tv<5>(output.desc);
const auto doutput_tv = miopen::get_inner_expanded_tv<5>(doutput.desc);
const auto dinput_tv = miopen::get_inner_expanded_tv<5>(ref_dinput.desc);

const auto size = input.desc.GetElementSize();
const int ndims = input.desc.GetNumDims();
const auto true_dim = ((dim % ndims) + ndims) % ndims;
const auto dim_size = input.desc.GetLengths()[true_dim];

auto log_grad_positive = tensor<float>{input.desc.GetLengths()};
auto log_grad_negative = tensor<float>{input.desc.GetLengths()};
auto pos_reverse_logcumsumexp = tensor<float>{input.desc.GetLengths()};
auto neg_reverse_logcumsumexp = tensor<float>{input.desc.GetLengths()};
const auto base_tv = miopen::get_inner_expanded_tv<5>(log_grad_positive.desc);

// InitLogGrad
par_ford(size)([&](int idx) {
auto tensor_layout = tensor_layout_t<5>(base_tv, idx);

auto doutput_v = static_cast<float>(doutput[doutput_tv.get_tensor_view_idx(tensor_layout)]);
auto output_v = static_cast<float>(output[output_tv.get_tensor_view_idx(tensor_layout)]);

if(!reverse ? tensor_layout.layout[true_dim] < exclusive
: tensor_layout.layout[true_dim] + exclusive >= dim_size)
log_grad_positive[idx] = log_grad_negative[idx] =
-std::numeric_limits<float>::infinity();
else
{
log_grad_positive[idx] = (doutput_v > 0 ? std::log(doutput_v) - output_v
: -std::numeric_limits<float>::infinity());
log_grad_negative[idx] = (doutput_v < 0 ? std::log(-doutput_v) - output_v
: -std::numeric_limits<float>::infinity());
}
});

// LogCumSumExpForward pos_reverse_logcumsumexp
cpu_logcumsumexp_forward(
log_grad_positive, pos_reverse_logcumsumexp, dim, /*exclusive=*/false, reverse);

// LogCumSumExpForward neg_reverse_logcumsumexp
cpu_logcumsumexp_forward(
log_grad_negative, neg_reverse_logcumsumexp, dim, /*exclusive=*/false, reverse);

// LogCumSumExpBackwardStep2
par_ford(size)([&](int idx) {
auto tensor_layout = tensor_layout_t<5>(base_tv, idx);

if(reverse ? tensor_layout.layout[true_dim] < exclusive
: tensor_layout.layout[true_dim] + exclusive >= dim_size)
{
ref_dinput[dinput_tv.get_tensor_view_idx(tensor_layout)] = static_cast<T>(0.0f);
return;
}
else
idx += (!reverse ? exclusive : -exclusive) * base_tv.stride[true_dim];

auto input_v = static_cast<float>(input[input_tv.get_tensor_view_idx(tensor_layout)]);

auto output_pos = std::exp(pos_reverse_logcumsumexp[idx] + input_v);
auto output_neg = std::exp(neg_reverse_logcumsumexp[idx] + input_v);

ref_dinput[dinput_tv.get_tensor_view_idx(tensor_layout)] =
static_cast<T>(output_pos - output_neg);
});
}
111 changes: 111 additions & 0 deletions test/gtest/logcumsumexp.cpp
@@ -0,0 +1,111 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/

#include "logcumsumexp.hpp"

namespace logcumsumexp {
struct GPU_LogCumSumExp_fwd_FP32 : LogCumSumExpTestFwd<float>
{
};

struct GPU_LogCumSumExp_fwd_FP16 : LogCumSumExpTestFwd<half>
{
};

struct GPU_LogCumSumExp_fwd_BFP16 : LogCumSumExpTestFwd<bfloat16>
{
};

struct GPU_LogCumSumExp_bwd_FP32 : LogCumSumExpTestBwd<float>
{
};

struct GPU_LogCumSumExp_bwd_FP16 : LogCumSumExpTestBwd<half>
{
};

struct GPU_LogCumSumExp_bwd_BFP16 : LogCumSumExpTestBwd<bfloat16>
{
};

} // namespace logcumsumexp
using namespace logcumsumexp;

TEST_P(GPU_LogCumSumExp_fwd_FP32, Test)
{
RunTest();
Verify();
};

TEST_P(GPU_LogCumSumExp_fwd_FP16, Test)
{
RunTest();
Verify();
};

TEST_P(GPU_LogCumSumExp_fwd_BFP16, Test)
{
RunTest();
Verify();
};

INSTANTIATE_TEST_SUITE_P(Smoke,
GPU_LogCumSumExp_fwd_FP32,
testing::ValuesIn(LogCumSumTestConfigs()));
INSTANTIATE_TEST_SUITE_P(Smoke,
GPU_LogCumSumExp_fwd_FP16,
testing::ValuesIn(LogCumSumTestConfigs()));
INSTANTIATE_TEST_SUITE_P(Smoke,
GPU_LogCumSumExp_fwd_BFP16,
testing::ValuesIn(LogCumSumTestConfigs()));

TEST_P(GPU_LogCumSumExp_bwd_FP32, Test)
{
RunTest();
Verify();
};

TEST_P(GPU_LogCumSumExp_bwd_FP16, Test)
{
RunTest();
Verify();
};

TEST_P(GPU_LogCumSumExp_bwd_BFP16, Test)
{
RunTest();
Verify();
};

INSTANTIATE_TEST_SUITE_P(Smoke,
GPU_LogCumSumExp_bwd_FP32,
testing::ValuesIn(LogCumSumTestConfigs()));
INSTANTIATE_TEST_SUITE_P(Smoke,
GPU_LogCumSumExp_bwd_FP16,
testing::ValuesIn(LogCumSumTestConfigs()));
INSTANTIATE_TEST_SUITE_P(Smoke,
GPU_LogCumSumExp_bwd_BFP16,
testing::ValuesIn(LogCumSumTestConfigs()));
279 changes: 279 additions & 0 deletions test/gtest/logcumsumexp.hpp
@@ -0,0 +1,279 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/

#include "cpu_logcumsumexp.hpp"
#include "get_handle.hpp"
#include "random.hpp"
#include "tensor_holder.hpp"
#include "verify.hpp"

#include <gtest/gtest.h>
#include <miopen/miopen.h>
#include <miopen/logcumsumexp.hpp>
#include <miopen/logcumsumexp/solvers.hpp>

inline std::ostream& operator<<(std::ostream& os, const std::vector<size_t>& v)
{
os << '{';
for(size_t i = 0; i < v.size(); ++i)
{
if(i != 0)
os << ',';
os << v[i];
}
os << '}';
return os;
}

struct LogCumSumExpTestCase
{
std::vector<size_t> lengths;
int dim;
bool exclusive;
bool reverse;
bool contiguous;

friend std::ostream& operator<<(std::ostream& os, const LogCumSumExpTestCase& tc)
{
return os << " Lengths:" << tc.lengths << " Dim:" << tc.dim
<< " Exclusive:" << (tc.exclusive ? "True" : "False")
<< " Reverse:" << (tc.reverse ? "True" : "False")
<< " Contiguous:" << (tc.contiguous ? "True" : "False");
}
};

inline std::vector<size_t> GetStrides(std::vector<size_t> lengths, bool contiguous)
{
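// Non-contiguous case: swap the first and last lengths, compute packed strides,
// then swap back, yielding a transposed memory layout over the same logical shape.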
if(!contiguous)
std::swap(lengths.front(), lengths.back());
std::vector<size_t> strides(lengths.size());
strides.back() = 1;
for(int i = lengths.size() - 2; i >= 0; --i)
strides[i] = strides[i + 1] * lengths[i + 1];
if(!contiguous)
std::swap(strides.front(), strides.back());
return strides;
}

inline std::vector<LogCumSumExpTestCase> LogCumSumTestConfigs()
{
return {
{{1, 10}, 1, false, true, true},
{{1, 10}, 1, false, false, true},

{{1, 65}, 1, false, true, true},
{{1, 65}, 1, false, false, true},

{{65, 100}, 1, false, true, true},
{{65, 100}, 0, false, true, false},
{{65, 100}, 1, false, false, true},
{{65, 100}, 0, false, false, false},

{{70, 10}, 1, false, true, true},
{{70, 10}, 0, false, true, false},
{{70, 10}, 1, false, false, true},
{{70, 10}, 0, false, false, false},

{{512, 64, 112}, 2, false, true, true},
{{512, 64, 112}, 0, false, true, false},
{{512, 64, 112}, 2, false, false, true},
{{512, 64, 112}, 0, false, false, false},

{{1024, 7, 7, 1024}, 0, false, true, false},
{{1024, 7, 7, 1024}, 0, false, false, false},
};
}

template <typename T>
struct LogCumSumExpTestFwd : public ::testing::TestWithParam<LogCumSumExpTestCase>
{
protected:
void SetUp() override
{
auto&& handle = get_handle();
logcumsumexp_config = GetParam();
auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign<T>(1e-2, 100); };

auto lengths = logcumsumexp_config.lengths;

auto strides = GetStrides(lengths, logcumsumexp_config.contiguous);
input = tensor<T>{lengths, strides}.generate(gen_value);

output = tensor<T>{lengths, strides};
ref_output = tensor<T>{lengths, strides};

input_dev = handle.Write(input.data);
output_dev = handle.Write(output.data);
}

void RunTest()
{
cpu_logcumsumexp_forward<T>(input,
ref_output,
logcumsumexp_config.dim,
logcumsumexp_config.exclusive,
logcumsumexp_config.reverse);

auto&& handle = get_handle();
miopenStatus_t status;

status = miopen::logcumsumexp::LogCumSumExpForward(handle,
input.desc,
input_dev.get(),
output.desc,
output_dev.get(),
logcumsumexp_config.dim,
logcumsumexp_config.exclusive,
logcumsumexp_config.reverse);
EXPECT_EQ(status, miopenStatusSuccess);
output.data = handle.Read<T>(output_dev, output.data.size());
}

double GetTolerance()
{
double tolerance = std::numeric_limits<T>::epsilon() * 10;
return tolerance;
}

void Verify()
{
double tolerance = GetTolerance();

auto error_output = miopen::rms_range(ref_output, output);
ASSERT_EQ(miopen::range_distance(ref_output), miopen::range_distance(output));
EXPECT_LT(error_output, tolerance)
<< "Error forward Output beyond tolerance Error: " << error_output
<< " Tolerance: " << tolerance;
}

LogCumSumExpTestCase logcumsumexp_config;

tensor<T> input;
tensor<T> output;

tensor<T> ref_output;

miopen::Allocator::ManageDataPtr input_dev;
miopen::Allocator::ManageDataPtr output_dev;
};

template <typename T>
struct LogCumSumExpTestBwd : public ::testing::TestWithParam<LogCumSumExpTestCase>
{
protected:
void SetUp() override
{
auto&& handle = get_handle();
logcumsumexp_config = GetParam();
auto gen_value_input = [](auto...) {
return prng::gen_descreet_uniform_sign<T>(1e-2, 100);
};
auto gen_value_doutput = [](auto...) {
return prng::gen_descreet_uniform_sign<T>(1e-2, 100);
};

auto lengths = logcumsumexp_config.lengths;

auto strides = GetStrides(lengths, logcumsumexp_config.contiguous);
input = tensor<T>{lengths, strides}.generate(gen_value_input);
dinput = tensor<T>{lengths, strides};

output = tensor<T>{lengths, strides};
doutput = tensor<T>{lengths, strides}.generate(gen_value_doutput);

// Calculate output tensor value by forwarding input tensor
cpu_logcumsumexp_forward(input,
output,
logcumsumexp_config.dim,
logcumsumexp_config.exclusive,
logcumsumexp_config.reverse);

ref_dinput = tensor<T>{lengths, strides};

input_dev = handle.Write(input.data);
output_dev = handle.Write(output.data);
doutput_dev = handle.Write(doutput.data);
dinput_dev = handle.Write(dinput.data);
}

void RunTest()
{
cpu_logcumsumexp_backward<T>(input,
output,
doutput,
ref_dinput,
logcumsumexp_config.dim,
logcumsumexp_config.exclusive,
logcumsumexp_config.reverse);

auto&& handle = get_handle();
auto status = miopen::logcumsumexp::LogCumSumExpBackward(handle,
input.desc,
input_dev.get(),
output.desc,
output_dev.get(),
doutput.desc,
doutput_dev.get(),
dinput.desc,
dinput_dev.get(),
logcumsumexp_config.dim,
logcumsumexp_config.exclusive,
logcumsumexp_config.reverse);
EXPECT_EQ(status, miopenStatusSuccess);
dinput.data = handle.Read<T>(dinput_dev, dinput.data.size());
}

double GetTolerance()
{
double tolerance = std::numeric_limits<T>::epsilon() * 10;
return tolerance;
}

void Verify()
{
double tolerance = GetTolerance();

auto error_dinput = miopen::rms_range(ref_dinput, dinput);
ASSERT_EQ(miopen::range_distance(ref_dinput), miopen::range_distance(dinput));
EXPECT_LT(error_dinput, tolerance)
<< "Error backward Input Gradient beyond tolerance Error: " << error_dinput
<< " Tolerance: " << tolerance;
}

LogCumSumExpTestCase logcumsumexp_config;

tensor<T> input;
tensor<T> output;
tensor<T> doutput;
tensor<T> dinput;

tensor<T> ref_dinput;

miopen::Allocator::ManageDataPtr input_dev;
miopen::Allocator::ManageDataPtr output_dev;
miopen::Allocator::ManageDataPtr doutput_dev;
miopen::Allocator::ManageDataPtr dinput_dev;
};
4 changes: 4 additions & 0 deletions test/verify.hpp
@@ -133,6 +133,10 @@ struct square_diff_fn
template <class T, class U>
double operator()(T x, U y) const
{
// Treat equal-signed infinities as a zero difference: inf - inf is NaN, which would
// otherwise poison the RMS error even though the values agree.
if(!std::isnan(x) && !std::isnan(y) && std::isnan(x - y))
return 0;

double diff = static_cast<double>(x - y);
return diff * diff;
}