Integrate TPP code into IPEX (#1357)

* Add libxsmm as a third-party module

* Add TPP csrc files, but hit a build link issue

* Enable build

* Libxsmm initialization python binding

* Add python bindings for the tpp/optimizer and tpp/pad/bert modules

* SQuAD inference gets the expected result

* SQuAD training passes

* Add all TPP-related code into the torch_ipex::tpp namespace

* Register the BERT fusion ops in the torch.ops.torch_ipex namespace instead of using python bindings

* Enable unpad fp32 path

* Enable bf16 unpad

* Fix unpad import issue

* Enable ipex.tpp_optimize API

* Fix transformers check issue

* Update frontend.py

* Unify the unpad/padded code paths based on the original unpad code

* Remove unused padded code

* Remove extend_profiler.py

* Add UT for tpp

* Add UT for backward

* Update Module.cpp

* 1) tpp_optimize -> fast_bert
2) add a tpp prefix for the optimizer-related python APIs
3) fix the optimizer replacement issue for non-SGD/AdamW optimizers (see the usage sketch after this list)

* Create README.md

* Create README.md

* Fix clang format issue

* Fix UT fail when transformers is not installed

* Update test_tpp_ops.py

* Fix UT fail when transformers is not installed

* Fix UT fail

* Fix UT fail
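
For context, a minimal usage sketch of the `fast_bert` frontend referenced above; the `dtype` keyword and the `transformers` model are assumptions taken from the commit bullets and the new UTs, so consult `frontend.py` for the final signature.

```python
# Sketch only: parameter names are assumptions based on the commit message
# (tpp_optimize -> fast_bert); check frontend.py for the final API.
import torch
import intel_extension_for_pytorch as ipex
from transformers import BertModel

model = BertModel.from_pretrained("bert-base-uncased").eval()

# Ask IPEX to swap the BERT blocks for the TPP fused (unpad) kernels added here.
model = ipex.fast_bert(model, dtype=torch.bfloat16)

with torch.no_grad():
    input_ids = torch.randint(0, 30522, (1, 128))  # BERT-base vocab size
    output = model(input_ids)
```
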
liangan1 authored Feb 16, 2023
1 parent 82fb517 commit 82b6aa0
Showing 41 changed files with 13,359 additions and 5 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
@@ -4,3 +4,6 @@
[submodule "third_party/ideep"]
path = third_party/ideep
url = https://github.com/intel/ideep.git
[submodule "third_party/libxsmm"]
path = third_party/libxsmm
url = https://github.com/libxsmm/libxsmm.git
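
When building from source at this commit, the new submodule needs to be fetched first, e.g. `git submodule update --init third_party/libxsmm` (or `git submodule update --init --recursive` to cover all submodules).
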
20 changes: 17 additions & 3 deletions csrc/cpu/CMakeLists.txt
@@ -46,6 +46,7 @@ set(IPEX_CPU_CPP_ISA_SRCS)
set(IPEX_CPU_CPP_TOOLKIT_SRCS)
set(IPEX_CPU_CPP_IDEEP_SRCS)
set(IPEX_CPU_CPP_RUNTIME_SRCS)
set(IPEX_CPU_CPP_TPP_SRCS)

set(IPEX_JIT_CPP_SRCS)
set(IPEX_UTLIS_CPP_SRCS)
@@ -62,13 +63,13 @@ add_subdirectory(${IPEX_CPU_ROOT_DIR}/isa)
add_subdirectory(${IPEX_CPU_ROOT_DIR}/toolkit)
add_subdirectory(${IPEX_CPU_ROOT_DIR}/runtime)
add_subdirectory(${IPEX_CPU_ROOT_DIR}/utils)
add_subdirectory(${IPEX_CPU_ROOT_DIR}/tpp)

add_subdirectory(${IPEX_JIT_CPP_ROOT} jit_cpu)
add_subdirectory(${IPEX_UTLIS_CPP_ROOT} csrc_utlis)

set(IPEX_CPU_CPP_SRCS ${IPEX_CPU_CPP_DYNDISP_SRCS} ${IPEX_CPU_CPP_ISA_SRCS_GEN} ${IPEX_CPU_CPP_UTILS_SRCS} ${IPEX_CPU_CPP_QUANTIZATION_SRCS} ${IPEX_JIT_CPP_SRCS}
${IPEX_CPU_CPP_ISA_SRCS} ${IPEX_CPU_CPP_IDEEP_SRCS} ${IPEX_CPU_CPP_AUTOCAST_SRCS} ${IPEX_CPU_CPP_ATEN_SRCS} ${IPEX_CPU_CPP_RUNTIME_SRCS} ${IPEX_UTLIS_CPP_SRCS}
${IPEX_CPU_CPP_TOOLKIT_SRCS})
${IPEX_CPU_CPP_ISA_SRCS} ${IPEX_CPU_CPP_IDEEP_SRCS} ${IPEX_CPU_CPP_AUTOCAST_SRCS} ${IPEX_CPU_CPP_ATEN_SRCS} ${IPEX_CPU_CPP_RUNTIME_SRCS} ${IPEX_CPU_CPP_TOOLKIT_SRCS} ${IPEX_UTLIS_CPP_SRCS} ${IPEX_CPU_CPP_TPP_SRCS})

list(REMOVE_ITEM IPEX_CPU_CPP_SRCS ${IPEX_CPU_CPP_ISA_SRCS_ORIGIN})

@@ -81,10 +82,12 @@ target_include_directories(${PLUGIN_NAME_CPU} PUBLIC ${IPEX_ROOT_DIR})
target_include_directories(${PLUGIN_NAME_CPU} PUBLIC ${IPEX_CPU_ROOT_DIR})
target_include_directories(${PLUGIN_NAME_CPU} PUBLIC ${IPEX_CPU_ROOT_DIR}/aten)
target_include_directories(${PLUGIN_NAME_CPU} PUBLIC ${IPEX_CPU_ROOT_DIR}/utils)
target_include_directories(${PLUGIN_NAME_CPU} PUBLIC ${IPEX_CPU_ROOT_DIR}/tpp)

target_include_directories(${PLUGIN_NAME_CPU} PUBLIC ${IPEX_JIT_CPP_ROOT})
target_include_directories(${PLUGIN_NAME_CPU} PUBLIC ${IPEX_UTLIS_CPP_ROOT})

target_include_directories(${PLUGIN_NAME_CPU} PUBLIC ${IPEX_CPU_CPP_THIRD_PARTY_ROOT}/libxsmm/include)
target_include_directories(${PLUGIN_NAME_CPU} PUBLIC ${IPEX_CPU_CPP_THIRD_PARTY_ROOT}/ideep/mkl-dnn/include)
target_include_directories(${PLUGIN_NAME_CPU} PUBLIC ${IPEX_CPU_CPP_THIRD_PARTY_ROOT}/ideep/mkl-dnn/third_party/oneDNN/include)
# TODO: once llga is merged into oneDNN, use oneDNN directly as the third_party instead of using that inside llga
@@ -101,6 +104,18 @@ if(CLANG_FORMAT)
add_dependencies(${PLUGIN_NAME_CPU} CL_FORMAT_CPU_NATIVE_CSRC)
endif()

include(${CMAKE_ROOT}/Modules/ExternalProject.cmake)
ExternalProject_Add(libxsmm
  SOURCE_DIR ${IPEX_CPU_CPP_THIRD_PARTY_ROOT}/libxsmm
  BUILD_IN_SOURCE 1
  CONFIGURE_COMMAND ""
  BUILD_COMMAND
    make
    "AVX=3"
    "-j"
  INSTALL_COMMAND ""
)
target_link_libraries(${PLUGIN_NAME_CPU} PRIVATE ${IPEX_CPU_CPP_THIRD_PARTY_ROOT}/libxsmm/lib/libxsmm.a)
add_dependencies(${PLUGIN_NAME_CPU} dnnl_graph)
# If Graph Compiler is built, then it should link to its LLVM dependencies,
# and not the LLVM symbols exposed by PyTorch.
@@ -114,7 +129,6 @@ if (DEFINED ENV{DNNL_GRAPH_BUILD_COMPILER_BACKEND})
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--exclude-libs=${DNNL_GRAPHCOMPILER_LLVM_LIB_EXCLUDE}")
endif()
endif()

find_package(oneMKL QUIET)
if (ONEMKL_FOUND)
target_include_directories(${PLUGIN_NAME_CPU} PUBLIC ${ONEMKL_INCLUDE_DIR})
6 changes: 6 additions & 0 deletions csrc/cpu/tpp/CMakeLists.txt
@@ -0,0 +1,6 @@
FILE(GLOB _TPP_SRCS *.cpp bert/*.cpp)
LIST(APPEND IPEX_CPU_CPP_TPP_SRCS ${_TPP_SRCS})
# LIST(APPEND IPEX_CPU_CPP_ATEN_SRCS ${_CPU_KERNELS_SRCS})
message(STATUS "IPEX_CPU_CPP_TPP_SRCS: ${IPEX_CPU_CPP_TPP_SRCS}")
# Pass to parent
set(IPEX_CPU_CPP_TPP_SRCS ${IPEX_CPU_CPP_TPP_SRCS} PARENT_SCOPE)
31 changes: 31 additions & 0 deletions csrc/cpu/tpp/README.md
@@ -0,0 +1,31 @@
# Overview
This directory mainly includes the TPP-related tools and the fused kernel implementations built on top of the TPP optimizations. The fused BERT kernels are exposed as `*_unpad` ops that work on a packed, pad-free token layout; a short sketch after the file tree below illustrates that layout.

```
├── bert #fused kernels based on TPP
│ ├── fused_bert.cpp
│ ├── fused_dense_dropout_layernorm_bwd_tmpl.h #backward for fused linear+dropout+layernorm
│ ├── fused_dense_dropout_layernorm_fwd_tmpl.h #forward for fused linear+dropout+layernorm
│ ├── fused_dense_gelu_bwd_tmpl.h #backward for fused linear+gelu
│ ├── fused_dense_gelu_fwd_tmpl.h #forward for fused linear+gelu
│ ├── fused_embedding_layernorm_dropout_bwd_tmpl.h #backward for fused embedding+add+layernorm+dropout
│ ├── fused_embedding_layernorm_dropout_fwd_tmpl.h #forward for fused embedding+add+layernorm+dropout
│ ├── fused_self_attention_bwd_tmpl.h #fused backward self-attention
│ └── fused_self_attention_fwd_tmpl.h #fused forward self-attention
├── CMakeLists.txt
├── common_loops.cpp #loops generation and tuning
├── ext_tpp.h
├── init.cpp
├── jit_compile.cpp
├── jit_compile.h
├── optim.cpp
├── optim.h
├── par_loop_generator.cpp #loops generation and tuning
├── par_loop_generator.h #loops generation and tuning
├── rtm.h
├── tensor_helper.h
├── threaded_loops.h
├── timing.h
├── utils.h
└── xsmm_functors.h #the TPP definitions based on libxsmm
```
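
To make the "unpad" wording concrete, here is a small illustrative sketch (not the kernel implementation; shapes are invented) of the packed, pad-free layout the `*_unpad` kernels consume:

```python
# Illustrative only: the *_unpad kernels work on a packed representation in
# which padding tokens have been removed, roughly as below.
import torch

hidden = torch.randn(2, 5, 8)                # [batch, seq_len, hidden]
mask = torch.tensor([[1, 1, 1, 0, 0],
                     [1, 1, 1, 1, 0]])       # attention mask, 1 = real token

keep = mask.flatten().nonzero(as_tuple=True)[0]
packed = hidden.reshape(-1, 8)[keep]         # [num_real_tokens, hidden]
print(packed.shape)                          # torch.Size([7, 8])
```
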
276 changes: 276 additions & 0 deletions csrc/cpu/tpp/bert/fused_bert.cpp
@@ -0,0 +1,276 @@

#include <ATen/record_function.h>
//#include <torch/csrc/autograd/VariableTypeUtils.h>
//#include <torch/extension.h>

#include <dyndisp/DispatchStub.h>
#include <torch/all.h>
#include <iostream>
#include <vector>
#include "ext_tpp.h"
//#include "init.h"
#include "tensor_helper.h"
#include "threaded_loops.h"
#include "timing.h"
#include "xsmm_functors.h"

namespace torch_ipex {
namespace tpp {

static int my_rank = guess_mpi_rank();

REGISTER_LOCAL_SCOPE(b_emb, "b_emb");
REGISTER_LOCAL_SCOPE(q_gemm, "q_gemm");
REGISTER_LOCAL_SCOPE(k_gemm, "k_gemm");
REGISTER_LOCAL_SCOPE(v_gemm, "v_gemm");
REGISTER_LOCAL_SCOPE(ac_gemm, "ac_gemm");
REGISTER_LOCAL_SCOPE(o_gemm, "o_gemm");
REGISTER_LOCAL_SCOPE(i_gemm, "i_gemm");

REGISTER_LOCAL_SCOPE(db_emb, "db_emb");
REGISTER_LOCAL_SCOPE(diq_gemm, "diq_gemm");
REGISTER_LOCAL_SCOPE(dik_gemm, "dik_gemm");
REGISTER_LOCAL_SCOPE(div_gemm, "div_gemm");
REGISTER_LOCAL_SCOPE(dica_gemm, "dica_gemm");
REGISTER_LOCAL_SCOPE(dii_gemm, "dii_gemm");
REGISTER_LOCAL_SCOPE(dio_gemm, "dio_gemm");
REGISTER_LOCAL_SCOPE(dwqkv_gemm, "dwqkv_gemm");
REGISTER_LOCAL_SCOPE(dwq_gemm, "dwq_gemm");
REGISTER_LOCAL_SCOPE(dwk_gemm, "dwk_gemm");
REGISTER_LOCAL_SCOPE(dwv_gemm, "dwv_gemm");
REGISTER_LOCAL_SCOPE(dwa_gemm, "dwa_gemm");
REGISTER_LOCAL_SCOPE(dwc_gemm, "dwc_gemm");
REGISTER_LOCAL_SCOPE(dac_gemm, "dac_gemm");
REGISTER_LOCAL_SCOPE(dwi_gemm, "dwi_gemm");
REGISTER_LOCAL_SCOPE(dwo_gemm, "dwo_gemm");
REGISTER_LOCAL_SCOPE(dqkv_bias, "dqkv_bias");
REGISTER_LOCAL_SCOPE(di_bias, "di_bias");
REGISTER_LOCAL_SCOPE(do_bias, "do_bias");

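// Reduces the per-thread partial buffers in `ptrs` (num_threads buffers of
// length N) element-wise into `buf`, optionally accumulating. The
// `#pragma omp for` below implies the call happens inside an existing
// OpenMP parallel region.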
template <typename T>
inline void omp_reduce_buf(
    int num_threads,
    int N,
    float** ptrs,
    T* buf,
    bool accumulate = false) {
  ScopedTimer _t(EW_RED);
#pragma omp for
  for (int i = 0; i < N; i++) {
    float sum = 0.0;
    for (int j = 0; j < num_threads; j++) {
      sum += ptrs[j][i];
    }
    if (accumulate) {
      buf[i] += sum;
    } else {
      buf[i] = sum;
    }
  }
}

static std::vector<at::Tensor> fused_self_attention_fwd_unpad(
double p,
std::vector<at::Tensor> inputs,
bool training) {
GlobalPass _gp(FWD);
if (inputs[6].dtype() == at::kFloat) {
typedef float T;
#include "fused_self_attention_fwd_tmpl.h"
} else {
typedef bfloat16 T;
#include "fused_self_attention_fwd_tmpl.h"
}
}

static std::vector<at::Tensor> fused_self_attention_bwd_unpad(
double p,
std::vector<at::Tensor> inputs) {
GlobalPass _gp(BWD);
if (inputs[0].dtype() == at::kFloat) {
typedef float T;
#include "fused_self_attention_bwd_tmpl.h"
} else {
typedef bfloat16 T;
#include "fused_self_attention_bwd_tmpl.h"
}
}

static std::vector<at::Tensor> fused_dense_dropout_layernorm_fwd_unpad(
double p,
double eps,
std::vector<at::Tensor> inputs,
bool training) {
GlobalPass _gp(FWD);
if (inputs[0].dtype() == at::kFloat) {
typedef float T;
#include "fused_dense_dropout_layernorm_fwd_tmpl.h"
} else {
typedef bfloat16 T;
#include "fused_dense_dropout_layernorm_fwd_tmpl.h"
}
}

static std::vector<at::Tensor> fused_dense_dropout_layernorm_bwd_unpad(
double p,
std::vector<at::Tensor> inputs) {
GlobalPass _gp(BWD);
if (inputs[0].dtype() == at::kFloat) {
typedef float T;
#include "fused_dense_dropout_layernorm_bwd_tmpl.h"
} else {
typedef bfloat16 T;
#include "fused_dense_dropout_layernorm_bwd_tmpl.h"
}
}

static std::vector<at::Tensor> fused_dense_gelu_fwd_unpad(
at::Tensor t_in,
at::Tensor t_wt,
at::Tensor t_bias,
bool training) {
GlobalPass _gp(FWD);
if (t_in.dtype() == at::kFloat) {
typedef float T;
#include "fused_dense_gelu_fwd_tmpl.h"
} else {
typedef bfloat16 T;
#include "fused_dense_gelu_fwd_tmpl.h"
}
}

static std::vector<at::Tensor> fused_dense_gelu_bwd_unpad(
at::Tensor t_grad_out,
at::Tensor t_gelu_in,
at::Tensor t_in,
at::Tensor t_wt) {
GlobalPass _gp(BWD);
if (t_grad_out.dtype() == at::kFloat) {
typedef float T;
#include "fused_dense_gelu_bwd_tmpl.h"
} else {
typedef bfloat16 T;
#include "fused_dense_gelu_bwd_tmpl.h"
}
}

static std::vector<at::Tensor> fused_embedding_layernorm_dropout_fwd_unpad(
double p,
double eps,
long H,
long pad_id,
std::vector<at::Tensor> inputs,
bool training) {
GlobalPass _gp(FWD);
if (inputs[4].dtype() == at::kFloat && inputs[6].dtype() == at::kFloat) {
typedef float T;
typedef float ET;
#include "fused_embedding_layernorm_dropout_fwd_tmpl.h"
} else if (
inputs[4].dtype() == at::kBFloat16 && inputs[6].dtype() == at::kFloat) {
typedef bfloat16 T;
typedef float ET;
#include "fused_embedding_layernorm_dropout_fwd_tmpl.h"
} else if (
inputs[4].dtype() == at::kFloat && inputs[6].dtype() == at::kBFloat16) {
typedef float T;
typedef bfloat16 ET;
#include "fused_embedding_layernorm_dropout_fwd_tmpl.h"
} else if (
inputs[4].dtype() == at::kBFloat16 &&
inputs[6].dtype() == at::kBFloat16) {
typedef bfloat16 T;
typedef bfloat16 ET;
#include "fused_embedding_layernorm_dropout_fwd_tmpl.h"
} else {
PCL_ASSERT(0, "Should not come here\n");
}
}

static std::vector<at::Tensor> fused_embedding_layernorm_dropout_bwd_unpad(
double p,
long pad_id,
std::vector<at::Tensor> inputs) {
GlobalPass _gp(BWD);
if (inputs[0].dtype() == at::kFloat && inputs[6].dtype() == at::kFloat) {
typedef float T;
typedef float ET;
#include "fused_embedding_layernorm_dropout_bwd_tmpl.h"
} else if (
inputs[0].dtype() == at::kBFloat16 && inputs[6].dtype() == at::kFloat) {
typedef bfloat16 T;
typedef float ET;
#include "fused_embedding_layernorm_dropout_bwd_tmpl.h"
} else if (
inputs[0].dtype() == at::kFloat && inputs[6].dtype() == at::kBFloat16) {
typedef float T;
typedef bfloat16 ET;
#include "fused_embedding_layernorm_dropout_bwd_tmpl.h"
} else if (
inputs[0].dtype() == at::kBFloat16 &&
inputs[6].dtype() == at::kBFloat16) {
typedef bfloat16 T;
typedef bfloat16 ET;
#include "fused_embedding_layernorm_dropout_bwd_tmpl.h"
} else {
PCL_ASSERT(0, "Should not come here\n");
}
}
} // namespace tpp
} // namespace torch_ipex
namespace {
TORCH_LIBRARY_FRAGMENT(torch_ipex, m) {
m.def(
torch::schema(
"torch_ipex::fused_self_attention_fwd_unpad(float p, Tensor[] inputs, bool training) -> Tensor[]",
c10::AliasAnalysisKind::PURE_FUNCTION),
torch_ipex::tpp::fused_self_attention_fwd_unpad);

m.def(
torch::schema(
"torch_ipex::fused_self_attention_bwd_unpad(float p, Tensor[] inputs) -> Tensor[]",
c10::AliasAnalysisKind::PURE_FUNCTION),
torch_ipex::tpp::fused_self_attention_bwd_unpad);

m.def(
torch::schema(
"torch_ipex::fused_dense_dropout_layernorm_fwd_unpad(float p, float eps, Tensor[] inputs, bool training) -> Tensor[]",
c10::AliasAnalysisKind::PURE_FUNCTION),
torch_ipex::tpp::fused_dense_dropout_layernorm_fwd_unpad);

m.def(
torch::schema(
"torch_ipex::fused_dense_dropout_layernorm_bwd_unpad(float p, Tensor[] inputs) -> Tensor[]",
c10::AliasAnalysisKind::PURE_FUNCTION),
torch_ipex::tpp::fused_dense_dropout_layernorm_bwd_unpad);

m.def(
torch::schema(
"torch_ipex::fused_dense_gelu_fwd_unpad(Tensor t_in, Tensor t_wt, Tensor "
"t_bias, bool training)->Tensor[] ",
c10::AliasAnalysisKind::PURE_FUNCTION),
torch_ipex::tpp::fused_dense_gelu_fwd_unpad);

m.def(
torch::schema(
"torch_ipex::fused_dense_gelu_bwd_unpad(Tensor t_grad_out, Tensor t_gelu_in,"
"Tensor t_in, Tensor t_wt) -> Tensor[]",
c10::AliasAnalysisKind::PURE_FUNCTION),
torch_ipex::tpp::fused_dense_gelu_bwd_unpad);

m.def(
torch::schema(
"torch_ipex::fused_embedding_layernorm_dropout_fwd_unpad(float p, float "
"eps, int H, int pad_id, Tensor(a!)[] inputs, bool training) ->"
"Tensor[]",
c10::AliasAnalysisKind::PURE_FUNCTION),
torch_ipex::tpp::fused_embedding_layernorm_dropout_fwd_unpad);

m.def(
torch::schema(
"torch_ipex::fused_embedding_layernorm_dropout_bwd_unpad(float p, int "
"pad_id, Tensor(a!)[] inputs)->Tensor[] ",
c10::AliasAnalysisKind::PURE_FUNCTION),
torch_ipex::tpp::fused_embedding_layernorm_dropout_bwd_unpad);
}
} // namespace
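
Once the extension is built with this change, these registrations surface under `torch.ops.torch_ipex`. A minimal sketch that only lists the new ops, since their inputs are assumed to require the blocked tensors prepared by the TPP BERT frontend:

```python
# Sketch: lists the newly registered ops without calling them; their tensor
# layouts are assumed to come from the fast_bert path, not raw torch.randn.
import torch
import intel_extension_for_pytorch  # noqa: F401  # loads the CPU op library

for name in (
    "fused_self_attention_fwd_unpad",
    "fused_dense_dropout_layernorm_fwd_unpad",
    "fused_dense_gelu_fwd_unpad",
    "fused_embedding_layernorm_dropout_fwd_unpad",
):
    print(getattr(torch.ops.torch_ipex, name))
```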