
Commit 8e69c3b

csarofeen authored and facebook-github-bot committed
[nvFuser] Reduction support in codegen, fp16 support (pytorch#38627)
Summary: Adds reduction support to the code generator. Reductions are fully supported with the split/merge/reorder/rfactor/computeAt/unroll operators, and cross-thread (intra-block) reductions are supported as well. Two pieces are still missing for full reduction support:

- Safety: if a cross-thread reduction was used, child operators should no longer be able to bind that thread dimension.
- Cross-block reduction: inter-block reduction support is needed to reach parity with TensorIterator.

The PR also adds FP16 support for fusions: casts to FP32 are inserted on FP16 inputs, and casts back to FP16 are inserted on FP16 outputs. It also works toward reductions, and shape inference for reductions, in the fusion pass.

Pull Request resolved: pytorch#38627
Reviewed By: albanD
Differential Revision: D21663196
Pulled By: soumith
fbshipit-source-id: 3ff2df563f86c39cd5821ab9c1148149e5172a9e
1 parent d3b0cf9 commit 8e69c3b
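
The FP16 handling deserves a concrete illustration: the fuser keeps all arithmetic in FP32 by wrapping half-precision tensors in casts. Below is a minimal sketch of that strategy in terms of the castOp/add helpers declared in torch/csrc/jit/codegen/cuda/arith.h (shown later in this commit); the enclosing function, the DataType::Half enum name, and the namespace are assumptions for illustration, not code from this diff.

#include <torch/csrc/jit/codegen/cuda/arith.h>

using namespace torch::jit::fuser; // namespace assumed for this sketch

// Sketch: FP16 inputs -> cast up to FP32 -> compute -> cast back to FP16.
Val* fusedHalfAdd(Val* x_fp16, Val* y_fp16) {
  Val* x = castOp(DataType::Float, x_fp16); // cast inserted on an FP16 input
  Val* y = castOp(DataType::Float, y_fp16); // cast inserted on an FP16 input
  Val* o = add(x, y);                       // all math runs in FP32
  return castOp(DataType::Half, o);         // cast inserted on the FP16 output
}

The test_half test added to test/test_jit_cuda_fuser.py below exercises this path end to end through the JIT.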


50 files changed: +5312 −1769 lines

aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h

Lines changed: 4 additions & 1 deletion
@@ -48,7 +48,10 @@ namespace at { namespace cuda {
   _(cuLaunchKernel) \
   _(cuCtxGetCurrent) \
   _(cuModuleUnload) \
-  _(cuDevicePrimaryCtxGetState)
+  _(cuDevicePrimaryCtxGetState) \
+  _(cuLinkCreate) \
+  _(cuLinkAddData) \
+  _(cuLinkComplete)
 
 #else
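
The three new cuLink* entries expose the CUDA driver's runtime linker through this lazily-resolved stub table, presumably so generated PTX can be linked into a cubin when a fusion is compiled. For orientation, here is a minimal sketch of how that driver API is used; these are standard CUDA driver calls, not code from this PR, and error handling is omitted.

#include <cuda.h>
#include <cstddef>
#include <cstdio>

// Link a PTX blob into a cubin with the driver's runtime linker.
void linkPtx(const char* ptx, size_t ptx_len) {
  CUlinkState state;
  void* cubin = nullptr;
  size_t cubin_size = 0;
  cuLinkCreate(0, nullptr, nullptr, &state);   // fresh link state, no options
  cuLinkAddData(state, CU_JIT_INPUT_PTX,       // feed the PTX by value
                const_cast<char*>(ptx), ptx_len,
                "fusion_kernel", 0, nullptr, nullptr);
  cuLinkComplete(state, &cubin, &cubin_size);  // produce the linked cubin
  std::printf("linked cubin: %zu bytes\n", cubin_size);
  // `cubin` is owned by `state`; load it (e.g. with cuModuleLoadData)
  // before tearing the state down with cuLinkDestroy.
}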

caffe2/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -429,6 +429,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
   ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/tensor_view.cpp
   ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/transform_iter.cpp
   ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/transform_replay.cpp
+  ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/transform_rfactor.cpp
   ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/type.cpp
   ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/utils.cpp
   ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/register_interface.cpp

test/cpp/jit/test_gpu.cpp

Lines changed: 814 additions & 283 deletions
Large diff not rendered here.

test/cpp/jit/tests.h

Lines changed: 6 additions & 3 deletions
@@ -113,10 +113,10 @@ namespace jit {
   _(GPU_FusionTVReorder) \
   _(GPU_FusionEquality) \
   _(GPU_FusionReplaceAll) \
+  _(GPU_FusionParser) \
   _(GPU_FusionDependency) \
   _(GPU_FusionCodeGen) \
   _(GPU_FusionCodeGen2) \
-  _(GPU_FusionCodeGen3) \
   _(GPU_FusionSimplePWise) \
   _(GPU_FusionExecKernel) \
   _(GPU_FusionForLoop) \
@@ -125,8 +125,11 @@ namespace jit {
   _(GPU_FusionBinaryOps) \
   _(GPU_FusionTernaryOps) \
   _(GPU_FusionCompoundOps) \
-  _(GPU_FusionCastOps)
-  //_(GPU_FusionCodeGen4)
+  _(GPU_FusionAdvancedComputeAt) \
+  _(GPU_FusionScalarInputs) \
+  _(GPU_FusionRFactorReplay) \
+  _(GPU_FusionReduction) \
+  _(GPU_FusionReduction2)
 #else
 #define TH_FORALL_TESTS_CUDA(_) \
   _(ArgumentSpec) \

test/test_jit_cuda_fuser.py

Lines changed: 58 additions & 1 deletion
@@ -4,10 +4,12 @@
 from __future__ import unicode_literals
 
 import unittest
+import os
 
 import torch
 
 from torch.testing._internal.common_utils import run_tests, ProfilingMode, GRAPH_EXECUTOR, skipIfRocm
+from torch.testing._internal.codegen.random_topo_test import runDefaultTestWithSeed
 
 from test_jit import JitTestCase, RUN_CUDA
 
@@ -52,6 +54,31 @@ def _has_cuda_fusion_group(self, graph):
                 has_cuda_fusion_group = True
         return has_cuda_fusion_group
 
+    @unittest.skipIf(not RUN_CUDA, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires profiling node to run cuda fuser")
+    @skipIfRocm
+    def test_half(self):
+        def t(x : torch.Tensor, y : torch.Tensor, z : torch.Tensor, alpha : float):
+            o_16 = torch.add(x, y)
+            o_32_a = torch.add(y, z, alpha=alpha)
+            o_32_b = torch.add(o_16, z)
+            return (o_16, o_32_a, o_32_b)
+
+        t_jit = torch.jit.script(t)
+        alpha = 0.5
+        # stick to integers, this avoid the numerical difference due to our
+        # promotion
+        x = torch.randint(0, 256, (4, 8)).to(dtype=torch.float16, device="cuda")
+        y = torch.randint(0, 256, (4, 8)).to(dtype=torch.float16, device="cuda")
+        z = torch.randint(0, 256, (4, 8)).to(dtype=torch.float16, device="cuda")
+        jit_o = t_jit(x, y, z, alpha)
+        jit_o = t_jit(x, y, z, alpha)
+        o = t(x, y, z, alpha)
+        for oo, jit_oo in zip(o, jit_o):
+            self.assertEqual(oo.dtype, jit_oo.dtype)
+            self.assertEqual(oo, jit_oo)
+        self.assertTrue(self._has_cuda_fusion_group(t_jit.graph_for(x, y, z, alpha)))
+
     @unittest.skipIf(not RUN_CUDA, "requires CUDA")
     @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires profiling node to run cuda fuser")
     @skipIfRocm
@@ -149,7 +176,6 @@ def t(x : torch.Tensor, y : torch.Tensor, z : torch.Tensor):
         # Currently cannot fuse this
         self.assertTrue(self._has_cuda_fusion_group(t_jit.graph_for(x, y, z)))
 
-    @unittest.skipIf(True, "temporary disable for buggy codegen")
     @unittest.skipIf(not RUN_CUDA, "requires CUDA")
     @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires profiling node to run cuda fuser")
     @skipIfRocm
@@ -299,5 +325,36 @@ def where(x : torch.Tensor, y : torch.Tensor, cond : torch.Tensor):
         where_jit = torch.jit.script(where)
         self._run_helper(where_jit, where, True, x, y, cond)
 
+    @unittest.skipIf(not RUN_CUDA, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.LEGACY, "Requires profiling node to run cuda fuser")
+    @skipIfRocm
+    def test_dynamic_size(self):
+        def t(x : torch.Tensor, y : torch.Tensor, z : float):
+            o = x + y
+            o = o + z
+            return o
+        t_jit = torch.jit.script(t)
+        x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
+        y = torch.randn(32, 32, dtype=torch.float, device="cuda")
+        jit_o = t_jit(x, y, 2.0)
+        jit_o = t_jit(x, y, 2.0)
+        o = t(x, y, 2.0)
+        self.assertEqual(o, jit_o)
+        self.assertTrue(self._has_cuda_fusion_group(t_jit.graph_for(x, y, 2.0)))
+        x = torch.randn(8, 32, 16, 8, dtype=torch.float, device="cuda")
+        y = torch.randn(16, 8, dtype=torch.float, device="cuda")
+        jit_o = t_jit(x, y, 2.0)
+        o = t(x, y, 2.0)
+        self.assertEqual(o, jit_o)
+        x = torch.randn(8, 17, 8, dtype=torch.float, device="cuda")
+        y = torch.randn(8, 17, 1, dtype=torch.float, device="cuda")
+        jit_o = t_jit(x, y, 2.0)
+
+    @unittest.skipIf(not RUN_CUDA, "requires CUDA")
+    @skipIfRocm
+    def test_random_topo(self):
+        os.environ["PYTORCH_CUDA_FUSER_DISABLE_FALLBACK"] = "1"
+        self.assertTrue(runDefaultTestWithSeed(28449))
+
 if __name__ == '__main__':
     run_tests()

tools/build_variables.bzl

Lines changed: 1 addition & 0 deletions
@@ -314,6 +314,7 @@ libtorch_cuda_sources = [
     "torch/csrc/jit/codegen/cuda/tensor_view.cpp",
     "torch/csrc/jit/codegen/cuda/transform_iter.cpp",
     "torch/csrc/jit/codegen/cuda/transform_replay.cpp",
+    "torch/csrc/jit/codegen/cuda/transform_rfactor.cpp",
     "torch/csrc/jit/codegen/cuda/type.cpp",
     "torch/csrc/jit/codegen/cuda/utils.cpp",
     "torch/csrc/jit/codegen/cuda/register_interface.cpp",

torch/csrc/jit/codegen/cuda/arith.cpp

Lines changed: 114 additions & 11 deletions
@@ -1,6 +1,7 @@
 #include <torch/csrc/jit/codegen/cuda/arith.h>
 #include <c10/util/Exception.h>
-#include <torch/csrc/jit/codegen/cuda/ir_internal_nodes.h>
+#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
+#include <torch/csrc/jit/codegen/cuda/type.h>
 
 namespace torch {
 namespace jit {
@@ -61,12 +62,42 @@ TORCH_CUDA_API Val* promoteNew(Val* v1, Val* v2) {
   return newValLike(v1, out_dtype);
 }
 
+Val* newConstScalar(DataType dtype, int val) {
+  switch (dtype) {
+    case (DataType::Int):
+      return new Int(val);
+    default:
+      break;
+  }
+  TORCH_CHECK(
+      false,
+      "Could not generate a new Scalar with data type ",
+      dtype,
+      "and constant value: ",
+      val);
+}
+
+Val* newConstScalar(DataType dtype, float val) {
+  switch (dtype) {
+    case (DataType::Float):
+      return new Float(val);
+    default:
+      break;
+  }
+  TORCH_CHECK(
+      false,
+      "Could not generate a new Scalar with data type ",
+      dtype,
+      "and constant value: ",
+      val);
+}
+
 TORCH_CUDA_API Val* castOp(DataType dtype, Val* v1) {
   if (v1->getDataType().value() == dtype)
     return v1;
 
-  auto uop_type = cast_type(v1->getDataType().value(), dtype);
-  if (uop_type == c10::nullopt) {
+  if (cast_func_str(std::make_pair(v1->getDataType().value(), dtype)) ==
+      c10::nullopt) {
     TORCH_CHECK(
         false,
         "Illegal Cast value from DataType: ",
@@ -76,16 +107,20 @@ TORCH_CUDA_API Val* castOp(DataType dtype, Val* v1) {
   }
 
   Val* out = newValLike(v1, dtype);
-  Statement* expr = new UnaryOp(uop_type.value(), out, v1);
+  new UnaryOp(UnaryOpType::Cast, out, v1);
   return out;
 }
 
+// UNARY OPERATIONS
+
 TORCH_CUDA_API Val* unaryOp(UnaryOpType type, Val* v1) {
   Val* out = newValLike(v1);
-  Statement* expr = new UnaryOp(type, out, v1);
+  new UnaryOp(type, out, v1);
   return out;
 }
 
+// BINARY OPERATIONS
+
 TORCH_CUDA_API Val* binaryOp(BinaryOpType type, Val* v1, Val* v2) {
   Val* out = promoteNew(v1, v2);
   if (is_logical_op(type)) {
@@ -95,7 +130,7 @@ TORCH_CUDA_API Val* binaryOp(BinaryOpType type, Val* v1, Val* v2) {
     if (out->getDataType().value() != DataType::Int)
       out = newValLike(out, DataType::Int);
   }
-  Statement* expr = new BinaryOp(type, out, v1, v2);
+  new BinaryOp(type, out, v1, v2);
   return out;
 }
 
@@ -139,6 +174,72 @@ TORCH_CUDA_API Val* andOp(Val* v1, Val* v2) {
   return binaryOp(BinaryOpType::And, v1, v2);
 }
 
+// REDUCTION OPERATIONS
+
+Val* reductionOp(
+    BinaryOpType reduction_op_type,
+    const std::vector<int>& axes,
+    Val* init,
+    Val* v1) {
+  TORCH_CHECK(
+      v1->getValType().value() == ValType::TensorView,
+      "Cannot reduce on values that are not TensorViews, but recieved type ",
+      v1->getValType().value());
+
+  TORCH_CHECK(
+      init->isConstScalar(),
+      "Cannot create a reduction operation where the initial value is not a const scalar.");
+
+  TensorView* tv = static_cast<TensorView*>(v1);
+
+  TORCH_CHECK(
+      tv->getRootDomain() == tv->domain(),
+      "Reducing a tensor once it's gone under transformations is not permitted at this time. Please set reductions before calling split/merge/reorder/computeAt.");
+
+  std::vector<unsigned int> uint_axes;
+  for (int axis : axes) {
+    if (axis < 0)
+      axis += int(tv->nDims());
+
+    TORCH_CHECK(
+        axis >= 0 && (unsigned int)axis < tv->nDims(),
+        "Reduction on invalid axis, recieved: ",
+        axis,
+        " however tensor view only has ",
+        tv->nDims(),
+        " dims.");
+
+    uint_axes.push_back((unsigned int)axis);
+  }
+
+  Val* out = tv->newForReduction(uint_axes);
+  if (init->getDataType().value() != v1->getDataType().value())
+    init = castOp(v1->getDataType().value(), init);
+  new ReductionOp(reduction_op_type, init, out, v1);
+  return out;
+}
+
+TORCH_CUDA_API Val* sum(Val* v1, const std::vector<int>& axes) {
+  Val* init;
+  switch (v1->getDataType().value()) {
+    case (DataType::Float):
+      init = new Float(0.0);
+      break;
+    case (DataType::Int):
+      init = new Int(0);
+      break;
+    default:
+      TORCH_CHECK(
+          false,
+          "Could not generate a sum op for tensor with type: ",
+          v1->getDataType().value());
+  }
+
+  return reductionOp(BinaryOpType::Add, axes, init, v1);
+}
+
+// COMPOUND OPERATIONS
+
 TORCH_CUDA_API Val* add_alpha(Val* v1, Val* v2, Val* s) {
   TORCH_CHECK(
       s->getValType().value() == ValType::Scalar,
@@ -183,10 +284,12 @@ TORCH_CUDA_API Val* where(Val* c, Val* v1, Val* v2) {
       c->getDataType().value());
 
   Val* out = promoteNew(v1, v2);
-  Statement* expr = new TernaryOp(TernaryOpType::Where, out, c, v1, v2);
+  new TernaryOp(TernaryOpType::Where, out, c, v1, v2);
   return out;
 }
 
+// TERNARY OPERATIONS
+
 TORCH_CUDA_API Val* threshold(Val* in, Val* thresh, Val* value) {
   TORCH_CHECK(
       in->getDataType().value() == thresh->getDataType().value() &&
@@ -199,8 +302,8 @@ TORCH_CUDA_API Val* threshold(Val* in, Val* thresh, Val* value) {
       "Thresh and Value values should be Scalars");
 
   Val* out = newValLike(in);
-  Statement* expr =
-      new TernaryOp(TernaryOpType::Threshold, out, in, thresh, value);
+
+  new TernaryOp(TernaryOpType::Threshold, out, in, thresh, value);
   return out;
 }
 
@@ -216,8 +319,8 @@ TORCH_CUDA_API Val* clamp(Val* in, Val* min_val, Val* max_val) {
      "Min and Max values should be Scalars");
 
   Val* out = newValLike(in);
-  Statement* expr =
-      new TernaryOp(TernaryOpType::Clamp, out, in, min_val, max_val);
+
+  new TernaryOp(TernaryOpType::Clamp, out, in, min_val, max_val);
  return out;
 }
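
One pattern in this file deserves a note: the rewritten ops no longer bind the new expression to a local (Statement* expr = new UnaryOp(...) becomes a bare new UnaryOp(...)). Constructing an IR expression registers it with the currently active fusion as the definition of out, so the pointer is never consulted again at the call site. A toy sketch of that registration idiom follows; the name Fusion::registerExpr and the ownership details are illustrative assumptions, not this codebase's exact API.

#include <vector>

class Expr; // IR expression base, defined below

// Owns every expression created while it is the active fusion.
class Fusion {
 public:
  void registerExpr(Expr* e) { exprs_.push_back(e); }
 private:
  std::vector<Expr*> exprs_;
};

class Expr {
 public:
  explicit Expr(Fusion* fusion) {
    // Side effect: the constructor records the node in the fusion graph,
    // which is why call sites can discard the result of `new`.
    fusion->registerExpr(this);
  }
};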

torch/csrc/jit/codegen/cuda/arith.h

Lines changed: 15 additions & 1 deletion
@@ -32,6 +32,15 @@ TORCH_CUDA_API Val* unaryOp(UnaryOpType type, Val* v1);
 // Mod, CeilDiv, and LT are considered Int only output operations for now.
 TORCH_CUDA_API Val* binaryOp(BinaryOpType type, Val* v1, Val* v2);
 
+// Perform a reduction operation on v1, initial value for reduction is init,
+// reduces across axes, and reduction operation defined by BinaryOp.
+TORCH_CUDA_API Val* reductionOp(
+    BinaryOpType reduction_op_type,
+    const std::vector<int>& axes,
+    Val* init,
+    Val* v1);
+
+// BINARY OPAERATIONS
 TORCH_CUDA_API Val* add(Val* v1, Val* v2);
 TORCH_CUDA_API Val* sub(Val* v1, Val* v2);
 TORCH_CUDA_API Val* mul(Val* v1, Val* v2);
@@ -41,12 +50,17 @@ TORCH_CUDA_API Val* lt(Val* v1, Val* v2);
 TORCH_CUDA_API Val* ceilDiv(Val* v1, Val* v2);
 TORCH_CUDA_API Val* andOp(Val* v1, Val* v2);
 
+// REDUCTION OPERATIONS
+TORCH_CUDA_API Val* sum(Val* v1, const std::vector<int>& reduction_axes);
+
+// COMPOUND OPERATIONS
 TORCH_CUDA_API Val* add_alpha(Val* v1, Val* v2, Val* s);
 TORCH_CUDA_API Val* sub_alpha(Val* v1, Val* v2, Val* s);
 TORCH_CUDA_API Val* lerp(Val* start, Val* end, Val* weight);
 TORCH_CUDA_API Val* addcmul(Val* v1, Val* v2, Val* v3, Val* s);
-
 TORCH_CUDA_API Val* where(Val* c, Val* v1, Val* v2);
+
+// TERNARY OPERATIONS
 TORCH_CUDA_API Val* threshold(Val* in, Val* thresh, Val* value);
 TORCH_CUDA_API Val* clamp(Val* in, Val* min_val, Val* max_val);
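
Taken together with the scheduling operators named in the summary, the intended flow for the new reduction API looks roughly like the sketch below. Fusion, FusionGuard, and the TensorView scheduling calls follow the summary's split/rfactor/computeAt vocabulary, and makeDummyTensor is a test-utility-style helper assumed here for creating an input TensorView; treat the exact signatures as assumptions rather than verbatim API.

#include <torch/csrc/jit/codegen/cuda/arith.h>
// Fusion/FusionGuard/TensorView headers elided; names assumed for this sketch.

// Rough sketch: define a row-sum, then carve up the reduction axis so the
// inner piece can be reduced cooperatively by the threads of a block.
void sketchRowSum(Fusion& fusion) {
  FusionGuard fg(&fusion);              // make `fusion` the active fusion

  TensorView* tv0 = makeDummyTensor(2); // assumed helper: 2-D float input
  fusion.addInput(tv0);

  // sum() builds a zero init and calls reductionOp(BinaryOpType::Add, ...).
  auto* tv1 = static_cast<TensorView*>(sum(tv0, {1}));
  fusion.addOutput(tv1);

  tv1->split(1, 128);                   // [I, R] -> [I, R/128, 128]
  TensorView* rf = tv1->rFactor({1});   // partial sums land in a new tensor
  rf->computeAt(tv1, -1);               // inline the partial reduction
}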
