@@ -6024,19 +6024,12 @@ def fn0(i0, i1):
         def fn1(i0, i1):
             return torch.lerp(i1, i0, 70000)
 
-        def compare(fn, inputs):
-            compiled = torch._dynamo.optimize("inductor")(fn)
-            expected = fn(*inputs)
-            actual = compiled(*inputs)
-            self.assertEqual(expected, actual)
-            self.assertEqual(expected.stride(), actual.stride())
-
-        compare(fn0, [torch.rand(10, 3, 10), torch.rand(3, 10, 10)])
-        compare(fn1, [torch.rand(3, 10, 10), torch.rand(3, 10, 10)])
+        self.common(fn0, [torch.rand(10, 3, 10), torch.rand(3, 10, 10)])
+        self.common(fn1, [torch.rand(3, 10, 10), torch.rand(3, 10, 10)])
 
     def test_unspec_inputs(self):
         if self.device == "cpu":
-            raise unittest.SkipTest("segfault with CPU backend")
+            raise unittest.SkipTest("Testing mixed devices")
 
         def fn(x, y):
             return x + y, x * y, x / y
@@ -6138,9 +6131,7 @@ def fn(x):
             return attn.softmax(dim=-1)
 
         x = torch.rand(128, 32, 63)
-        res_ref = fn(x)
-        res = torch._dynamo.optimize("inductor")(fn)(x)
-        self.assertEqual(res, res_ref)
+        self.common(fn, (x,))
 
     def test_kwargs(self):
         if self.device == "cuda":
@@ -6242,9 +6233,6 @@ def fn(a, b):
         )
 
     def test_index_dynamic_shapes(self):
-        if self.device == "cuda":
-            raise unittest.SkipTest("index dynamic shapes only supports cpu")
-
         # Repro from vision_maskrcnn
         def fn(arg0_1):
             unsqueeze = arg0_1.unsqueeze(0)
@@ -6255,7 +6243,7 @@ def fn(arg0_1):
                 start=0,
                 step=1,
                 dtype=torch.int64,
-                device="cpu",
+                device=arg0_1.device,
                 requires_grad=False,
             )
             convert_element_type_1 = iota.to(torch.float32)
@@ -6267,7 +6255,7 @@ def fn(arg0_1):
                 start=0,
                 step=1,
                 dtype=torch.int64,
-                device="cpu",
+                device=arg0_1.device,
                 requires_grad=False,
             )
             convert_element_type_3 = iota_1.to(torch.float32)
@@ -6507,9 +6495,9 @@ def fn(a):
             return a[out_features.index(in_feature)]
 
         x = [
-            torch.rand([1, 256, 100, 152]),
-            torch.rand([1, 256, 50, 76]),
-            torch.rand([1, 256, 25, 38]),
+            torch.rand([1, 256, 100, 152], device=self.device),
+            torch.rand([1, 256, 50, 76], device=self.device),
+            torch.rand([1, 256, 25, 38], device=self.device),
         ]
         opt_fn = torch._dynamo.optimize("inductor")(fn)
         same(fn(x), opt_fn(x))
@@ -6521,8 +6509,7 @@ def fn(a):
             return y
 
         x = torch.rand(48, 3, 512, 512)
-        opt_fn = torch._dynamo.optimize("inductor")(fn)
-        same(fn(x), opt_fn(x))
+        self.common(fn, (x,))
 
     @unittest.skipIf(not HAS_CPU, "requires C++ compiler")
     def test_data_type_propogation(self):
@@ -6636,6 +6623,10 @@ def func(arg0_1):
                 elif node.target == "output":
                     self.assertEqual(get_data_type(node), torch.bfloat16)
 
+    # Calling div with only torch.SymInt arguments is not yet supported.
+    # To support this behavior, we need to allow const-propping tensors that store symint data.
+    # For now, dynamo will explicitly graph break when it encounters user code with this behavior.
+    @expectedFailureCodegenDynamic
     def test_AllenaiLongformerBase_repro(self):
         def fn(query, scores, window_overlap):
             batch_size, seq_len, num_heads, _ = query.size()
@@ -6661,12 +6652,12 @@ def fn(query, scores, window_overlap):
             return input_tensor
 
         args = [
-            ((4, 1024, 12, 64), (768, 3072, 64, 1), torch.float32, "cpu"),
-            ((48, 3, 512, 513), (787968, 262656, 513, 1), torch.float32, "cpu"),
+            ((4, 1024, 12, 64), (768, 3072, 64, 1)),
+            ((48, 3, 512, 513), (787968, 262656, 513, 1)),
         ]
-        args = [rand_strided(sh, st, dt, dev) for (sh, st, dt, dev) in args]
-        opt_fn = torch._dynamo.optimize("inductor")(fn)
-        same(fn(*args, 256), opt_fn(*args, 256))
+        args = [rand_strided(sh, st) for (sh, st) in args]
+        args.append(256)
+        self.common(fn, args)
 
     def test_cumsum_pattern_matcher_issue(self):
         def fn(input_ids) -> torch.Tensor:
@@ -6675,25 +6666,23 @@ def fn(input_ids) -> torch.Tensor:
             batch_size, seq_length = input_shape
             past_key_values_length = 0
             mask_seq_length = past_key_values_length + seq_length
-            attention_mask = torch.ones(batch_size, mask_seq_length)
+            attention_mask = torch.ones(
+                batch_size, mask_seq_length, device=input_ids.device
+            )
             attention_mask = attention_mask.long()
             return torch.cumsum(attention_mask, dim=1)
 
-        torch._dynamo.reset()
         x = torch.randn(2, 2)
-        opt = torch._dynamo.optimize("inductor")(fn)
-        res = opt(x)
-        ref = fn(x)
-        self.assertEqual(res, ref, atol=0, rtol=0)
+        self.common(fn, (x,), atol=0, rtol=0)
 
+    # It's a view so it doesn't generate a kernel
+    @expectedFailureCodegenDynamic
     def test_slice(self):
         def fn(a, b):
             return torch.ops.aten.slice.Tensor(a, 0, 0, -b)
 
-        torch._dynamo.reset()
         x = torch.rand(48, 3, 512, 512)
-        opt_fn = torch._dynamo.optimize("inductor")(fn)
-        same(fn(x, 2), opt_fn(x, 2))
+        self.common(fn, (x, 2))
 
     def test_inplace_resize_as(self):
         def fn(x, y):