Commit
Fix flops calculation and tensor b stride calculation in the example 36 (NVIDIA#1278)

* Fix flops calculation and tensor b stride calculation in the example 36

* Fix datatype

* Update gather_scatter_fusion.cu
getianao authored Jan 8, 2024
1 parent 74d1f3e commit acba5be
Showing 1 changed file with 3 additions and 3 deletions.
6 changes: 3 additions & 3 deletions examples/36_gather_scatter_fusion/gather_scatter_fusion.cu
@@ -173,8 +173,8 @@ struct Options {
   /// Compute performance in GFLOP/s
   double gflops(double runtime_s) const {

-    // Number of real-valued multiply-adds
-    int64_t fmas = problem_size.product();
+    // Number of real-valued multiply-adds
+    int64_t fmas = problem_size.m() * int64_t(index_size) * problem_size.k();

     // Two flops per multiply-add
     return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
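For reference, a minimal standalone sketch of the corrected throughput arithmetic (gflops_sketch is a hypothetical helper, not code from the example): the gathered dimension contributes index_size entries in place of problem_size.n(), so the multiply-add volume is m * index_size * k.

```cpp
#include <cstdint>

// Hedged sketch of the fixed GFLOP/s formula: the effective GEMM volume is
// m x index_size x k, and each multiply-add counts as two flops.
double gflops_sketch(int64_t m, int64_t index_size, int64_t k, double runtime_s) {
  int64_t fmas = m * index_size * k;              // real-valued multiply-adds
  return 2.0 * double(fmas) / 1.0e9 / runtime_s;  // convert to GFLOP/s
}
```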
@@ -349,7 +349,7 @@ int run(Options &options) {
     tensor_c.device_data(),            // <- reference to matrix C on device
     tensor_d_scattered.device_data(),  // <- reference to matrix D on device
     tensor_a.layout().capacity(problem_size.mk()),
-    tensor_b.layout().capacity(cutlass::make_Coord(options.index_size, problem_size.n())),
+    tensor_b.layout().capacity(cutlass::make_Coord(options.index_size, problem_size.k())),
     tensor_c.layout().capacity(problem_size.mn()),
     tensor_d_scattered.layout().capacity(problem_size.mn()),
     tensor_a.layout().stride(),
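The extent used for tensor B's capacity now matches the gathered operand, index_size by problem_size.k(), consistent with the FLOP-count fix above. A hedged sketch of that capacity computation in isolation (the RowMajor layout is an illustrative assumption, not taken from the example):

```cpp
#include <cstdint>
#include "cutlass/coord.h"
#include "cutlass/layout/matrix.h"

// Hedged sketch: element capacity of the gathered B operand computed from its
// (index_size, k) extent. RowMajor is an assumption for illustration; the
// example uses whatever LayoutInputB it actually defines.
int64_t gathered_b_capacity(int index_size, int k) {
  auto extent = cutlass::make_Coord(index_size, k);
  auto layout = cutlass::layout::RowMajor::packed(extent);
  return layout.capacity(extent);
}
```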