From 94a992513e4360b719c56370a1bccaa7b3886f5b Mon Sep 17 00:00:00 2001
From: EmilySillars <emsillars@gmail.com>
Date: Mon, 11 Nov 2024 12:51:08 +0100
Subject: [PATCH] apply zigzag to nsnet kernels

---
 ...atch_0_matmul_transpose_b_1x400x161_f64.md | 55 ++++++++++++-
 ...tch_1_matmul_transpose_b_1x1200x400_f64.md | 13 +++-
 ...atch_7_matmul_transpose_b_1x600x400_f64.md | 78 ++++++++++++++++++-
 ...atch_8_matmul_transpose_b_1x600x600_f64.md | 70 ++++++++++++++++-
 ...atch_9_matmul_transpose_b_1x161x600_f64.md | 69 +++++++++++++++-
 tiling-nsnet/tiling-nsnet.sh                  | 12 +++
 ...ch_0_matmul_transpose_b_1x400x161_f64.yaml | 15 ++++
 ...ch_7_matmul_transpose_b_1x600x400_f64.yaml | 15 ++++
 ...ch_8_matmul_transpose_b_1x600x600_f64.yaml | 15 ++++
 ...ch_9_matmul_transpose_b_1x161x600_f64.yaml | 15 ++++
 10 files changed, 350 insertions(+), 7 deletions(-)
 create mode 100644 tiling-nsnet/tiling-nsnet.sh
 create mode 100644 zigzag/inputs/workload/dispatch_0_matmul_transpose_b_1x400x161_f64.yaml
 create mode 100644 zigzag/inputs/workload/dispatch_7_matmul_transpose_b_1x600x400_f64.yaml
 create mode 100644 zigzag/inputs/workload/dispatch_8_matmul_transpose_b_1x600x600_f64.yaml
 create mode 100644 zigzag/inputs/workload/dispatch_9_matmul_transpose_b_1x161x600_f64.yaml

diff --git a/tiling-nsnet/dispatch_0_matmul_transpose_b_1x400x161_f64.md b/tiling-nsnet/dispatch_0_matmul_transpose_b_1x400x161_f64.md
index 52151ce9..c2a80c65 100644
--- a/tiling-nsnet/dispatch_0_matmul_transpose_b_1x400x161_f64.md
+++ b/tiling-nsnet/dispatch_0_matmul_transpose_b_1x400x161_f64.md
@@ -40,10 +40,63 @@ matmul_transpose_b (I : tensor<1x161xf64, W : tensor<400x161xf64>, O : tensor<1x
     W: 0
 ```
 
+## ZigZag Run
 
+Commands run:
 
-## ZigZag Run
+```
+python main_zigzag_integration.py --model=zigzag/inputs/workload/dispatch_0_matmul_transpose_b_1x400x161_f64.yaml --mapping=zigzag/inputs/mapping/empty-mapping.yaml --accelerator=zigzag/inputs/hardware/snitch-cluster-only-floats-no-ssrs.yaml
+```
+
+```
+cat outputs/snitch-cluster-only-floats-no-ssrs-dispatch_0_matmul_transpose_b_1x400x161_f64/loop_ordering.txt
+```
+
+Output:
+
+```
+Loop ordering for dispatch_0_matmul_transpose_b_1x400x161_f64
+===========================================================================================
+Temporal Loops                    O                  W                  I                  
+===========================================================================================
+for B in [0, 7):                  l1                 l1                 l1                 
+-------------------------------------------------------------------------------------------
+  for C in [0, 5):                rf_f0_thru_f31     l1                 l1                 
+-------------------------------------------------------------------------------------------
+    for C in [0, 5):              rf_f0_thru_f31     l1                 l1                 
+-------------------------------------------------------------------------------------------
+      for C in [0, 2):            rf_f0_thru_f31     l1                 rf_f0_thru_f31     
+-------------------------------------------------------------------------------------------
+        for B in [0, 23):         rf_f0_thru_f31     l1                 rf_f0_thru_f31     
+-------------------------------------------------------------------------------------------
+===========================================================================================
+Spatial Loops                                                                              
+===========================================================================================
+          parfor C in [0, 8):                                                              
+-------------------------------------------------------------------------------------------
+          parfor A in [0, 1):                                                              
+-------------------------------------------------------------------------------------------
+```
+
+## Interpret Results
+
+Since every operand (O, W, and I) already sits in `l1` at the outermost loop, no L1 tiling is needed.
 
+With no L1 tiling there is nothing to reorder, so no loop interchange is needed either.
 
+```
+l1Tiles[0] = 0;
+l1Tiles[1] = 0;
+l1Tiles[2] = 0;
+l1Interchange = {0, 1, 2}; 
+```
 
 ## JSON Summary
+
+```
+{
+    "bounds":[[1], [1], [1]],
+    "order":[[0,0], [1,0], [2,0]]
+}
+```
+
diff --git a/tiling-nsnet/dispatch_1_matmul_transpose_b_1x1200x400_f64.md b/tiling-nsnet/dispatch_1_matmul_transpose_b_1x1200x400_f64.md
index 936229bb..aec32208 100644
--- a/tiling-nsnet/dispatch_1_matmul_transpose_b_1x1200x400_f64.md
+++ b/tiling-nsnet/dispatch_1_matmul_transpose_b_1x1200x400_f64.md
@@ -17,7 +17,7 @@ matmul_transpose_b (I : tensor<1x400xf64, W : tensor<1200x400xf64>, O : tensor<1
     for a in [0, 1)
     for b in [0, 400)
     for c in [0, 1200)
-        O[a][b]+=I[a][c]*transpose(W)[b][c]
+        O[a][b]+=I[a][c]*transpose(W)[b][c]
 }
 ```
 
@@ -100,11 +100,20 @@ for C in [0, 5):                    l1                 l3                 l1
 ```
 Loop-Dims: [A, B, C]
 Loop-Sizes: [1, 400, 1200]
-Loop-Tiles = Loop-Dims / [1, 16, 5] = [1, 400, 1200]  /  [1,  16, 5] = [1, 25, 240]
+Loop-Tiles = Loop-Sizes / [1, 16, 5] = [1, 400, 1200]  /  [1,  16, 5] = [1, 25, 240]
 Old Loop Order: A, B, C = 0, 1, 2.
 New Loop Order: A, C, B = 0, 2, 1.
 ```
 
+BUT when we feed these tile sizes to the upstream MLIR tiling function, the transpose part of the operation has not occurred yet, so we need to pass the tile sizes as [1, 240, 25] to match the weight tensor shape `tensor<1200x400xf64>`.
+
+```
+l1Tiles[0] = 0;
+l1Tiles[1] = 240;
+l1Tiles[2] = 25;
+l1Interchange = {0, 2, 1}; 
+```
+
 ## JSON Summary
 
 ```
diff --git a/tiling-nsnet/dispatch_7_matmul_transpose_b_1x600x400_f64.md b/tiling-nsnet/dispatch_7_matmul_transpose_b_1x600x400_f64.md
index 6bb1859e..f696a604 100644
--- a/tiling-nsnet/dispatch_7_matmul_transpose_b_1x600x400_f64.md
+++ b/tiling-nsnet/dispatch_7_matmul_transpose_b_1x600x400_f64.md
@@ -41,11 +41,87 @@ matmul_transpose_b (I : tensor<1x400xf64, W : tensor<600x400xf64>, O : tensor<1x
     W: 0
 ```
 
+## ZigZag Run
 
+Command Run:
 
-## ZigZag Run
+```
+sh tiling-nsnet.sh dispatch_7_matmul_transpose_b_1x600x400_f64
+```
+
+Relevant Output:
+
+```
+Loop ordering for dispatch_7_matmul_transpose_b_1x600x400_f64
+=============================================================================================
+Temporal Loops                      O                  W                  I                  
+=============================================================================================
+for B in [0, 2):                    l1                 l3                 l1                 
+---------------------------------------------------------------------------------------------
+  for C in [0, 20):                 rf_f0_thru_f31     l3                 l1                 
+---------------------------------------------------------------------------------------------
+    for C in [0, 6):                rf_f0_thru_f31     l1                 l1                 
+---------------------------------------------------------------------------------------------
+      for C in [0, 5):              rf_f0_thru_f31     l1                 rf_f0_thru_f31     
+---------------------------------------------------------------------------------------------
+        for B in [0, 5):            rf_f0_thru_f31     l1                 rf_f0_thru_f31     
+---------------------------------------------------------------------------------------------
+          for B in [0, 5):          rf_f0_thru_f31     l1                 rf_f0_thru_f31     
+---------------------------------------------------------------------------------------------
+=============================================================================================
+Spatial Loops                                                                                
+=============================================================================================
+            parfor B in [0, 8):                                                              
+---------------------------------------------------------------------------------------------
+            parfor C in [0, 1):                                                              
+---------------------------------------------------------------------------------------------
+```
+
+## Interpret Results
+
+Only tile to L1
+
+```
+Loop ordering for dispatch_7_matmul_transpose_b_1x600x400_f64
+=============================================================================================
+Temporal Loops                      O                  W                  I                  
+=============================================================================================
+for B in [0, 2):                    l1                 l3                 l1                 
+---------------------------------------------------------------------------------------------
+  for C in [0, 20):                 rf_f0_thru_f31     l3                 l1         
+```
+
+Recall operand sizes: `I : tensor<1x400xf64>, W : tensor<600x400xf64>, O : tensor<1x600xf64>`
+
+Recall mac operation inside loops: `O[a][b]+=I[a][c]*transpose(W)[b][c]`
+
+```
+loop_dims: [A, B, C]
+loop_sizes: [1, 400, 600]
+NEW Loop Bounds: [1, 2, 20]
+Tile sizes = loop_sizes / new_loop_bounds = [1, 400, 600] / [1, 2, 20] = [1, 200, 30]
+
+original loop order: [A, B, C] = [0, 1, 2]
+new loop order: [0, 1, 2] (no change)
+```
+
+BUT, when we give the tile sizes to the upstream MLIR tiling function, we need to give them in the order [1, 30, 200] because the weight matrix is not yet transposed (the tile size of `30` has to line up with the `600` inside `tensor<600x400xf64>`).
+
+```
+l1Tiles[0] = 0;
+l1Tiles[1] = 30;
+l1Tiles[2] = 200;
+l1Interchange = {0, 1, 2}; 
+```
 
 
 
 ## JSON Summary
 
+```
+{
+    "bounds":[[1], [2], [20]],
+    "order":[[0,0], [1,0], [2,0]]
+}
+```
+
diff --git a/tiling-nsnet/dispatch_8_matmul_transpose_b_1x600x600_f64.md b/tiling-nsnet/dispatch_8_matmul_transpose_b_1x600x600_f64.md
index 79111562..cf4437d7 100644
--- a/tiling-nsnet/dispatch_8_matmul_transpose_b_1x600x600_f64.md
+++ b/tiling-nsnet/dispatch_8_matmul_transpose_b_1x600x600_f64.md
@@ -41,11 +41,79 @@ matmul_transpose_b (I : tensor<1x600xf64, W : tensor<600x600xf64>, O : tensor<1x
     W: 0
 ```
 
+## ZigZag Run
 
+Command Run:
 
-## ZigZag Run
+```
+sh tiling-nsnet.sh dispatch_8_matmul_transpose_b_1x600x600_f64
+```
+
+Relevant Output:
+
+```
+Loop ordering for dispatch_8_matmul_transpose_b_1x600x600_f64
+=============================================================================================
+Temporal Loops                      O                  W                  I                  
+=============================================================================================
+for B in [0, 3):                    l1                 l3                 l1                 
+---------------------------------------------------------------------------------------------
+  for C in [0, 20):                 rf_f0_thru_f31     l3                 l1                 
+---------------------------------------------------------------------------------------------
+    for C in [0, 6):                rf_f0_thru_f31     l1                 l1                 
+---------------------------------------------------------------------------------------------
+      for C in [0, 5):              rf_f0_thru_f31     l1                 rf_f0_thru_f31     
+---------------------------------------------------------------------------------------------
+        for B in [0, 5):            rf_f0_thru_f31     l1                 rf_f0_thru_f31     
+---------------------------------------------------------------------------------------------
+          for B in [0, 5):          rf_f0_thru_f31     l1                 rf_f0_thru_f31     
+---------------------------------------------------------------------------------------------
+=============================================================================================
+Spatial Loops                                                                                
+=============================================================================================
+            parfor B in [0, 8):                                                              
+---------------------------------------------------------------------------------------------
+            parfor A in [0, 1):                                                              
+---------------------------------------------------------------------------------------------
+```
+
+## Interpret Results
+
+Only tile to L1 level:
+
+```
+Loop ordering for dispatch_8_matmul_transpose_b_1x600x600_f64
+=============================================================================================
+Temporal Loops                      O                  W                  I                  
+=============================================================================================
+for B in [0, 3):                    l1                 l3                 l1                 
+---------------------------------------------------------------------------------------------
+  for C in [0, 20):                 rf_f0_thru_f31     l3                 l1  
+```
 
+```
+loop_dims: [A,B,C]
+loop_sizes: [1, 600, 600]
+new_loop_bounds = [1, 3, 20]
+tile_sizes = loop_sizes / new_loop_bounds = [1, 600, 600] / [1, 3, 20] = [1, 200, 30]
+No loop interchange.
+```
 
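+We need to swap the `200` and `30` because the weight tensor `tensor<600x600xf64>` has not been transposed yet when the tile sizes are handed to the upstream MLIR tiling function: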
+
+```
+l1Tiles[0] = 0;
+l1Tiles[1] = 30;
+l1Tiles[2] = 200;
+l1Interchange = {0, 1, 2}; 
+```
 
 ## JSON Summary
 
+```
+{
+    "bounds":[[1], [3], [20]],
+    "order":[[0,0], [1,0], [2,0]]
+}
+```
+
diff --git a/tiling-nsnet/dispatch_9_matmul_transpose_b_1x161x600_f64.md b/tiling-nsnet/dispatch_9_matmul_transpose_b_1x161x600_f64.md
index b6ef9e40..69083d4a 100644
--- a/tiling-nsnet/dispatch_9_matmul_transpose_b_1x161x600_f64.md
+++ b/tiling-nsnet/dispatch_9_matmul_transpose_b_1x161x600_f64.md
@@ -31,7 +31,7 @@ matmul_transpose_b (I : tensor<1x600xf64, W : tensor<161x600xf64>, O : tensor<1x
   equation: O[a][b]+=I[a][c]*W[b][c]
   dimension_relations: []
   loop_dims: [A,B,C]
-  loop_sizes: [1, 600, 1200] 
+  loop_sizes: [1, 600, 161] 
   operand_precision:
     W: 64
     I: 64
@@ -42,11 +42,76 @@ matmul_transpose_b (I : tensor<1x600xf64, W : tensor<161x600xf64>, O : tensor<1x
     W: 0
 ```
 
+## ZigZag Run
 
+Command Run:
 
-## ZigZag Run
+```
+sh tiling-nsnet.sh dispatch_9_matmul_transpose_b_1x161x600_f64
+```
+
+Relevant Output:
+
+```
+Loop ordering for dispatch_9_matmul_transpose_b_1x161x600_f64
+=============================================================================================
+Temporal Loops                      O                  W                  I                  
+=============================================================================================
+for B in [0, 6):                    l1                 l3                 l1                 
+---------------------------------------------------------------------------------------------
+  for B in [0, 4):                  l1                 l1                 l1                 
+---------------------------------------------------------------------------------------------
+    for C in [0, 7):                rf_f0_thru_f31     l1                 l1                 
+---------------------------------------------------------------------------------------------
+      for C in [0, 3):              rf_f0_thru_f31     l1                 rf_f0_thru_f31     
+---------------------------------------------------------------------------------------------
+        for B in [0, 5):            rf_f0_thru_f31     l1                 rf_f0_thru_f31     
+---------------------------------------------------------------------------------------------
+          for B in [0, 5):          rf_f0_thru_f31     l1                 rf_f0_thru_f31     
+---------------------------------------------------------------------------------------------
+=============================================================================================
+Spatial Loops                                                                                
+=============================================================================================
+            parfor C in [0, 7):                                                              
+---------------------------------------------------------------------------------------------
+            parfor A in [0, 1):                                                              
+---------------------------------------------------------------------------------------------
+```
+
+## Interpret Results
+
+Tile only to L1:
+
+```
+Loop ordering for dispatch_9_matmul_transpose_b_1x161x600_f64
+=============================================================================================
+Temporal Loops                      O                  W                  I                  
+=============================================================================================
+for B in [0, 6):                    l1                 l3                 l1     
+```
 
+```
+loop_dims: [A,B,C]
+loop_sizes: [1, 600, 161] 
+new_loop_bounds = [1, 6, 1]
+tile_sizes = loop_sizes / new_loop_bounds = [1, 600, 161] / [1, 6, 1] = [1, 100, 161]
+```
 
+We need to swap `100` and `161` to match the tensor shape `tensor<161x600xf64>`:
+
+```
+l1Tiles[0] = 0;
+l1Tiles[1] = 161;
+l1Tiles[2] = 100;
+l1Interchange = {0, 1, 2}; 
+```
 
 ## JSON Summary
 
+```
+{
+    "bounds":[[1], [6], [1]],
+    "order":[[0,0], [1,0], [2,0]]
+}
+```
+
diff --git a/tiling-nsnet/tiling-nsnet.sh b/tiling-nsnet/tiling-nsnet.sh
new file mode 100644
index 00000000..0cf3062c
--- /dev/null
+++ b/tiling-nsnet/tiling-nsnet.sh
@@ -0,0 +1,12 @@
+#!/bin/sh
+# Usage: sh tiling-nsnet.sh WORKLOAD_NAME, run from inside tiling-nsnet/. Runs ZigZag and prints its loop ordering.
+[ -n "$1" ] || { echo "usage: sh tiling-nsnet.sh WORKLOAD_NAME" >&2; exit 1; }
+echo "$1"
+cd .. || exit 1  # the relative paths below are resolved from the parent directory
+rm -rf "outputs/snitch-cluster-only-floats-no-ssrs-$1"  # clear stale results; -f keeps a first run quiet
+python main_zigzag_integration.py \
+--model="zigzag/inputs/workload/$1.yaml" \
+--mapping=zigzag/inputs/mapping/empty-mapping.yaml \
+--accelerator=zigzag/inputs/hardware/snitch-cluster-only-floats-no-ssrs.yaml && \
+cat "outputs/snitch-cluster-only-floats-no-ssrs-$1/loop_ordering.txt"
+cd tiling-nsnet || exit 1
\ No newline at end of file
diff --git a/zigzag/inputs/workload/dispatch_0_matmul_transpose_b_1x400x161_f64.yaml b/zigzag/inputs/workload/dispatch_0_matmul_transpose_b_1x400x161_f64.yaml
new file mode 100644
index 00000000..cd97b05f
--- /dev/null
+++ b/zigzag/inputs/workload/dispatch_0_matmul_transpose_b_1x400x161_f64.yaml
@@ -0,0 +1,15 @@
+- id: 0  # only workload entry in this file
+  name: dispatch_0_matmul_transpose_b_1x400x161_f64  # same as this file's base name
+  operator_type: Matmul_transpose_b
+  equation: O[a][b]+=I[a][c]*W[b][c]  # c is the reduction index: it indexes I and W but not O
+  dimension_relations: []
+  loop_dims: [A,B,C]
+  loop_sizes: [1, 161, 400]  # A=1, B=161, C=400. NOTE(review): W[b][c] with W : tensor<400x161xf64> suggests B=400, C=161 - confirm the B/C swap is intended
+  operand_precision:  # presumably bits per element (64 is consistent with f64) - TODO confirm
+    W: 64
+    I: 64
+    O: 64
+    O_final: 64
+  operand_source:  # NOTE(review): both inputs reference id 0, i.e. this entry itself - presumably marks them as external inputs; confirm
+    I: 0
+    W: 0
\ No newline at end of file
diff --git a/zigzag/inputs/workload/dispatch_7_matmul_transpose_b_1x600x400_f64.yaml b/zigzag/inputs/workload/dispatch_7_matmul_transpose_b_1x600x400_f64.yaml
new file mode 100644
index 00000000..74347803
--- /dev/null
+++ b/zigzag/inputs/workload/dispatch_7_matmul_transpose_b_1x600x400_f64.yaml
@@ -0,0 +1,15 @@
+- id: 0  # only workload entry in this file
+  name: dispatch_7_matmul_transpose_b_1x600x400_f64  # same as this file's base name
+  operator_type: Matmul_transpose_b
+  equation: O[a][b]+=I[a][c]*W[b][c]  # c is the reduction index: it indexes I and W but not O
+  dimension_relations: []
+  loop_dims: [A,B,C]
+  loop_sizes: [1, 400, 600]  # A=1, B=400, C=600. NOTE(review): W[b][c] with W : tensor<600x400xf64> suggests B=600, C=400 - confirm the B/C swap is intended
+  operand_precision:  # presumably bits per element (64 is consistent with f64) - TODO confirm
+    W: 64
+    I: 64
+    O: 64
+    O_final: 64
+  operand_source:  # NOTE(review): both inputs reference id 0, i.e. this entry itself - presumably marks them as external inputs; confirm
+    I: 0
+    W: 0
\ No newline at end of file
diff --git a/zigzag/inputs/workload/dispatch_8_matmul_transpose_b_1x600x600_f64.yaml b/zigzag/inputs/workload/dispatch_8_matmul_transpose_b_1x600x600_f64.yaml
new file mode 100644
index 00000000..aca84729
--- /dev/null
+++ b/zigzag/inputs/workload/dispatch_8_matmul_transpose_b_1x600x600_f64.yaml
@@ -0,0 +1,15 @@
+- id: 0  # only workload entry in this file
+  name: dispatch_8_matmul_transpose_b_1x600x600_f64  # same as this file's base name
+  operator_type: Matmul_transpose_b
+  equation: O[a][b]+=I[a][c]*W[b][c]  # c is the reduction index: it indexes I and W but not O
+  dimension_relations: []
+  loop_dims: [A,B,C]
+  loop_sizes: [1, 600, 600]  # A=1, B=600, C=600; B and C are equal here, so their ordering is not observable
+  operand_precision:  # presumably bits per element (64 is consistent with f64) - TODO confirm
+    W: 64
+    I: 64
+    O: 64
+    O_final: 64
+  operand_source:  # NOTE(review): both inputs reference id 0, i.e. this entry itself - presumably marks them as external inputs; confirm
+    I: 0
+    W: 0
\ No newline at end of file
diff --git a/zigzag/inputs/workload/dispatch_9_matmul_transpose_b_1x161x600_f64.yaml b/zigzag/inputs/workload/dispatch_9_matmul_transpose_b_1x161x600_f64.yaml
new file mode 100644
index 00000000..ad2e1b8b
--- /dev/null
+++ b/zigzag/inputs/workload/dispatch_9_matmul_transpose_b_1x161x600_f64.yaml
@@ -0,0 +1,15 @@
+- id: 0  # only workload entry in this file
+  name: dispatch_9_matmul_transpose_b_1x161x600_f64  # same as this file's base name
+  operator_type: Matmul_transpose_b
+  equation: O[a][b]+=I[a][c]*W[b][c]  # c is the reduction index: it indexes I and W but not O
+  dimension_relations: []
+  loop_dims: [A,B,C]
+  loop_sizes: [1, 600, 161]  # A=1, B=600, C=161. NOTE(review): W[b][c] with W : tensor<161x600xf64> suggests B=161, C=600 - confirm the B/C swap is intended
+  operand_precision:  # presumably bits per element (64 is consistent with f64) - TODO confirm
+    W: 64
+    I: 64
+    O: 64
+    O_final: 64
+  operand_source:  # NOTE(review): both inputs reference id 0, i.e. this entry itself - presumably marks them as external inputs; confirm
+    I: 0
+    W: 0
\ No newline at end of file