From 94a992513e4360b719c56370a1bccaa7b3886f5b Mon Sep 17 00:00:00 2001 From: EmilySillars Date: Mon, 11 Nov 2024 12:51:08 +0100 Subject: [PATCH] apply zigzag to nsnet kernels --- ...atch_0_matmul_transpose_b_1x400x161_f64.md | 55 ++++++++++++- ...tch_1_matmul_transpose_b_1x1200x400_f64.md | 13 +++- ...atch_7_matmul_transpose_b_1x600x400_f64.md | 78 ++++++++++++++++++- ...atch_8_matmul_transpose_b_1x600x600_f64.md | 70 ++++++++++++++++- ...atch_9_matmul_transpose_b_1x161x600_f64.md | 69 +++++++++++++++- tiling-nsnet/tiling-nsnet.sh | 12 +++ ...ch_0_matmul_transpose_b_1x400x161_f64.yaml | 15 ++++ ...ch_7_matmul_transpose_b_1x600x400_f64.yaml | 15 ++++ ...ch_8_matmul_transpose_b_1x600x600_f64.yaml | 15 ++++ ...ch_9_matmul_transpose_b_1x161x600_f64.yaml | 15 ++++ 10 files changed, 350 insertions(+), 7 deletions(-) create mode 100644 tiling-nsnet/tiling-nsnet.sh create mode 100644 zigzag/inputs/workload/dispatch_0_matmul_transpose_b_1x400x161_f64.yaml create mode 100644 zigzag/inputs/workload/dispatch_7_matmul_transpose_b_1x600x400_f64.yaml create mode 100644 zigzag/inputs/workload/dispatch_8_matmul_transpose_b_1x600x600_f64.yaml create mode 100644 zigzag/inputs/workload/dispatch_9_matmul_transpose_b_1x161x600_f64.yaml diff --git a/tiling-nsnet/dispatch_0_matmul_transpose_b_1x400x161_f64.md b/tiling-nsnet/dispatch_0_matmul_transpose_b_1x400x161_f64.md index 52151ce9..c2a80c65 100644 --- a/tiling-nsnet/dispatch_0_matmul_transpose_b_1x400x161_f64.md +++ b/tiling-nsnet/dispatch_0_matmul_transpose_b_1x400x161_f64.md @@ -40,10 +40,63 @@ matmul_transpose_b (I : tensor<1x161xf64, W : tensor<400x161xf64>, O : tensor<1x W: 0 ``` +## ZigZag Run +Commands run: -## ZigZag Run +``` +python main_zigzag_integration.py --model=zigzag/inputs/workload/dispatch_0_matmul_transpose_b_1x400x161_f64.yaml --mapping=zigzag/inputs/mapping/empty-mapping.yaml --accelerator=zigzag/inputs/hardware/snitch-cluster-only-floats-no-ssrs.yaml +``` + +``` +cat 
outputs/snitch-cluster-only-floats-no-ssrs-dispatch_0_matmul_transpose_b_1x400x161_f64/loop_ordering.txt +``` + +Output: + +``` +Loop ordering for dispatch_0_matmul_transpose_b_1x400x161_f64 +=========================================================================================== +Temporal Loops O W I +=========================================================================================== +for B in [0, 7): l1 l1 l1 +------------------------------------------------------------------------------------------- + for C in [0, 5): rf_f0_thru_f31 l1 l1 +------------------------------------------------------------------------------------------- + for C in [0, 5): rf_f0_thru_f31 l1 l1 +------------------------------------------------------------------------------------------- + for C in [0, 2): rf_f0_thru_f31 l1 rf_f0_thru_f31 +------------------------------------------------------------------------------------------- + for B in [0, 23): rf_f0_thru_f31 l1 rf_f0_thru_f31 +------------------------------------------------------------------------------------------- +=========================================================================================== +Spatial Loops +=========================================================================================== + parfor C in [0, 8): +------------------------------------------------------------------------------------------- + parfor A in [0, 1): +------------------------------------------------------------------------------------------- +``` + +## Interpret Results + +Since everything fits in L1, don't tile at all?! +No loop interchange either?! 
+``` +l1Tiles[0] = 0; +l1Tiles[1] = 0; +l1Tiles[2] = 0; +l1Interchange = {0, 1, 2}; +``` ## JSON Summary + +``` +{ + "bounds":[[1], [1], [1]], + "order":[[0,0], [1,0], [2,0]] +} +``` + diff --git a/tiling-nsnet/dispatch_1_matmul_transpose_b_1x1200x400_f64.md b/tiling-nsnet/dispatch_1_matmul_transpose_b_1x1200x400_f64.md index 936229bb..aec32208 100644 --- a/tiling-nsnet/dispatch_1_matmul_transpose_b_1x1200x400_f64.md +++ b/tiling-nsnet/dispatch_1_matmul_transpose_b_1x1200x400_f64.md @@ -17,7 +17,7 @@ matmul_transpose_b (I : tensor<1x400xf64, W : tensor<1200x400xf64>, O : tensor<1 for a in [0, 1) for b in [0, 400) for c in [0, 1200) - O[a][b]+=I[a][c]*transpose(W)[b][c] + } ``` @@ -100,11 +100,20 @@ for C in [0, 5): l1 l3 l1 ``` Loop-Dims: [A, B, C] Loop-Sizes: [1, 400, 1200] -Loop-Tiles = Loop-Dims / [1, 16, 5] = [1, 400, 1200] / [1, 16, 5] = [1, 25, 240] +Loop-Tiles = Loop-Sizes / [1, 16, 5] = [1, 400, 1200] / [1, 16, 5] = [1, 25, 240] Old Loop Order: A, B, C = 0, 1, 2. New Loop Order: A, C, B = 0, 2, 1. ``` +BUT when we feed these to the upstream MLIR tiling function, the transpose has not been applied yet, so we need to pass tile sizes [1, 240, 25] to match the operand shape `tensor<1200x400xf64>`.
+ +``` +l1Tiles[0] = 0; +l1Tiles[1] = 240; +l1Tiles[2] = 25; +l1Interchange = {0, 2, 1}; +``` + ## JSON Summary ``` diff --git a/tiling-nsnet/dispatch_7_matmul_transpose_b_1x600x400_f64.md b/tiling-nsnet/dispatch_7_matmul_transpose_b_1x600x400_f64.md index 6bb1859e..f696a604 100644 --- a/tiling-nsnet/dispatch_7_matmul_transpose_b_1x600x400_f64.md +++ b/tiling-nsnet/dispatch_7_matmul_transpose_b_1x600x400_f64.md @@ -41,11 +41,87 @@ matmul_transpose_b (I : tensor<1x400xf64, W : tensor<600x400xf64>, O : tensor<1x W: 0 ``` +## ZigZag Run +Commands Run: -## ZigZag Run +``` +sh tiling-nsnet.sh dispatch_7_matmul_transpose_b_1x600x400_f64 +``` + +Relevant Output: + +``` +Loop ordering for dispatch_7_matmul_transpose_b_1x600x400_f64 +============================================================================================= +Temporal Loops O W I +============================================================================================= +for B in [0, 2): l1 l3 l1 +--------------------------------------------------------------------------------------------- + for C in [0, 20): rf_f0_thru_f31 l3 l1 +--------------------------------------------------------------------------------------------- + for C in [0, 6): rf_f0_thru_f31 l1 l1 +--------------------------------------------------------------------------------------------- + for C in [0, 5): rf_f0_thru_f31 l1 rf_f0_thru_f31 +--------------------------------------------------------------------------------------------- + for B in [0, 5): rf_f0_thru_f31 l1 rf_f0_thru_f31 +--------------------------------------------------------------------------------------------- + for B in [0, 5): rf_f0_thru_f31 l1 rf_f0_thru_f31 +--------------------------------------------------------------------------------------------- +============================================================================================= +Spatial Loops +============================================================================================= + parfor B in 
[0, 8): +--------------------------------------------------------------------------------------------- + parfor C in [0, 1): +--------------------------------------------------------------------------------------------- +``` + +## Interpret Results + +Only tile to L1 + +``` +Loop ordering for dispatch_7_matmul_transpose_b_1x600x400_f64 +============================================================================================= +Temporal Loops O W I +============================================================================================= +for B in [0, 2): l1 l3 l1 +--------------------------------------------------------------------------------------------- + for C in [0, 20): rf_f0_thru_f31 l3 l1 +``` + +Recall operand sizes: `I : tensor<1x400xf64>, W : tensor<600x400xf64>, O : tensor<1x600xf64>` + +Recall mac operation inside loops: `O[a][b]+=I[a][c]*transpose(W)[b][c]` + +``` +loop_dims: [A, B, C] +loop_sizes: [1, 400, 600] +NEW Loop Bounds: [1, 2, 20] +Tile sizes = loop_sizes / new_loop_bounds = [1, 400, 600] / [1, 2, 20] = [1, 200, 30] + +original loop order: [A, B, C] = [0, 1, 2] +new loop order: [0, 1, 2] (no change) +``` + +BUT, when we give the tile sizes to the upstream mlir tiling function, we need to give the tile sizes in the order [1, 30, 200] because the weight matrix is not yet transposed (need the tile size of `30` to match up with the `600` inside `tensor<600x400xf64>`).
+ +``` +l1Tiles[0] = 0; +l1Tiles[1] = 30; +l1Tiles[2] = 200; +l1Interchange = {0, 1, 2}; +``` ## JSON Summary +``` +{ + "bounds":[[1], [2], [20]], + "order":[[0,0], [1,0], [2,0]] +} +``` + diff --git a/tiling-nsnet/dispatch_8_matmul_transpose_b_1x600x600_f64.md b/tiling-nsnet/dispatch_8_matmul_transpose_b_1x600x600_f64.md index 79111562..cf4437d7 100644 --- a/tiling-nsnet/dispatch_8_matmul_transpose_b_1x600x600_f64.md +++ b/tiling-nsnet/dispatch_8_matmul_transpose_b_1x600x600_f64.md @@ -41,11 +41,79 @@ matmul_transpose_b (I : tensor<1x600xf64, W : tensor<600x600xf64>, O : tensor<1x W: 0 ``` +## ZigZag Run +Command Run: -## ZigZag Run +``` +sh tiling-nsnet.sh dispatch_8_matmul_transpose_b_1x600x600_f64 +``` + +Relevant Output: + +``` +Loop ordering for dispatch_8_matmul_transpose_b_1x600x600_f64 +============================================================================================= +Temporal Loops O W I +============================================================================================= +for B in [0, 3): l1 l3 l1 +--------------------------------------------------------------------------------------------- + for C in [0, 20): rf_f0_thru_f31 l3 l1 +--------------------------------------------------------------------------------------------- + for C in [0, 6): rf_f0_thru_f31 l1 l1 +--------------------------------------------------------------------------------------------- + for C in [0, 5): rf_f0_thru_f31 l1 rf_f0_thru_f31 +--------------------------------------------------------------------------------------------- + for B in [0, 5): rf_f0_thru_f31 l1 rf_f0_thru_f31 +--------------------------------------------------------------------------------------------- + for B in [0, 5): rf_f0_thru_f31 l1 rf_f0_thru_f31 +--------------------------------------------------------------------------------------------- +============================================================================================= +Spatial Loops
+============================================================================================= + parfor B in [0, 8): +--------------------------------------------------------------------------------------------- + parfor A in [0, 1): +--------------------------------------------------------------------------------------------- +``` + +## Interpret Results + +Only tile to L1 level: + +``` +Loop ordering for dispatch_8_matmul_transpose_b_1x600x600_f64 +============================================================================================= +Temporal Loops O W I +============================================================================================= +for B in [0, 3): l1 l3 l1 +--------------------------------------------------------------------------------------------- + for C in [0, 20): rf_f0_thru_f31 l3 l1 +``` +``` +loop_dims: [A,B,C] +loop_sizes: [1, 600, 600] +new_loop_bounds = [1, 3, 20] +tile_sizes = loop_sizes / new_loop_bounds = [1, 600, 600] / [1, 3, 20] = [1, 200, 30] +No loop interchange.
+``` +We need to swap the `200` and `30` because of the transpose: + +``` +l1Tiles[0] = 0; +l1Tiles[1] = 30; +l1Tiles[2] = 200; +l1Interchange = {0, 1, 2}; +``` ## JSON Summary +``` +{ + "bounds":[[1], [3], [20]], + "order":[[0,0], [1,0], [2,0]] +} +``` + diff --git a/tiling-nsnet/dispatch_9_matmul_transpose_b_1x161x600_f64.md b/tiling-nsnet/dispatch_9_matmul_transpose_b_1x161x600_f64.md index b6ef9e40..69083d4a 100644 --- a/tiling-nsnet/dispatch_9_matmul_transpose_b_1x161x600_f64.md +++ b/tiling-nsnet/dispatch_9_matmul_transpose_b_1x161x600_f64.md @@ -31,7 +31,7 @@ matmul_transpose_b (I : tensor<1x600xf64, W : tensor<161x600xf64>, O : tensor<1x equation: O[a][b]+=I[a][c]*W[b][c] dimension_relations: [] loop_dims: [A,B,C] - loop_sizes: [1, 600, 1200] + loop_sizes: [1, 600, 161] operand_precision: W: 64 I: 64 @@ -42,11 +42,76 @@ matmul_transpose_b (I : tensor<1x600xf64, W : tensor<161x600xf64>, O : tensor<1x W: 0 ``` +## ZigZag Run +Command Run: -## ZigZag Run +``` +sh tiling-nsnet.sh dispatch_9_matmul_transpose_b_1x161x600_f64 +``` + +Relevant Output: + +``` +Loop ordering for dispatch_9_matmul_transpose_b_1x161x600_f64 +============================================================================================= +Temporal Loops O W I +============================================================================================= +for B in [0, 6): l1 l3 l1 +--------------------------------------------------------------------------------------------- + for B in [0, 4): l1 l1 l1 +--------------------------------------------------------------------------------------------- + for C in [0, 7): rf_f0_thru_f31 l1 l1 +--------------------------------------------------------------------------------------------- + for C in [0, 3): rf_f0_thru_f31 l1 rf_f0_thru_f31 +--------------------------------------------------------------------------------------------- + for B in [0, 5): rf_f0_thru_f31 l1 rf_f0_thru_f31 
+--------------------------------------------------------------------------------------------- + for B in [0, 5): rf_f0_thru_f31 l1 rf_f0_thru_f31 +--------------------------------------------------------------------------------------------- +============================================================================================= +Spatial Loops +============================================================================================= + parfor C in [0, 7): +--------------------------------------------------------------------------------------------- + parfor A in [0, 1): +--------------------------------------------------------------------------------------------- +``` + +## Interpret Results + +Tile only to L1: + +``` +Loop ordering for dispatch_9_matmul_transpose_b_1x161x600_f64 +============================================================================================= +Temporal Loops O W I +============================================================================================= +for B in [0, 6): l1 l3 l1 +``` +``` +loop_dims: [A,B,C] +loop_sizes: [1, 600, 161] +new_loop_bounds = [1, 6, 1] +tile_sizes = loop_sizes / new_loop_bounds = [1, 600, 161] / [1, 6, 1] = [1, 100, 161] +``` +Need to swap `100` and `161` to match tensor shape `tensor<161x600xf64>`: + +``` +l1Tiles[0] = 0; +l1Tiles[1] = 161; +l1Tiles[2] = 100; +l1Interchange = {0, 1, 2}; +``` ## JSON Summary +``` +{ + "bounds":[[1], [6], [1]], + "order":[[0,0], [1,0], [2,0]] +} +``` + diff --git a/tiling-nsnet/tiling-nsnet.sh b/tiling-nsnet/tiling-nsnet.sh new file mode 100644 index 00000000..0cf3062c --- /dev/null +++ b/tiling-nsnet/tiling-nsnet.sh @@ -0,0 +1,12 @@ +echo $1 +cd .. 
+ +rm -r outputs/snitch-cluster-only-floats-no-ssrs-$1 + +python main_zigzag_integration.py \ +--model=zigzag/inputs/workload/$1.yaml \ +--mapping=zigzag/inputs/mapping/empty-mapping.yaml \ +--accelerator=zigzag/inputs/hardware/snitch-cluster-only-floats-no-ssrs.yaml && \ +cat outputs/snitch-cluster-only-floats-no-ssrs-$1/loop_ordering.txt + +cd tiling-nsnet \ No newline at end of file diff --git a/zigzag/inputs/workload/dispatch_0_matmul_transpose_b_1x400x161_f64.yaml b/zigzag/inputs/workload/dispatch_0_matmul_transpose_b_1x400x161_f64.yaml new file mode 100644 index 00000000..cd97b05f --- /dev/null +++ b/zigzag/inputs/workload/dispatch_0_matmul_transpose_b_1x400x161_f64.yaml @@ -0,0 +1,15 @@ +- id: 0 + name: dispatch_0_matmul_transpose_b_1x400x161_f64 + operator_type: Matmul_transpose_b + equation: O[a][b]+=I[a][c]*W[b][c] + dimension_relations: [] + loop_dims: [A,B,C] + loop_sizes: [1, 161, 400] + operand_precision: + W: 64 + I: 64 + O: 64 + O_final: 64 + operand_source: + I: 0 + W: 0 \ No newline at end of file diff --git a/zigzag/inputs/workload/dispatch_7_matmul_transpose_b_1x600x400_f64.yaml b/zigzag/inputs/workload/dispatch_7_matmul_transpose_b_1x600x400_f64.yaml new file mode 100644 index 00000000..74347803 --- /dev/null +++ b/zigzag/inputs/workload/dispatch_7_matmul_transpose_b_1x600x400_f64.yaml @@ -0,0 +1,15 @@ +- id: 0 + name: dispatch_7_matmul_transpose_b_1x600x400_f64 + operator_type: Matmul_transpose_b + equation: O[a][b]+=I[a][c]*W[b][c] + dimension_relations: [] + loop_dims: [A,B,C] + loop_sizes: [1, 400, 600] + operand_precision: + W: 64 + I: 64 + O: 64 + O_final: 64 + operand_source: + I: 0 + W: 0 \ No newline at end of file diff --git a/zigzag/inputs/workload/dispatch_8_matmul_transpose_b_1x600x600_f64.yaml b/zigzag/inputs/workload/dispatch_8_matmul_transpose_b_1x600x600_f64.yaml new file mode 100644 index 00000000..aca84729 --- /dev/null +++ b/zigzag/inputs/workload/dispatch_8_matmul_transpose_b_1x600x600_f64.yaml @@ -0,0 +1,15 @@ +- id: 0 + 
name: dispatch_8_matmul_transpose_b_1x600x600_f64 + operator_type: Matmul_transpose_b + equation: O[a][b]+=I[a][c]*W[b][c] + dimension_relations: [] + loop_dims: [A,B,C] + loop_sizes: [1, 600, 600] + operand_precision: + W: 64 + I: 64 + O: 64 + O_final: 64 + operand_source: + I: 0 + W: 0 \ No newline at end of file diff --git a/zigzag/inputs/workload/dispatch_9_matmul_transpose_b_1x161x600_f64.yaml b/zigzag/inputs/workload/dispatch_9_matmul_transpose_b_1x161x600_f64.yaml new file mode 100644 index 00000000..ad2e1b8b --- /dev/null +++ b/zigzag/inputs/workload/dispatch_9_matmul_transpose_b_1x161x600_f64.yaml @@ -0,0 +1,15 @@ +- id: 0 + name: dispatch_9_matmul_transpose_b_1x161x600_f64 + operator_type: Matmul_transpose_b + equation: O[a][b]+=I[a][c]*W[b][c] + dimension_relations: [] + loop_dims: [A,B,C] + loop_sizes: [1, 600, 161] + operand_precision: + W: 64 + I: 64 + O: 64 + O_final: 64 + operand_source: + I: 0 + W: 0 \ No newline at end of file