update README (add e2k info)

Add explanations and notes. Sort tests from most performant.
strongtz · May 10, 2024 · 1d26355 · 1d26355
1 parent b73e319
commit 1d26355
Show file tree

Hide file tree

Showing 4 changed files with 111 additions and 46 deletions.
diff --git a/README.md b/README.md
@@ -6,11 +6,13 @@ It can automatically sense the local SIMD|DSA ISAs while compiling.
 
 ## Support OS and ISA
 
-|OS|x86-64|arm64|riscv64|loongarch64|
-| ------------ | ------------ | ------------ | ------------ | ------------ |
-|Linux|yes|yes|yes|yes|
-|MacOS|no|no|no|no|
-|Windows|no|no|no|no|
+| Arch          |Linux| MacOS| Windows|
+|:--------------|:---:|:----:|:------:|
+| arm64         | yes |  no  |   no   |
+| e2k           | yes |  no  |   no   |
+| loongarch64   | yes |  no  |   no   |
+| riscv64       | yes |  no  |   no   |
+| x86-64        | yes |  no  |   no   |
 
 ## Support x86-64 SIMD|DSA ISA
 
@@ -55,6 +57,46 @@ NOTE: ime is a SpacemiT custom vendor extension.
 |SIMD|LSX|Vector|fp32/fp64|From Loongson 3A5000|
 |Scalar|FP|Scalar|fp32/fp64|From Loongson 3A5000|
 
+## Support e2k ISA
+
+| Arch |  ISA  |Feature| Vector Width | Data Type |Description
+|:-----|:------|:-----:|:------------:|----------:|:----------
+| SIMD | v6    | Vector|          128 | fp32/fp64 | FMA
+| SIMD | v5    | Vector|          128 | fp32/fp64 | Combined operations
+|Scalar| v1-v4 | Scalar|              |      fp64 | Combined operations
+| SIMD | v1-v4 | Vector|           64 |      fp32 | Combined operations
+
+### Combined operations
+
+E2K has support for instructions that perform two independent operations.
+It is like FMA, but with an additional rounding step, as these operations are independent.
+
+#### Example `fmul_addd`
+
+```
+fmul_addd src1, src2, src3, dst
+```
+
+##### Description
+
+Multiply double-precision (64-bit) floating-point values from `src1` and `src2`,
+and add the intermediate result to the value from `src3`. Store the result in `dst`.
+
+##### Operation
+
+```
+dst[63:0] := src3[63:0] + src1[63:0] * src2[63:0]
+```
+
+##### Latency and Throughput
+
+| Architecture  | Latency | Throughput (CPI) | ALC
+|:--------------|:-------:|:----------------:|:---:
+| elbrus-v4     |    8    |       0.16       | `012345`
+| elbrus-v1     |    8    |       0.25       | `01-34-`
+
+* ALC (Arithmetic Logic Complex/Channel) is an execution port for RISC-like instructions
+
 ## How to build
 
 build x64 version:
@@ -73,6 +115,10 @@ build loongarch64 version:
 
 `./build_loongarch64.sh`
 
+build e2k version:
+
+`./build_e2k.sh`
+
 clean:
 
 `./clean.sh`
@@ -95,6 +141,8 @@ clean:
 
 [loongarch64 cpufp benchmark results](benchmark_result/loongarch64.md)
 
+[e2k cpufp benchmark results](benchmark_result/e2k.md)
+
 ## Todo list
 
 Add armv9(SVE, SVE2 & SME) Supports.

diff --git a/benchmark_result/e2k.md b/benchmark_result/e2k.md
@@ -12,10 +12,10 @@ Number Threads: 1
 Thread Pool Binding: 0
 --------------------------------------------------------------
 | Instruction Set | Core Computation      | Peak Performance |
-| v4              | ADD(MUL(f32,f32),f32) | 28.727 GFLOPS    |
-| v4              | ADD(MUL(f64,f64),f64) | 14.353 GFLOPS    |
 | v5              | ADD(MUL(f32,f32),f32) | 57.413 GFLOPS    |
 | v5              | ADD(MUL(f64,f64),f64) | 28.707 GFLOPS    |
+| v4              | ADD(MUL(f32,f32),f32) | 28.727 GFLOPS    |
+| v4              | ADD(MUL(f64,f64),f64) | 14.353 GFLOPS    |
 --------------------------------------------------------------
 </pre>
 
@@ -27,10 +27,10 @@ Number Threads: 8
 Thread Pool Binding: 0 1 2 3 4 5 6 7
 --------------------------------------------------------------
 | Instruction Set | Core Computation      | Peak Performance |
-| v4              | ADD(MUL(f32,f32),f32) | 229.76 GFLOPS    |
-| v4              | ADD(MUL(f64,f64),f64) | 114.89 GFLOPS    |
 | v5              | ADD(MUL(f32,f32),f32) | 459.61 GFLOPS    |
 | v5              | ADD(MUL(f64,f64),f64) | 229.72 GFLOPS    |
+| v4              | ADD(MUL(f32,f32),f32) | 229.76 GFLOPS    |
+| v4              | ADD(MUL(f64,f64),f64) | 114.89 GFLOPS    |
 --------------------------------------------------------------
 </pre>
 
@@ -42,10 +42,10 @@ Number Threads: 32
 Thread Pool Binding: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
 --------------------------------------------------------------
 | Instruction Set | Core Computation      | Peak Performance |
-| v4              | ADD(MUL(f32,f32),f32) | 917.56 GFLOPS    |
-| v4              | ADD(MUL(f64,f64),f64) | 458.77 GFLOPS    |
 | v5              | ADD(MUL(f32,f32),f32) | 1.835 TFLOPS     |
 | v5              | ADD(MUL(f64,f64),f64) | 917.64 GFLOPS    |
+| v4              | ADD(MUL(f32,f32),f32) | 917.56 GFLOPS    |
+| v4              | ADD(MUL(f64,f64),f64) | 458.77 GFLOPS    |
 --------------------------------------------------------------
 </pre>
 

diff --git a/e2k/asm.S b/e2k/asm.S
@@ -39,6 +39,22 @@
         rwd,0 %g16, %lsr
         nop 3 // NOTE: low delay may lead to undefined behaviour
     }
+
+    // NOTE: `{,p,qp}fmul_add{s,d}` has a latency of 8 cycles, thus we need at
+    // least 8 registers for each channel.
+    //
+    // Example for ALC0:
+    // I:   read (b0)   write (b20)
+    // 0:   r8          r28     # use r28 as dst
+    // 1:   r10         r30
+    // 2:   r12         r32
+    // 3:   r16         r34
+    // 4:   r18         r36     # r28 is ready if fmuld (4)
+    // 5:   r20         r38     # r28 is ready if fmad (5)
+    // 6:   r22         r40
+    // 7:   r24         r42
+    // 8:   r26         r44     # r28 is ready if fmul_addd (8)
+    // 9:   r28         r46     # read from r28 (+1 just in case)
 1:
     {
         loop_mode
@@ -49,6 +65,7 @@
         \op,3 %b[1],  %b[1],  %b[1],  %b[21]
         \op,4 %b[21], %b[21], %b[21], %b[41]
 #if __iset__ >= 4
+        // NOTE: v1-v3 do not support fops in ALC2/ALC5
         \op,2 %b[40], %b[40], %b[40], %b[0]
         \op,5 %b[41], %b[41], %b[41], %b[1]
 #endif
@@ -62,15 +79,15 @@
 
     .text
 
-    impl_bench pfmul_adds, pfmul_adds
-    impl_bench fmul_addd, fmul_addd
-
 #if __iset__ >= 5
-    impl_bench qpfmul_adds, qpfmul_adds
-    impl_bench qpfmul_addd, qpfmul_addd
+    impl_bench bench_qpfmul_adds, qpfmul_adds
+    impl_bench bench_qpfmul_addd, qpfmul_addd
 #endif
 
 #if __iset__ >= 6
-    impl_bench qpfmas, qpfmas
-    impl_bench qpfmad, qpfmad
+    impl_bench bench_qpfmas, qpfmas
+    impl_bench bench_qpfmad, qpfmad
 #endif
+
+    impl_bench bench_pfmul_adds, pfmul_adds
+    impl_bench bench_fmul_addd, fmul_addd
diff --git a/e2k/cpufp.cpp b/e2k/cpufp.cpp
@@ -14,18 +14,19 @@ using namespace std;
 
 extern "C"
 {
-    void pfmul_adds(int64_t, void *params);
-    void fmul_addd(int64_t, void *params);
 
-#if __iset__ >= 5
-    void qpfmul_adds(int64_t, void *params);
-    void qpfmul_addd(int64_t, void *params);
+#if __iset__ >= 6
+    void bench_qpfmas(int64_t, void *params);
+    void bench_qpfmad(int64_t, void *params);
 #endif
 
-#if __iset__ >= 6
-    void qpfmas(int64_t, void *params);
-    void qpfmad(int64_t, void *params);
+#if __iset__ >= 5
+    void bench_qpfmul_adds(int64_t, void *params);
+    void bench_qpfmul_addd(int64_t, void *params);
 #endif
+
+    void bench_pfmul_adds(int64_t, void *params);
+    void bench_fmul_addd(int64_t, void *params);
 }
 
 typedef struct
@@ -251,30 +252,30 @@ static void cpufp_register_isa()
     // NOTE: do not use values greater than UINT32_MAX
     const uint32_t loop_time = 0x20000000;
 
-#if __iset__ <= 3
-    reg_new_isa("v1", "ADD(MUL(f32,f32),f32)", "FLOPS",
-        loop_time, 16LL, NULL, pfmul_adds);
-    reg_new_isa("v1", "ADD(MUL(f64,f64),f64)", "FLOPS",
-        loop_time,  8LL, NULL, fmul_addd);
-#elif __iset__ >= 4
-    reg_new_isa("v4", "ADD(MUL(f32,f32),f32)", "FLOPS",
-        loop_time, 24LL, NULL, pfmul_adds);
-    reg_new_isa("v4", "ADD(MUL(f64,f64),f64)", "FLOPS",
-        loop_time, 12LL, NULL, fmul_addd);
+#if __iset__ >= 6
+    reg_new_isa("v6", "FMA(f32,f32,f32)", "FLOPS",
+        loop_time, 48LL, NULL, bench_qpfmas);
+    reg_new_isa("v6", "FMA(f64,f64,f64)", "FLOPS",
+        loop_time, 24LL, NULL, bench_qpfmad);
 #endif
 
 #if __iset__ >= 5
-    reg_new_isa("v5", "ADD(MUL(f32,f32),f32)", "FLOPS",
-        loop_time, 48LL, NULL, qpfmul_adds);
-    reg_new_isa("v5", "ADD(MUL(f64,f64),f64)", "FLOPS",
-        loop_time, 24LL, NULL, qpfmul_addd);
+    reg_new_isa("v5", "ADD(f32,MUL(f32,f32))", "FLOPS",
+        loop_time, 48LL, NULL, bench_qpfmul_adds);
+    reg_new_isa("v5", "ADD(f64,MUL(f64,f64))", "FLOPS",
+        loop_time, 24LL, NULL, bench_qpfmul_addd);
 #endif
 
-#if __iset__ >= 6
-    reg_new_isa("v6", "FMA(f32,f32,f32)", "FLOPS",
-        loop_time, 48LL, NULL, qpfmas);
-    reg_new_isa("v6", "FMA(f64,f64,f64)", "FLOPS",
-        loop_time, 24LL, NULL, qpfmad);
+#if __iset__ >= 4
+    reg_new_isa("v4", "ADD(f32,MUL(f32,f32))", "FLOPS",
+        loop_time, 24LL, NULL, bench_pfmul_adds);
+    reg_new_isa("v4", "ADD(f64,MUL(f64,f64))", "FLOPS",
+        loop_time, 12LL, NULL, bench_fmul_addd);
+#else
+    reg_new_isa("v1", "ADD(f32,MUL(f32,f32))", "FLOPS",
+        loop_time, 16LL, NULL, bench_pfmul_adds);
+    reg_new_isa("v1", "ADD(f64,MUL(f64,f64))", "FLOPS",
+        loop_time,  8LL, NULL, bench_fmul_addd);
 #endif
 }
 
@@ -316,4 +317,3 @@ int main(int argc, char *argv[])
 
     return 0;
 }
-