From 905ded711f93fa3c94faa7623a5093525338fdeb Mon Sep 17 00:00:00 2001
From: athena
Date: Sun, 28 Oct 2012 18:42:48 -0400
Subject: [PATCH] make the index-computation logic less paranoid

The problem is that for each K and for each expression of the form
P[I + STRIDE * K] in a loop, most compilers will try to lift an
induction variable PK := &P[I + STRIDE * K].  In large codelets we
have many such values of K.  For example, a codelet of size 32 with 4
input pointers will generate O(128) induction variables, which will
likely overflow the register set, which is likely worse than doing
the index computation in the first place.

In the past we (wisely and correctly) assumed that compilers will do
the wrong thing, and consequently we disabled the induction-variable
"optimization" altogether by setting STRIDE ^= ZERO, where ZERO is a
value guaranteed to be 0.  Since the compiler does not know that
ZERO=0, it cannot perform its "optimization" and it is forced to
behave sensibly.

With this patch, FFTW is a little bit less paranoid.  FFTW now
disables the induction-variable "optimization" only when we estimate
that the codelet uses more than ESTIMATED_AVAILABLE_INDEX_REGISTERS
induction variables.  Currently we set
ESTIMATED_AVAILABLE_INDEX_REGISTERS=16.  16 registers ought to be
enough for anybody (or so the amd64 and ARM ISA's seem to imply).
---
 genfft/gen_hc2c.ml      |  2 +-
 genfft/gen_hc2cdft.ml   |  2 +-
 genfft/gen_hc2cdft_c.ml |  2 +-
 genfft/gen_hc2hc.ml     |  2 +-
 genfft/gen_notw.ml      |  4 ++--
 genfft/gen_notw_c.ml    |  4 ++--
 genfft/gen_r2cb.ml      |  6 +++---
 genfft/gen_r2cf.ml      |  6 +++---
 genfft/gen_r2r.ml       |  4 ++--
 genfft/gen_twiddle.ml   |  2 +-
 genfft/gen_twiddle_c.ml |  2 +-
 genfft/gen_twidsq.ml    |  4 ++--
 genfft/gen_twidsq_c.ml  |  4 ++--
 genfft/genutil.ml       |  3 ++-
 kernel/ifftw.h          | 25 +++++++++++++++++++++----
 15 files changed, 45 insertions(+), 27 deletions(-)

diff --git a/genfft/gen_hc2c.ml b/genfft/gen_hc2c.ml
index 788ffba5b..14bf40868 100644
--- a/genfft/gen_hc2c.ml
+++ b/genfft/gen_hc2c.ml
@@ -138,7 +138,7 @@ let generate n =
 	       Expr_assign (vaim, CPlus [vaim; CUminus (byvl vms)]);
 	       Expr_assign (CVar twarray, CPlus [CVar twarray;
 						 byvl (Integer nt)]);
-	       make_volatile_stride (CVar rs)
+	       make_volatile_stride (4*n) (CVar rs)
 	     ],
 	    Asch asch)])
   in
diff --git a/genfft/gen_hc2cdft.ml b/genfft/gen_hc2cdft.ml
index 0fcce6e7f..268b85b91 100644
--- a/genfft/gen_hc2cdft.ml
+++ b/genfft/gen_hc2cdft.ml
@@ -159,7 +159,7 @@ let generate n =
 	       Expr_assign (vaim, CPlus [vaim; CUminus (byvl vms)]);
 	       Expr_assign (CVar twarray, CPlus [CVar twarray;
 						 byvl (Integer nt)]);
-	       make_volatile_stride (CVar rs)
+	       make_volatile_stride (4*n) (CVar rs)
 	     ],
 	    Asch asch)]
       )
diff --git a/genfft/gen_hc2cdft_c.ml b/genfft/gen_hc2cdft_c.ml
index c83f7169a..1a4e9aeef 100644
--- a/genfft/gen_hc2cdft_c.ml
+++ b/genfft/gen_hc2cdft_c.ml
@@ -171,7 +171,7 @@ let generate n =
 	       Expr_assign (vaim, CPlus [vaim; CUminus (byvl vms)]);
 	       Expr_assign (CVar twarray, CPlus [CVar twarray;
 						 bytwvl (Integer nt)]);
-	       make_volatile_stride (CVar rs)
+	       make_volatile_stride (4*n) (CVar rs)
 	     ],
 	    Asch asch)]
       )
diff --git a/genfft/gen_hc2hc.ml b/genfft/gen_hc2hc.ml
index 47288be90..192caa57e 100644
--- a/genfft/gen_hc2hc.ml
+++ b/genfft/gen_hc2hc.ml
@@ -124,7 +124,7 @@ let generate n =
 				     CPlus [viioarray; CUminus (byvl vms)]);
 	       Expr_assign (CVar twarray, CPlus [CVar twarray;
 						 byvl (Integer nt)]);
-	       make_volatile_stride (CVar rs)
+	       make_volatile_stride (2*n) (CVar rs)
 	     ],
 	    Asch asch)])
   in
diff --git a/genfft/gen_notw.ml b/genfft/gen_notw.ml
index 461eb936e..69bc94037 100644
--- a/genfft/gen_notw.ml
+++ b/genfft/gen_notw.ml
@@ -122,8 +122,8 @@ let generate n =
 				    byvl (CVar sovs)]);
 	  Expr_assign (CVar ioarray, CPlus [CVar ioarray;
 				    byvl (CVar sovs)]);
-	  make_volatile_stride (CVar istride);
-	  make_volatile_stride (CVar ostride)
+	  make_volatile_stride (4*n) (CVar istride);
+	  make_volatile_stride (4*n) (CVar ostride)
 	],
 	Asch annot)
       ])
diff --git a/genfft/gen_notw_c.ml b/genfft/gen_notw_c.ml
index 3b5853f3a..86609dad4 100644
--- a/genfft/gen_notw_c.ml
+++ b/genfft/gen_notw_c.ml
@@ -117,8 +117,8 @@ let generate n =
 				    byvl (CVar sivs)]);
 	  Expr_assign (CVar roarray, CPlus [CVar roarray;
 				    byvl (CVar sovs)]);
-	  make_volatile_stride (CVar istride);
-	  make_volatile_stride (CVar ostride)
+	  make_volatile_stride (2*n) (CVar istride);
+	  make_volatile_stride (2*n) (CVar ostride)
 	],
 	Asch annot);
       ])
diff --git a/genfft/gen_r2cb.ml b/genfft/gen_r2cb.ml
index f1335fcd4..074b3ab7b 100644
--- a/genfft/gen_r2cb.ml
+++ b/genfft/gen_r2cb.ml
@@ -121,9 +121,9 @@ let generate n =
 	  Expr_assign (CVar ar1, CPlus [CVar ar1; CVar sovs]);
 	  Expr_assign (CVar acr, CPlus [CVar acr; CVar sivs]);
 	  Expr_assign (CVar aci, CPlus [CVar aci; CVar sivs]);
-	  make_volatile_stride (CVar rs);
-	  make_volatile_stride (CVar csr);
-	  make_volatile_stride (CVar csi)
+	  make_volatile_stride (4*n) (CVar rs);
+	  make_volatile_stride (4*n) (CVar csr);
+	  make_volatile_stride (4*n) (CVar csi)
 	],
 	Asch annot)
       ])
diff --git a/genfft/gen_r2cf.ml b/genfft/gen_r2cf.ml
index 47d105285..a23c377bb 100644
--- a/genfft/gen_r2cf.ml
+++ b/genfft/gen_r2cf.ml
@@ -118,9 +118,9 @@ let generate n =
 	  Expr_assign (CVar ar1, CPlus [CVar ar1; CVar sivs]);
 	  Expr_assign (CVar acr, CPlus [CVar acr; CVar sovs]);
 	  Expr_assign (CVar aci, CPlus [CVar aci; CVar sovs]);
-	  make_volatile_stride (CVar rs);
-	  make_volatile_stride (CVar csr);
-	  make_volatile_stride (CVar csi)
+	  make_volatile_stride (4*n) (CVar rs);
+	  make_volatile_stride (4*n) (CVar csr);
+	  make_volatile_stride (4*n) (CVar csi)
 	],
 	Asch annot)
       ])
diff --git a/genfft/gen_r2r.ml b/genfft/gen_r2r.ml
index b926359fd..a63c910bb 100644
--- a/genfft/gen_r2r.ml
+++ b/genfft/gen_r2r.ml
@@ -197,8 +197,8 @@ let generate n mode =
 	 [Expr_assign (CVar i, CPlus [CVar i; CUminus (Integer 1)]);
 	  Expr_assign (CVar iarray, CPlus [CVar iarray; CVar sivs]);
 	  Expr_assign (CVar oarray, CPlus [CVar oarray; CVar sovs]);
-	  make_volatile_stride (CVar istride);
-	  make_volatile_stride (CVar ostride)
+	  make_volatile_stride (2*n) (CVar istride);
+	  make_volatile_stride (2*n) (CVar ostride)
 	],
 	Asch annot)
       ])
diff --git a/genfft/gen_twiddle.ml b/genfft/gen_twiddle.ml
index 35bdf7bd4..6a8dc1659 100644
--- a/genfft/gen_twiddle.ml
+++ b/genfft/gen_twiddle.ml
@@ -108,7 +108,7 @@ let generate n =
 						 byvl (CVar sms)]);
 	       Expr_assign (CVar twarray, CPlus [CVar twarray;
 						 byvl (Integer nt)]);
-	       make_volatile_stride (CVar rs)
+	       make_volatile_stride (2*n) (CVar rs)
 	     ],
 	    Asch annot)])
   in
diff --git a/genfft/gen_twiddle_c.ml b/genfft/gen_twiddle_c.ml
index 1896d3569..6fea68fe7 100644
--- a/genfft/gen_twiddle_c.ml
+++ b/genfft/gen_twiddle_c.ml
@@ -112,7 +112,7 @@ let generate n =
 						 byvl (CVar sms)]);
 	       Expr_assign (CVar twarray, CPlus [CVar twarray;
 						 bytwvl (Integer nt)]);
-	       make_volatile_stride (CVar rs)
+	       make_volatile_stride n (CVar rs)
 	     ],
 	    Asch annot)])
   in
diff --git a/genfft/gen_twidsq.ml b/genfft/gen_twidsq.ml
index 7d5802283..aee6b01ea 100644
--- a/genfft/gen_twidsq.ml
+++ b/genfft/gen_twidsq.ml
@@ -123,8 +123,8 @@ let generate n =
 	     Expr_assign (CVar rioarray, CPlus [CVar rioarray; CVar ms]);
 	     Expr_assign (CVar iioarray, CPlus [CVar iioarray; CVar ms]);
 	     Expr_assign (CVar twarray, CPlus [CVar twarray; Integer nt]);
-	     make_volatile_stride (CVar rs);
-	     make_volatile_stride (CVar vs)
+	     make_volatile_stride (2*n) (CVar rs);
+	     make_volatile_stride (2*n) (CVar vs)
 	   ],
 	   Asch annot)])
   in
diff --git a/genfft/gen_twidsq_c.ml b/genfft/gen_twidsq_c.ml
index 2a443ec31..ecbe3f7e5 100644
--- a/genfft/gen_twidsq_c.ml
+++ b/genfft/gen_twidsq_c.ml
@@ -133,8 +133,8 @@ let generate n =
 					       byvl (CVar sms)]);
 	     Expr_assign (CVar twarray, CPlus [CVar twarray;
 					       bytwvl (Integer nt)]);
-	     make_volatile_stride (CVar rs);
-	     make_volatile_stride (CVar vs)
+	     make_volatile_stride (2*n) (CVar rs);
+	     make_volatile_stride (2*n) (CVar vs)
 	   ],
 	   Asch annot)])
   in
diff --git a/genfft/genutil.ml b/genfft/genutil.ml
index 96c68802d..b8f70a5d3 100644
--- a/genfft/genutil.ml
+++ b/genfft/genutil.ml
@@ -324,4 +324,5 @@ let twinstr_to_string vl x =
   else
     Twiddle.twinstr_to_c_string x
 
-let make_volatile_stride x = C.CCall ("MAKE_VOLATILE_STRIDE", x)
+let make_volatile_stride n x =
+  C.CCall ("MAKE_VOLATILE_STRIDE", C.Comma((C.Integer n), x))
diff --git a/kernel/ifftw.h b/kernel/ifftw.h
index 932767ce5..b37007b81 100644
--- a/kernel/ifftw.h
+++ b/kernel/ifftw.h
@@ -827,7 +827,7 @@ extern stride X(mkstride)(INT n, INT s);
 void X(stride_destroy)(stride p);
 /* hackery to prevent the compiler from copying the strides array
    onto the stack */
-#define MAKE_VOLATILE_STRIDE(x) (x) = (x) + X(an_INT_guaranteed_to_be_zero)
+#define MAKE_VOLATILE_STRIDE(nptr, x) (x) = (x) + X(an_INT_guaranteed_to_be_zero)
 #else
 
 typedef INT stride;
@@ -840,9 +840,26 @@ typedef INT stride;
 #define fftwl_stride_destroy(p) ((void) p)
 
 /* hackery to prevent the compiler from ``optimizing'' induction
-   variables in codelet loops. */
-#define MAKE_VOLATILE_STRIDE(x) (x) = (x) ^ X(an_INT_guaranteed_to_be_zero)
-
+   variables in codelet loops.  The problem is that for each K and for
+   each expression of the form P[I + STRIDE * K] in a loop, most
+   compilers will try to lift an induction variable PK := &P[I + STRIDE * K].
+   For large values of K this behavior overflows the
+   register set, which is likely worse than doing the index computation
+   in the first place.
+
+   If we guess that there are more than
+   ESTIMATED_AVAILABLE_INDEX_REGISTERS such pointers, we deliberately confuse
+   the compiler by setting STRIDE ^= ZERO, where ZERO is a value guaranteed to
+   be 0, but the compiler does not know this.
+
+   16 registers ought to be enough for anybody, or so the amd64 and ARM ISA's
+   seem to imply.
+*/
+#define ESTIMATED_AVAILABLE_INDEX_REGISTERS 16
+#define MAKE_VOLATILE_STRIDE(nptr, x)                   \
+     (nptr <= ESTIMATED_AVAILABLE_INDEX_REGISTERS ?     \
+        0 :                                             \
+      ((x) = (x) ^ X(an_INT_guaranteed_to_be_zero)))
 #endif /* PRECOMPUTE_ARRAY_INDICES */
 
 /*-----------------------------------------------------------------------*/
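
For context, here is a minimal standalone C sketch (not part of the patch) of the behavior
the reworked two-argument macro selects.  It mimics the shape of a generated codelet; the
names are invented stand-ins, and the plain extern below replaces the mangled FFTW symbol
X(an_INT_guaranteed_to_be_zero).

/* Illustrative sketch only; not part of the patch above. */

/* Defined elsewhere and always 0, but the compiler cannot prove it. */
extern int an_INT_guaranteed_to_be_zero;

#define ESTIMATED_AVAILABLE_INDEX_REGISTERS 16

/* Same shape as the patched macro: pessimize the stride only when the
   codelet is estimated to need more index registers than are available. */
#define MAKE_VOLATILE_STRIDE(nptr, x)                  \
     ((nptr) <= ESTIMATED_AVAILABLE_INDEX_REGISTERS ?  \
      0 :                                              \
      ((x) = (x) ^ an_INT_guaranteed_to_be_zero))

/* Toy "codelet": each R[k * rs] tempts the compiler to lift a pointer
   induction variable R + k * rs across iterations of the outer loop. */
void toy_codelet(double *R, int rs, int v, int vs)
{
     int i;

     /* Only four strided pointers here, so the stride is left untouched
        and the compiler may precompute the addresses.  A large codelet
        would pass, say, 64 and take the XOR branch instead, forcing the
        index arithmetic to stay inside the loop. */
     MAKE_VOLATILE_STRIDE(4, rs);

     for (i = 0; i < v; ++i, R += vs) {
          double a = R[0 * rs], b = R[1 * rs];
          double c = R[2 * rs], d = R[3 * rs];
          R[0 * rs] = a + c;    /* stand-in for the real butterfly math */
          R[1 * rs] = b + d;
          R[2 * rs] = a - c;
          R[3 * rs] = b - d;
     }
}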