Skip to content

Commit

Permalink
make the index-computation logic less paranoid
Browse files Browse the repository at this point in the history
The problem is that for each K and for each expression of the form P[I
+ STRIDE * K] in a loop, most compilers will try to lift an induction
variable PK := &P[I + STRIDE * K].  In large codelets we have many
such values of K.  For example, a codelet of size 32 with 4 input
pointers will generate O(128) induction variables, which will likely
overflow the register set, which is likely worse than doing the index
computation in the first place.

In the past we (wisely and correctly) assumed that compilers will do
the wrong thing, and consequently we disabled the induction-variable
"optimization" altogether by setting STRIDE ^= ZERO, where ZERO is a
value guaranteed to be 0.  Since the compiler does not know that
ZERO=0, it cannot perform its "optimization" and it is forced to
behave sensibly.

With this patch, FFTW is a little bit less paranoid.  FFTW now
disables the induction-variable "optimization" only when we estimate
that the codelet uses more than ESTIMATED_AVAILABLE_INDEX_REGISTERS
induction variables.

Currently we set ESTIMATED_AVAILABLE_INDEX_REGISTERS=16.  16 registers ought
to be enough for anybody (or so the amd64 and ARM ISAs seem to imply).
  • Loading branch information
matteo-frigo committed Oct 28, 2012
1 parent 1dacef5 commit 905ded7
Show file tree
Hide file tree
Showing 15 changed files with 45 additions and 27 deletions.
2 changes: 1 addition & 1 deletion genfft/gen_hc2c.ml
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ let generate n =
Expr_assign (vaim, CPlus [vaim; CUminus (byvl vms)]);
Expr_assign (CVar twarray, CPlus [CVar twarray;
byvl (Integer nt)]);
make_volatile_stride (CVar rs)
make_volatile_stride (4*n) (CVar rs)
],
Asch asch)])
in
Expand Down
2 changes: 1 addition & 1 deletion genfft/gen_hc2cdft.ml
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ let generate n =
Expr_assign (vaim, CPlus [vaim; CUminus (byvl vms)]);
Expr_assign (CVar twarray, CPlus [CVar twarray;
byvl (Integer nt)]);
make_volatile_stride (CVar rs)
make_volatile_stride (4*n) (CVar rs)
],
Asch asch)]
)
Expand Down
2 changes: 1 addition & 1 deletion genfft/gen_hc2cdft_c.ml
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ let generate n =
Expr_assign (vaim, CPlus [vaim; CUminus (byvl vms)]);
Expr_assign (CVar twarray, CPlus [CVar twarray;
bytwvl (Integer nt)]);
make_volatile_stride (CVar rs)
make_volatile_stride (4*n) (CVar rs)
],
Asch asch)]
)
Expand Down
2 changes: 1 addition & 1 deletion genfft/gen_hc2hc.ml
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ let generate n =
CPlus [viioarray; CUminus (byvl vms)]);
Expr_assign (CVar twarray, CPlus [CVar twarray;
byvl (Integer nt)]);
make_volatile_stride (CVar rs)
make_volatile_stride (2*n) (CVar rs)
],
Asch asch)])
in
Expand Down
4 changes: 2 additions & 2 deletions genfft/gen_notw.ml
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,8 @@ let generate n =
byvl (CVar sovs)]);
Expr_assign (CVar ioarray, CPlus [CVar ioarray;
byvl (CVar sovs)]);
make_volatile_stride (CVar istride);
make_volatile_stride (CVar ostride)
make_volatile_stride (4*n) (CVar istride);
make_volatile_stride (4*n) (CVar ostride)
],
Asch annot)
])
Expand Down
4 changes: 2 additions & 2 deletions genfft/gen_notw_c.ml
Original file line number Diff line number Diff line change
Expand Up @@ -117,8 +117,8 @@ let generate n =
byvl (CVar sivs)]);
Expr_assign (CVar roarray, CPlus [CVar roarray;
byvl (CVar sovs)]);
make_volatile_stride (CVar istride);
make_volatile_stride (CVar ostride)
make_volatile_stride (2*n) (CVar istride);
make_volatile_stride (2*n) (CVar ostride)
],
Asch annot);
])
Expand Down
6 changes: 3 additions & 3 deletions genfft/gen_r2cb.ml
Original file line number Diff line number Diff line change
Expand Up @@ -121,9 +121,9 @@ let generate n =
Expr_assign (CVar ar1, CPlus [CVar ar1; CVar sovs]);
Expr_assign (CVar acr, CPlus [CVar acr; CVar sivs]);
Expr_assign (CVar aci, CPlus [CVar aci; CVar sivs]);
make_volatile_stride (CVar rs);
make_volatile_stride (CVar csr);
make_volatile_stride (CVar csi)
make_volatile_stride (4*n) (CVar rs);
make_volatile_stride (4*n) (CVar csr);
make_volatile_stride (4*n) (CVar csi)
],
Asch annot)
])
Expand Down
6 changes: 3 additions & 3 deletions genfft/gen_r2cf.ml
Original file line number Diff line number Diff line change
Expand Up @@ -118,9 +118,9 @@ let generate n =
Expr_assign (CVar ar1, CPlus [CVar ar1; CVar sivs]);
Expr_assign (CVar acr, CPlus [CVar acr; CVar sovs]);
Expr_assign (CVar aci, CPlus [CVar aci; CVar sovs]);
make_volatile_stride (CVar rs);
make_volatile_stride (CVar csr);
make_volatile_stride (CVar csi)
make_volatile_stride (4*n) (CVar rs);
make_volatile_stride (4*n) (CVar csr);
make_volatile_stride (4*n) (CVar csi)
],
Asch annot)
])
Expand Down
4 changes: 2 additions & 2 deletions genfft/gen_r2r.ml
Original file line number Diff line number Diff line change
Expand Up @@ -197,8 +197,8 @@ let generate n mode =
[Expr_assign (CVar i, CPlus [CVar i; CUminus (Integer 1)]);
Expr_assign (CVar iarray, CPlus [CVar iarray; CVar sivs]);
Expr_assign (CVar oarray, CPlus [CVar oarray; CVar sovs]);
make_volatile_stride (CVar istride);
make_volatile_stride (CVar ostride)
make_volatile_stride (2*n) (CVar istride);
make_volatile_stride (2*n) (CVar ostride)
],
Asch annot)
])
Expand Down
2 changes: 1 addition & 1 deletion genfft/gen_twiddle.ml
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ let generate n =
byvl (CVar sms)]);
Expr_assign (CVar twarray, CPlus [CVar twarray;
byvl (Integer nt)]);
make_volatile_stride (CVar rs)
make_volatile_stride (2*n) (CVar rs)
],
Asch annot)])
in
Expand Down
2 changes: 1 addition & 1 deletion genfft/gen_twiddle_c.ml
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ let generate n =
byvl (CVar sms)]);
Expr_assign (CVar twarray, CPlus [CVar twarray;
bytwvl (Integer nt)]);
make_volatile_stride (CVar rs)
make_volatile_stride n (CVar rs)
],
Asch annot)])
in
Expand Down
4 changes: 2 additions & 2 deletions genfft/gen_twidsq.ml
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,8 @@ let generate n =
Expr_assign (CVar rioarray, CPlus [CVar rioarray; CVar ms]);
Expr_assign (CVar iioarray, CPlus [CVar iioarray; CVar ms]);
Expr_assign (CVar twarray, CPlus [CVar twarray; Integer nt]);
make_volatile_stride (CVar rs);
make_volatile_stride (CVar vs)
make_volatile_stride (2*n) (CVar rs);
make_volatile_stride (2*0) (CVar vs)
],
Asch annot)]) in

Expand Down
4 changes: 2 additions & 2 deletions genfft/gen_twidsq_c.ml
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,8 @@ let generate n =
byvl (CVar sms)]);
Expr_assign (CVar twarray, CPlus [CVar twarray;
bytwvl (Integer nt)]);
make_volatile_stride (CVar rs);
make_volatile_stride (CVar vs)
make_volatile_stride (2*n) (CVar rs);
make_volatile_stride (2*n) (CVar vs)
],
Asch annot)]) in

Expand Down
3 changes: 2 additions & 1 deletion genfft/genutil.ml
Original file line number Diff line number Diff line change
Expand Up @@ -324,4 +324,5 @@ let twinstr_to_string vl x =
else
Twiddle.twinstr_to_c_string x

let make_volatile_stride x = C.CCall ("MAKE_VOLATILE_STRIDE", x)
let make_volatile_stride n x =
C.CCall ("MAKE_VOLATILE_STRIDE", C.Comma((C.Integer n), x))
25 changes: 21 additions & 4 deletions kernel/ifftw.h
Original file line number Diff line number Diff line change
Expand Up @@ -827,7 +827,7 @@ extern stride X(mkstride)(INT n, INT s);
void X(stride_destroy)(stride p);
/* hackery to prevent the compiler from copying the strides array
onto the stack */
#define MAKE_VOLATILE_STRIDE(x) (x) = (x) + X(an_INT_guaranteed_to_be_zero)
#define MAKE_VOLATILE_STRIDE(nptr, x) (x) = (x) + X(an_INT_guaranteed_to_be_zero)
#else

typedef INT stride;
Expand All @@ -840,9 +840,26 @@ typedef INT stride;
#define fftwl_stride_destroy(p) ((void) p)

/* hackery to prevent the compiler from ``optimizing'' induction
variables in codelet loops. */
#define MAKE_VOLATILE_STRIDE(x) (x) = (x) ^ X(an_INT_guaranteed_to_be_zero)

variables in codelet loops. The problem is that for each K and for
each expression of the form P[I + STRIDE * K] in a loop, most
compilers will try to lift an induction variable PK := &P[I + STRIDE * K].
For large values of K this behavior overflows the
register set, which is likely worse than doing the index computation
in the first place.
If we guess that there are more than
ESTIMATED_AVAILABLE_INDEX_REGISTERS such pointers, we deliberately confuse
the compiler by setting STRIDE ^= ZERO, where ZERO is a value guaranteed to
be 0, but the compiler does not know this.
16 registers ought to be enough for anybody, or so the amd64 and ARM ISA's
seem to imply.
*/
#define ESTIMATED_AVAILABLE_INDEX_REGISTERS 16
#define MAKE_VOLATILE_STRIDE(nptr, x) \
(nptr <= ESTIMATED_AVAILABLE_INDEX_REGISTERS ? \
0 : \
((x) = (x) ^ X(an_INT_guaranteed_to_be_zero)))
#endif /* PRECOMPUTE_ARRAY_INDICES */

/*-----------------------------------------------------------------------*/
Expand Down

0 comments on commit 905ded7

Please sign in to comment.