Skip to content

Commit

Permalink
shorten ESTIMATE planning time for certain weird sizes
Browse files Browse the repository at this point in the history
FFTW includes a collection of "solvers" that apply to a subset of
"problems".  Assume for simplicity that a "problem" is a single 1D
complex transform of size N, even though real "problems" are much more
general than that.  FFTW includes three "prime" solvers called
"generic", "bluestein", and "rader", which implement different
algorithms for prime sizes.

Now, for a "problem" of size 13 (say) FFTW also includes special code
that handles that size at high speed.  It would be a waste of time to
measure the execution time of the prime solvers, since we know that
the special code is way faster.  However, FFTW is modular and one may
or may not include the special code for size 13, in which case we must
resort to one of the "prime" solvers.  To address this issue, the
"prime" solvers (and others) are proclaimed to be SLOW".  When
planning, FFTW first tries to produce a plan ignoring all the SLOW
solvers, and if this fails FFTW tries again allowing SLOW solvers.

This heuristic works ok unless the sizes are too large.  For example
for 1044000=2*2*2*2*2*3*3*5*5*5*29 FFTW explores a huge search tree of
all zillion factorizations of 1044000/29, failing every time because
29 is SLOW; then it finally allows SLOW solvers and finds a solution
immediately.

This patch proclaims solvers to be SLOW only for small values of N.
For example, the "generic" solver implements an O(n^2) DFT algorithm;
we say that it is SLOW only for N<=16.

The side effects of this choice are as follows.  If one modifies FFTW to
include a fast solver of size 17, then planning for N=17*K will be
slower than today, because FFTW till try both the fast solver and the
generic solver (which is SLOW today and therefore not tried, but is no
longer SLOW after the patch).  If one removes a fast solver, of size say
13, then he may still fall into the current exponential-search behavior
for "problems" of size 13*HIGHLY_FACTORIZABLE_N.

If somebody had compleined about transforms of size 1044000 ten years
ago, "don't do that" would have been an acceptable answer.  I guess the
bar is higher today, so I am going to include this patch in our 3.3.1
release despite their side-effects for people who want to modify FFTW.
  • Loading branch information
matteo-frigo committed Sep 3, 2011
1 parent 610f797 commit f004d76
Show file tree
Hide file tree
Showing 7 changed files with 53 additions and 48 deletions.
19 changes: 14 additions & 5 deletions dft/bluestein.c
Original file line number Diff line number Diff line change
Expand Up @@ -163,9 +163,19 @@ static int applicable(const solver *ego, const problem *p_,
const planner *plnr)
{
UNUSED(ego);
if (NO_SLOWP(plnr)) return 0;
if (!applicable0(p_)) return 0;
return 1;
const problem_dft *p = (const problem_dft *) p_;
return (1
&& p->sz->rnk == 1
&& p->vecsz->rnk == 0
/* FIXME: allow other sizes */
&& X(is_prime)(p->sz->dims[0].n)

/* FIXME: avoid infinite recursion of bluestein with itself.
This works because all factors in child problems are 2, 3, 5 */
&& p->sz->dims[0].n > 16

&& CIMPLIES(NO_SLOWP(plnr), p->sz->dims[0].n > BLUESTEIN_MAX_SLOW)
);
}

static void destroy(plan *ego_)
Expand All @@ -183,8 +193,7 @@ static void print(const plan *ego_, printer *p)

static INT choose_transform_size(INT minsz)
{
static const INT primes[] = { 2, 3, 5, 0 };
while (!X(factors_into)(minsz, primes))
while (!X(factors_into_small_primes)(minsz))
++minsz;
return minsz;
}
Expand Down
21 changes: 6 additions & 15 deletions dft/generic.c
Original file line number Diff line number Diff line change
Expand Up @@ -109,31 +109,22 @@ static void print(const plan *ego_, printer *p)
p->print(p, "(dft-generic-%D)", ego->n);
}

static int applicable0(const problem *p_)
static int applicable(const solver *ego, const problem *p_,
const planner *plnr)
{
UNUSED(ego);
const problem_dft *p = (const problem_dft *) p_;

return (1
&& p->sz->rnk == 1
&& p->vecsz->rnk == 0
&& (p->sz->dims[0].n % 2) == 1
&& CIMPLIES(NO_LARGE_GENERICP(plnr), p->sz->dims[0].n < GENERIC_MIN_BAD)
&& CIMPLIES(NO_SLOWP(plnr), p->sz->dims[0].n > GENERIC_MAX_SLOW)
&& X(is_prime)(p->sz->dims[0].n)
);
}

static int applicable(const solver *ego, const problem *p_,
const planner *plnr)
{
UNUSED(ego);
if (NO_SLOWP(plnr)) return 0;
if (!applicable0(p_)) return 0;

if (NO_LARGE_GENERICP(plnr)) {
const problem_dft *p = (const problem_dft *) p_;
if (p->sz->dims[0].n >= GENERIC_MIN_BAD) return 0;
}
return 1;
}

static plan *mkplan(const solver *ego, const problem *p_, planner *plnr)
{
const problem_dft *p;
Expand Down
14 changes: 7 additions & 7 deletions dft/rader.c
Original file line number Diff line number Diff line change
Expand Up @@ -210,21 +210,21 @@ static void print(const plan *ego_, printer *p)
p->putchr(p, ')');
}

static int applicable0(const solver *ego_, const problem *p_)
static int applicable(const solver *ego_, const problem *p_,
const planner *plnr)
{
const problem_dft *p = (const problem_dft *) p_;
UNUSED(ego_);
return (1
&& p->sz->rnk == 1
&& p->vecsz->rnk == 0
&& CIMPLIES(NO_SLOWP(plnr), p->sz->dims[0].n > RADER_MAX_SLOW)
&& X(is_prime)(p->sz->dims[0].n)
);
}

static int applicable(const solver *ego_, const problem *p_,
const planner *plnr)
{
return (!NO_SLOWP(plnr) && applicable0(ego_, p_));
/* proclaim the solver SLOW if p-1 is not easily factorizable.
Bluestein should take care of this case. */
&& CIMPLIES(NO_SLOWP(plnr), X(factors_into_small_primes)(p->sz->dims[0].n - 1))
);
}

static int mkP(P *pln, INT n, INT is, INT os, R *ro, R *io,
Expand Down
8 changes: 8 additions & 0 deletions kernel/ifftw.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ extern void X(extract_reim)(int sign, R *c, R **r, R **i);

#define STRINGIZEx(x) #x
#define STRINGIZE(x) STRINGIZEx(x)
#define CIMPLIES(ante, post) (!(ante) || (post))

/* define HAVE_SIMD if any simd extensions are supported */
#if defined(HAVE_SSE) || defined(HAVE_SSE2) || defined(HAVE_ALTIVEC) || \
Expand Down Expand Up @@ -923,12 +924,19 @@ INT X(first_divisor)(INT n);
int X(is_prime)(INT n);
INT X(next_prime)(INT n);
int X(factors_into)(INT n, const INT *primes);
int X(factors_into_small_primes)(INT n);
INT X(choose_radix)(INT r, INT n);
INT X(isqrt)(INT n);
INT X(modulo)(INT a, INT n);

#define GENERIC_MIN_BAD 173 /* min prime for which generic becomes bad */

/* thresholds below which certain solvers are considered SLOW. These are guesses
believed to be conservative */
#define GENERIC_MAX_SLOW 16
#define RADER_MAX_SLOW 32
#define BLUESTEIN_MAX_SLOW 24

/*-----------------------------------------------------------------------*/
/* rader.c: */
typedef struct rader_tls rader_tl;
Expand Down
6 changes: 6 additions & 0 deletions kernel/primes.c
Original file line number Diff line number Diff line change
Expand Up @@ -204,3 +204,9 @@ INT X(modulo)(INT a, INT n)
return (n - 1) - ((-(a + (INT)1)) % n);
}

/* TRUE if N factors into small primes */
int X(factors_into_small_primes)(INT n)
{
static const INT primes[] = { 2, 3, 5, 0 };
return X(factors_into)(n, primes);
}
15 changes: 8 additions & 7 deletions rdft/dht-rader.c
Original file line number Diff line number Diff line change
Expand Up @@ -246,24 +246,25 @@ static void print(const plan *ego_, printer *p)
p->putchr(p, ')');
}

static int applicable0(const problem *p_)
static int applicable(const solver *ego, const problem *p_, const planner *plnr)
{
UNUSED(ego);
const problem_rdft *p = (const problem_rdft *) p_;
return (1
&& p->sz->rnk == 1
&& p->vecsz->rnk == 0
&& p->kind[0] == DHT
&& X(is_prime)(p->sz->dims[0].n)
&& p->sz->dims[0].n > 2
&& CIMPLIES(NO_SLOWP(plnr), p->sz->dims[0].n > RADER_MAX_SLOW)
/* proclaim the solver SLOW if p-1 is not easily
factorizable. Unlike in the complex case where
Bluestein can solve the problem, in the DHT case we
may have no other choice */
&& CIMPLIES(NO_SLOWP(plnr), X(factors_into_small_primes)(p->sz->dims[0].n - 1))
);
}

static int applicable(const solver *ego, const problem *p, const planner *plnr)
{
UNUSED(ego);
return (!NO_SLOWP(plnr) && applicable0(p));
}

static INT choose_transform_size(INT minsz)
{
static const INT primes[] = { 2, 3, 5, 0 };
Expand Down
18 changes: 4 additions & 14 deletions rdft/generic.c
Original file line number Diff line number Diff line change
Expand Up @@ -168,31 +168,21 @@ static void print(const plan *ego_, printer *p)
ego->n);
}

static int applicable0(const S *ego, const problem *p_)
static int applicable(const S *ego, const problem *p_,
const planner *plnr)
{
const problem_rdft *p = (const problem_rdft *) p_;
return (1
&& p->sz->rnk == 1
&& p->vecsz->rnk == 0
&& (p->sz->dims[0].n % 2) == 1
&& CIMPLIES(NO_LARGE_GENERICP(plnr), p->sz->dims[0].n < GENERIC_MIN_BAD)
&& CIMPLIES(NO_SLOWP(plnr), p->sz->dims[0].n > GENERIC_MAX_SLOW)
&& X(is_prime)(p->sz->dims[0].n)
&& p->kind[0] == ego->kind
);
}

static int applicable(const S *ego, const problem *p_,
const planner *plnr)
{
if (NO_SLOWP(plnr)) return 0;
if (!applicable0(ego, p_)) return 0;

if (NO_LARGE_GENERICP(plnr)) {
const problem_rdft *p = (const problem_rdft *) p_;
if (p->sz->dims[0].n >= GENERIC_MIN_BAD) return 0;
}
return 1;
}

static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
{
const S *ego = (const S *)ego_;
Expand Down

0 comments on commit f004d76

Please sign in to comment.