shorten ESTIMATE planning time for certain weird sizes

FFTW includes a collection of "solvers" that apply to a subset of "problems". Assume for simplicity that a "problem" is a single 1D complex transform of size N, even though real "problems" are much more general than that. FFTW includes three "prime" solvers called "generic", "bluestein", and "rader", which implement different algorithms for prime sizes. Now, for a "problem" of size 13 (say) FFTW also includes special code that handles that size at high speed. It would be a waste of time to measure the execution time of the prime solvers, since we know that the special code is way faster. However, FFTW is modular and one may or may not include the special code for size 13, in which case we must resort to one of the "prime" solvers. To address this issue, the "prime" solvers (and others) are proclaimed to be SLOW". When planning, FFTW first tries to produce a plan ignoring all the SLOW solvers, and if this fails FFTW tries again allowing SLOW solvers. This heuristic works ok unless the sizes are too large. For example for 1044000=2*2*2*2*2*3*3*5*5*5*29 FFTW explores a huge search tree of all zillion factorizations of 1044000/29, failing every time because 29 is SLOW; then it finally allows SLOW solvers and finds a solution immediately. This patch proclaims solvers to be SLOW only for small values of N. For example, the "generic" solver implements an O(n^2) DFT algorithm; we say that it is SLOW only for N<=16. The side effects of this choice are as follows. If one modifies FFTW to include a fast solver of size 17, then planning for N=17*K will be slower than today, because FFTW till try both the fast solver and the generic solver (which is SLOW today and therefore not tried, but is no longer SLOW after the patch). If one removes a fast solver, of size say 13, then he may still fall into the current exponential-search behavior for "problems" of size 13*HIGHLY_FACTORIZABLE_N. If somebody had compleined about transforms of size 1044000 ten years ago, "don't do that" would have been an acceptable answer. I guess the bar is higher today, so I am going to include this patch in our 3.3.1 release despite their side-effects for people who want to modify FFTW.
esnyder · Sep 3, 2011 · f004d76 · f004d76
1 parent 610f797
commit f004d76
Show file tree

Hide file tree

Showing 7 changed files with 53 additions and 48 deletions.
diff --git a/dft/bluestein.c b/dft/bluestein.c
@@ -163,9 +163,19 @@ static int applicable(const solver *ego, const problem *p_,
 		      const planner *plnr)
 {
      UNUSED(ego);
-     if (NO_SLOWP(plnr)) return 0;
-     if (!applicable0(p_)) return 0;
-     return 1;
+     const problem_dft *p = (const problem_dft *) p_;
+     return (1
+	     && p->sz->rnk == 1
+	     && p->vecsz->rnk == 0
+	     /* FIXME: allow other sizes */
+	     && X(is_prime)(p->sz->dims[0].n)
+
+	     /* FIXME: avoid infinite recursion of bluestein with itself.
+		This works because all factors in child problems are 2, 3, 5 */
+	     && p->sz->dims[0].n > 16
+
+	     && CIMPLIES(NO_SLOWP(plnr), p->sz->dims[0].n > BLUESTEIN_MAX_SLOW)
+	  );
 }
 
 static void destroy(plan *ego_)
@@ -183,8 +193,7 @@ static void print(const plan *ego_, printer *p)
 
 static INT choose_transform_size(INT minsz)
 {
-     static const INT primes[] = { 2, 3, 5, 0 };
-     while (!X(factors_into)(minsz, primes))
+     while (!X(factors_into_small_primes)(minsz))
 	  ++minsz;
      return minsz;
 }

diff --git a/dft/generic.c b/dft/generic.c
@@ -109,31 +109,22 @@ static void print(const plan *ego_, printer *p)
      p->print(p, "(dft-generic-%D)", ego->n);
 }
 
-static int applicable0(const problem *p_)
+static int applicable(const solver *ego, const problem *p_, 
+		      const planner *plnr)
 {
+     UNUSED(ego);
      const problem_dft *p = (const problem_dft *) p_;
+
      return (1
 	     && p->sz->rnk == 1
 	     && p->vecsz->rnk == 0
 	     && (p->sz->dims[0].n % 2) == 1 
+	     && CIMPLIES(NO_LARGE_GENERICP(plnr), p->sz->dims[0].n < GENERIC_MIN_BAD)
+	     && CIMPLIES(NO_SLOWP(plnr), p->sz->dims[0].n > GENERIC_MAX_SLOW)
 	     && X(is_prime)(p->sz->dims[0].n)
 	  );
 }
 
-static int applicable(const solver *ego, const problem *p_, 
-		      const planner *plnr)
-{
-     UNUSED(ego);
-     if (NO_SLOWP(plnr)) return 0;
-     if (!applicable0(p_)) return 0;
-
-     if (NO_LARGE_GENERICP(plnr)) {
-          const problem_dft *p = (const problem_dft *) p_;
-	  if (p->sz->dims[0].n >= GENERIC_MIN_BAD) return 0; 
-     }
-     return 1;
-}
-
 static plan *mkplan(const solver *ego, const problem *p_, planner *plnr)
 {
      const problem_dft *p;

diff --git a/dft/rader.c b/dft/rader.c
@@ -210,21 +210,21 @@ static void print(const plan *ego_, printer *p)
      p->putchr(p, ')');
 }
 
-static int applicable0(const solver *ego_, const problem *p_)
+static int applicable(const solver *ego_, const problem *p_,
+		      const planner *plnr)
 {
      const problem_dft *p = (const problem_dft *) p_;
      UNUSED(ego_);
      return (1
 	     && p->sz->rnk == 1
 	     && p->vecsz->rnk == 0
+	     && CIMPLIES(NO_SLOWP(plnr), p->sz->dims[0].n > RADER_MAX_SLOW)
 	     && X(is_prime)(p->sz->dims[0].n)
-	  );
-}
 
-static int applicable(const solver *ego_, const problem *p_,
-		      const planner *plnr)
-{
-     return (!NO_SLOWP(plnr) && applicable0(ego_, p_));
+	     /* proclaim the solver SLOW if p-1 is not easily factorizable.
+		Bluestein should take care of this case. */
+	     && CIMPLIES(NO_SLOWP(plnr), X(factors_into_small_primes)(p->sz->dims[0].n - 1))
+	  );
 }
 
 static int mkP(P *pln, INT n, INT is, INT os, R *ro, R *io,

diff --git a/kernel/ifftw.h b/kernel/ifftw.h
@@ -94,6 +94,7 @@ extern void X(extract_reim)(int sign, R *c, R **r, R **i);
 
 #define STRINGIZEx(x) #x
 #define STRINGIZE(x) STRINGIZEx(x)
+#define CIMPLIES(ante, post) (!(ante) || (post))
 
 /* define HAVE_SIMD if any simd extensions are supported */
 #if defined(HAVE_SSE) || defined(HAVE_SSE2) || defined(HAVE_ALTIVEC) || \
@@ -923,12 +924,19 @@ INT X(first_divisor)(INT n);
 int X(is_prime)(INT n);
 INT X(next_prime)(INT n);
 int X(factors_into)(INT n, const INT *primes);
+int X(factors_into_small_primes)(INT n);
 INT X(choose_radix)(INT r, INT n);
 INT X(isqrt)(INT n);
 INT X(modulo)(INT a, INT n);
 
 #define GENERIC_MIN_BAD 173 /* min prime for which generic becomes bad */
 
+/* thresholds below which certain solvers are considered SLOW.  These are guesses
+   believed to be conservative */
+#define GENERIC_MAX_SLOW     16
+#define RADER_MAX_SLOW       32
+#define BLUESTEIN_MAX_SLOW   24
+
 /*-----------------------------------------------------------------------*/
 /* rader.c: */
 typedef struct rader_tls rader_tl;

diff --git a/kernel/primes.c b/kernel/primes.c
@@ -204,3 +204,9 @@ INT X(modulo)(INT a, INT n)
 	  return (n - 1) - ((-(a + (INT)1)) % n);
 }
 
+/* TRUE if N factors into small primes */
+int X(factors_into_small_primes)(INT n)
+{
+     static const INT primes[] = { 2, 3, 5, 0 };
+     return X(factors_into)(n, primes);
+}
diff --git a/rdft/dht-rader.c b/rdft/dht-rader.c
@@ -246,24 +246,25 @@ static void print(const plan *ego_, printer *p)
      p->putchr(p, ')');
 }
 
-static int applicable0(const problem *p_)
+static int applicable(const solver *ego, const problem *p_, const planner *plnr)
 {
+     UNUSED(ego);
      const problem_rdft *p = (const problem_rdft *) p_;
      return (1
 	     && p->sz->rnk == 1
 	     && p->vecsz->rnk == 0
 	     && p->kind[0] == DHT
 	     && X(is_prime)(p->sz->dims[0].n)
 	     && p->sz->dims[0].n > 2
+	     && CIMPLIES(NO_SLOWP(plnr), p->sz->dims[0].n > RADER_MAX_SLOW)
+	     /* proclaim the solver SLOW if p-1 is not easily
+		factorizable.  Unlike in the complex case where
+		Bluestein can solve the problem, in the DHT case we
+		may have no other choice */
+	     && CIMPLIES(NO_SLOWP(plnr), X(factors_into_small_primes)(p->sz->dims[0].n - 1))
 	  );
 }
 
-static int applicable(const solver *ego, const problem *p, const planner *plnr)
-{
-     UNUSED(ego);
-     return (!NO_SLOWP(plnr) && applicable0(p));
-}
-
 static INT choose_transform_size(INT minsz)
 {
      static const INT primes[] = { 2, 3, 5, 0 };

diff --git a/rdft/generic.c b/rdft/generic.c
@@ -168,31 +168,21 @@ static void print(const plan *ego_, printer *p)
 	      ego->n);
 }
 
-static int applicable0(const S *ego, const problem *p_)
+static int applicable(const S *ego, const problem *p_, 
+		      const planner *plnr)
 {
      const problem_rdft *p = (const problem_rdft *) p_;
      return (1
 	     && p->sz->rnk == 1
 	     && p->vecsz->rnk == 0
 	     && (p->sz->dims[0].n % 2) == 1 
+	     && CIMPLIES(NO_LARGE_GENERICP(plnr), p->sz->dims[0].n < GENERIC_MIN_BAD)
+	     && CIMPLIES(NO_SLOWP(plnr), p->sz->dims[0].n > GENERIC_MAX_SLOW)
 	     && X(is_prime)(p->sz->dims[0].n)
 	     && p->kind[0] == ego->kind
 	  );
 }
 
-static int applicable(const S *ego, const problem *p_, 
-		      const planner *plnr)
-{
-     if (NO_SLOWP(plnr)) return 0;
-     if (!applicable0(ego, p_)) return 0;
-
-     if (NO_LARGE_GENERICP(plnr)) {
-          const problem_rdft *p = (const problem_rdft *) p_;
-	  if (p->sz->dims[0].n >= GENERIC_MIN_BAD) return 0; 
-     }
-     return 1;
-}
-
 static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
 {
      const S *ego = (const S *)ego_;