eliminate the WITH_ALIGNED_STACK hack

This is 2011 and I have no system with incorrect stack alignment.
squidruge · Jun 24, 2011 · ba838fa · ba838fa
1 parent 7e32fb6
commit ba838fa
Show file tree

Hide file tree

Showing 24 changed files with 64 additions and 172 deletions.
diff --git a/TODO b/TODO
@@ -37,5 +37,3 @@ TODO before FFTW-$2\pi$:
   to allow SIMD to be used.
 
 * memoize triggen.
-
-* eliminate alignment hacks, which ought to be obsolete by now.
diff --git a/api/apiplan.c b/api/apiplan.c
@@ -23,7 +23,7 @@
 static plan *mkplan0(planner *plnr, unsigned flags, 
 		     const problem *prb, int hash_info, 
 		     wisdom_state_t wisdom_state)
-WITH_ALIGNED_STACK({
+{
      /* map API flags into FFTW flags */
      X(mapflags)(plnr, flags);
 
@@ -32,12 +32,7 @@ WITH_ALIGNED_STACK({
 
      /* create plan */
      return plnr->adt->mkplan(plnr, prb);
-})
-
-static void aligned_awake(plan *ego, enum wakefulness wakefulness)
-WITH_ALIGNED_STACK({
-     X(plan_awake)(ego, wakefulness);
-})
+}
 
 static unsigned force_estimator(unsigned flags)
 {
@@ -143,10 +138,10 @@ apiplan *X(mkapiplan)(int sign, unsigned flags, problem *prb)
 	  if (sizeof(trigreal) > sizeof(R)) {
 	       /* this is probably faster, and we have enough trigreal
 		  bits to maintain accuracy */
-	       aligned_awake(p->pln, AWAKE_SQRTN_TABLE);
+	       X(plan_awake)(p->pln, AWAKE_SQRTN_TABLE);
 	  } else {
 	       /* more accurate */
-	       aligned_awake(p->pln, AWAKE_SINCOS);
+	       X(plan_awake)(p->pln, AWAKE_SINCOS);
 	  }
 
 	  /* we don't use pln for p->pln, above, since by re-creating the

diff --git a/api/execute-dft-c2r.c b/api/execute-dft-c2r.c
@@ -23,8 +23,8 @@
 
 /* guru interface: requires care in alignment, r - i, etcetera. */
 void X(execute_dft_c2r)(const X(plan) p, C *in, R *out)
-WITH_ALIGNED_STACK({
+{
      plan_rdft2 *pln = (plan_rdft2 *) p->pln;
      problem_rdft2 *prb = (problem_rdft2 *) p->prb;
      pln->apply((plan *) pln, out, out + (prb->r1 - prb->r0), in[0], in[0]+1);
-})
+}
diff --git a/api/execute-dft-r2c.c b/api/execute-dft-r2c.c
@@ -23,8 +23,8 @@
 
 /* guru interface: requires care in alignment, r - i, etcetera. */
 void X(execute_dft_r2c)(const X(plan) p, R *in, C *out)
-WITH_ALIGNED_STACK({
+{
      plan_rdft2 *pln = (plan_rdft2 *) p->pln;
      problem_rdft2 *prb = (problem_rdft2 *) p->prb;
      pln->apply((plan *) pln, in, in + (prb->r1 - prb->r0), out[0], out[0]+1);
-})
+}
diff --git a/api/execute-dft.c b/api/execute-dft.c
@@ -23,10 +23,10 @@
 
 /* guru interface: requires care in alignment etcetera. */
 void X(execute_dft)(const X(plan) p, C *in, C *out)
-WITH_ALIGNED_STACK({
+{
      plan_dft *pln = (plan_dft *) p->pln;
      if (p->sign == FFT_SIGN)
 	  pln->apply((plan *) pln, in[0], in[0]+1, out[0], out[0]+1);
      else
 	  pln->apply((plan *) pln, in[0]+1, in[0], out[0]+1, out[0]);
-})
+}
diff --git a/api/execute-r2r.c b/api/execute-r2r.c
@@ -23,7 +23,7 @@
 
 /* guru interface: requires care in alignment, etcetera. */
 void X(execute_r2r)(const X(plan) p, R *in, R *out)
-WITH_ALIGNED_STACK({
+{
      plan_rdft *pln = (plan_rdft *) p->pln;
      pln->apply((plan *) pln, in, out);
-})
+}
diff --git a/api/execute-split-dft-c2r.c b/api/execute-split-dft-c2r.c
@@ -23,8 +23,8 @@
 
 /* guru interface: requires care in alignment, r - i, etcetera. */
 void X(execute_split_dft_c2r)(const X(plan) p, R *ri, R *ii, R *out)
-WITH_ALIGNED_STACK({
+{
      plan_rdft2 *pln = (plan_rdft2 *) p->pln;
      problem_rdft2 *prb = (problem_rdft2 *) p->prb;
      pln->apply((plan *) pln, out, out + (prb->r1 - prb->r0), ri, ii);
-})
+}
diff --git a/api/execute-split-dft-r2c.c b/api/execute-split-dft-r2c.c
@@ -23,8 +23,8 @@
 
 /* guru interface: requires care in alignment, r - i, etcetera. */
 void X(execute_split_dft_r2c)(const X(plan) p, R *in, R *ro, R *io)
-WITH_ALIGNED_STACK({
+{
      plan_rdft2 *pln = (plan_rdft2 *) p->pln;
      problem_rdft2 *prb = (problem_rdft2 *) p->prb;
      pln->apply((plan *) pln, in, in + (prb->r1 - prb->r0), ro, io);
-})
+}
diff --git a/api/execute-split-dft.c b/api/execute-split-dft.c
@@ -23,7 +23,7 @@
 
 /* guru interface: requires care in alignment, r - i, etcetera. */
 void X(execute_split_dft)(const X(plan) p, R *ri, R *ii, R *ro, R *io)
-WITH_ALIGNED_STACK({
+{
      plan_dft *pln = (plan_dft *) p->pln;
      pln->apply((plan *) pln, ri, ii, ro, io);
-})
+}
diff --git a/api/execute.c b/api/execute.c
@@ -21,7 +21,7 @@
 #include "api.h"
 
 void X(execute)(const X(plan) p)
-WITH_ALIGNED_STACK({
+{
      plan *pln = p->pln;
      pln->adt->solve(pln, p->prb);
-})
+}
diff --git a/api/f77funcs.h b/api/f77funcs.h
@@ -24,10 +24,10 @@
    compiler manglings (via redefinition of F77). */
 
 FFTW_VOIDFUNC F77(execute, EXECUTE)(X(plan) * const p)
-WITH_ALIGNED_STACK({
+{
      plan *pln = (*p)->pln;
      pln->adt->solve(pln, (*p)->prb);
-})
+}
 
 FFTW_VOIDFUNC F77(destroy_plan, DESTROY_PLAN)(X(plan) *p)
 {
@@ -167,20 +167,20 @@ FFTW_VOIDFUNC F77(plan_guru_split_dft, PLAN_GURU_SPLIT_DFT)(X(plan) *p, int *ran
 }
 
 FFTW_VOIDFUNC F77(execute_dft, EXECUTE_DFT)(X(plan) * const p, C *in, C *out)
-WITH_ALIGNED_STACK({
+{
      plan_dft *pln = (plan_dft *) (*p)->pln;
      if ((*p)->sign == FFT_SIGN)
           pln->apply((plan *) pln, in[0], in[0]+1, out[0], out[0]+1);
      else
           pln->apply((plan *) pln, in[0]+1, in[0], out[0]+1, out[0]);
-})
+}
 
 FFTW_VOIDFUNC F77(execute_split_dft, EXECUTE_SPLIT_DFT)(X(plan) * const p,
 					       R *ri, R *ii, R *ro, R *io)
-WITH_ALIGNED_STACK({
+{
      plan_dft *pln = (plan_dft *) (*p)->pln;
      pln->apply((plan *) pln, ri, ii, ro, io);
-})
+}
 
 /****************************** DFT r2c *********************************/
 
@@ -262,19 +262,19 @@ FFTW_VOIDFUNC F77(plan_guru_split_dft_r2c, PLAN_GURU_SPLIT_DFT_R2C)(
 }
 
 FFTW_VOIDFUNC F77(execute_dft_r2c, EXECUTE_DFT_R2C)(X(plan) * const p, R *in, C *out)
-WITH_ALIGNED_STACK({
+{
      plan_rdft2 *pln = (plan_rdft2 *) (*p)->pln;
      problem_rdft2 *prb = (problem_rdft2 *) (*p)->prb;
      pln->apply((plan *) pln, in, in + (prb->r1 - prb->r0), out[0], out[0]+1);
-})
+}
 
 FFTW_VOIDFUNC F77(execute_split_dft_r2c, EXECUTE_SPLIT_DFT_R2C)(X(plan) * const p,
 						       R *in, R *ro, R *io)
-WITH_ALIGNED_STACK({
+{
      plan_rdft2 *pln = (plan_rdft2 *) (*p)->pln;
      problem_rdft2 *prb = (problem_rdft2 *) (*p)->prb;
      pln->apply((plan *) pln, in, in + (prb->r1 - prb->r0), ro, io);
-})
+}
 
 /****************************** DFT c2r *********************************/
 
@@ -356,19 +356,19 @@ FFTW_VOIDFUNC F77(plan_guru_split_dft_c2r, PLAN_GURU_SPLIT_DFT_C2R)(
 }
 
 FFTW_VOIDFUNC F77(execute_dft_c2r, EXECUTE_DFT_C2R)(X(plan) * const p, C *in, R *out)
-WITH_ALIGNED_STACK({
+{
      plan_rdft2 *pln = (plan_rdft2 *) (*p)->pln;
      problem_rdft2 *prb = (problem_rdft2 *) (*p)->prb;
      pln->apply((plan *) pln, out, out + (prb->r1 - prb->r0), in[0], in[0]+1);
-})
+}
 
 FFTW_VOIDFUNC F77(execute_split_dft_c2r, EXECUTE_SPLIT_DFT_C2R)(X(plan) * const p,
 					   R *ri, R *ii, R *out)
-WITH_ALIGNED_STACK({
+{
      plan_rdft2 *pln = (plan_rdft2 *) (*p)->pln;
      problem_rdft2 *prb = (problem_rdft2 *) (*p)->prb;
      pln->apply((plan *) pln, out, out + (prb->r1 - prb->r0), ri, ii);
-})
+}
 
 /****************************** r2r *********************************/
 
@@ -447,7 +447,7 @@ FFTW_VOIDFUNC F77(plan_guru_r2r, PLAN_GURU_R2R)(
 }
 
 FFTW_VOIDFUNC F77(execute_r2r, EXECUTE_R2R)(X(plan) * const p, R *in, R *out)
-WITH_ALIGNED_STACK({
+{
      plan_rdft *pln = (plan_rdft *) (*p)->pln;
      pln->apply((plan *) pln, in, out);
-})
+}
diff --git a/doc/Makefile.am b/doc/Makefile.am
@@ -5,8 +5,8 @@ fftw3_TEXINFOS = acknowledgements.texi cindex.texi fftw3.texi findex.texi instal
 
 DVIPS = dvips -Pwww
 
-fftw3.dvi: rfftwnd.eps
-fftw3.pdf: rfftwnd.pdf
+fftw3.dvi:: rfftwnd.eps
+fftw3.pdf:: rfftwnd.pdf
 
 rfftwnd.eps: rfftwnd.fig
 	fig2dev -L eps -m .7 ${srcdir}/rfftwnd.fig rfftwnd.eps

diff --git a/doc/fftw3.texi b/doc/fftw3.texi
@@ -1,5 +1,5 @@
 \input texinfo    @c -*-texinfo-*-
-@c Update by C-x C-e on: (texinfo-multiple-files-update "fftw3.texi")
+@c Update by C-x C-e on: (texinfo-multiple-files-update "fftw3.texi" nil t)
 @setfilename fftw3.info
 @include version.texi
 @settitle FFTW @value{VERSION}

diff --git a/doc/intro.texi b/doc/intro.texi
@@ -147,9 +147,10 @@ references.
 
 The rest of this manual is organized as follows.  We first discuss the
 sequential (single-processor) implementation.  We start by describing
-the basic interface/features of FFTW in @ref{Tutorial}.  The following
-chapter discusses @ref{Other Important Topics}, including @ref{Data
-Alignment}, the storage scheme of multi-dimensional arrays
+the basic interface/features of FFTW in @ref{Tutorial}.  
+Next, @ref{Other Important Topics} discusses data alignment
+(@pxref{SIMD alignment and fftw_malloc}),
+the storage scheme of multi-dimensional arrays
 (@pxref{Multi-dimensional Array Format}), and FFTW's mechanism for
 storing plans on disk (@pxref{Words of Wisdom-Saving Plans}).  Next,
 @ref{FFTW Reference} provides comprehensive documentation of all

diff --git a/doc/legacy-fortran.texi b/doc/legacy-fortran.texi
@@ -85,7 +85,7 @@ Fortran?}.
 
 @item
 Legacy Fortran cannot use the @code{fftw_malloc} dynamic-allocation routine.
-If you want to exploit the SIMD FFTW (@pxref{Data Alignment}), you'll
+If you want to exploit the SIMD FFTW (@pxref{SIMD alignment and fftw_malloc}), you'll
 need to figure out some other way to ensure that your arrays are at
 least 16-byte aligned.
 

diff --git a/doc/other.texi b/doc/other.texi
@@ -1,30 +1,15 @@
 @node Other Important Topics, FFTW Reference, Tutorial, Top
 @chapter Other Important Topics
 @menu
-* Data Alignment::              
+* SIMD alignment and fftw_malloc::  
 * Multi-dimensional Array Format::  
 * Words of Wisdom-Saving Plans::  
 * Caveats in Using Wisdom::     
 @end menu
 
 @c ------------------------------------------------------------
-@node Data Alignment, Multi-dimensional Array Format, Other Important Topics, Other Important Topics
-@section Data Alignment
-@cindex alignment
-
-@menu
-* SIMD alignment and fftw_malloc::  
-* Stack alignment on x86::      
-@end menu
-
-In order to get the best performance from FFTW, one needs to be
-somewhat aware of two problems related to data alignment on x86
-(Pentia) architectures: alignment of allocated arrays (for use with
-SIMD acceleration), and alignment of the stack.
-
-@c =========>
-@node SIMD alignment and fftw_malloc, Stack alignment on x86, Data Alignment, Data Alignment
-@subsection SIMD alignment and fftw_malloc
+@node SIMD alignment and fftw_malloc, Multi-dimensional Array Format, Other Important Topics, Other Important Topics
+@section SIMD alignment and fftw_malloc
 
 SIMD, which stands for ``Single Instruction Multiple Data,'' is a set of
 special operations supported by some processors to perform a single
@@ -41,7 +26,6 @@ SIMD instructions on any of these systems.
 @cindex MIPS PS
 @cindex precision
 
-
 A program linking to an FFTW library compiled with SIMD support can
 obtain a nonnegligible speedup for most complex and r2c/c2r
 transforms.  In order to obtain this speedup, however, the arrays of
@@ -69,39 +53,8 @@ happens not to be properly aligned, FFTW will not use the SIMD
 extensions.
 @cindex C++
 
-@c =========>
-@node Stack alignment on x86,  , SIMD alignment and fftw_malloc, Data Alignment
-@subsection Stack alignment on x86
-
-On the Pentium and subsequent x86 processors, there is a substantial
-performance penalty if double-precision variables are not stored
-8-byte aligned; a factor of two or more is not unusual.
-Unfortunately, the stack (the place that local variables and
-subroutine arguments live) is not guaranteed by the Intel ABI to be
-8-byte aligned.
-
-Recent versions of @code{gcc} (as well as most other compilers, we are
-told, such as Intel's, Metrowerks', and Microsoft's) are able to keep
-the stack 8-byte aligned; @code{gcc} does this by default (see
-@code{-mpreferred-stack-boundary} in the @code{gcc} documentation).
-If you are not certain whether your compiler maintains stack alignment
-by default, it is a good idea to make sure.
-
-Unfortunately, @code{gcc} only @emph{preserves} the stack
-alignment---as a result, if the stack starts off misaligned, it will
-always be misaligned, with a disastrous effect on performance (in
-double precision).  To prevent this, FFTW includes hacks to align its
-own stack if necessary, so it should perform well even if you call it
-from a program with a misaligned stack.  Currently, our hacks support
-@code{gcc} and the Intel C compiler; if you use another compiler you
-are on your own.  Fortunately, recent versions of glibc (on GNU/Linux)
-provide a properly-aligned starting stack, but this was not the case
-with a number of older versions, and we are not certain of the
-situation on other operating systems.  Hopefully, as time goes by this
-will become less of a concern.
-
 @c ------------------------------------------------------------
-@node Multi-dimensional Array Format, Words of Wisdom-Saving Plans, Data Alignment, Other Important Topics
+@node Multi-dimensional Array Format, Words of Wisdom-Saving Plans, SIMD alignment and fftw_malloc, Other Important Topics
 @section Multi-dimensional Array Format
 
 This section describes the format in which multi-dimensional arrays

diff --git a/doc/reference.texi b/doc/reference.texi
@@ -143,7 +143,7 @@ void fftw_free(void *p);
 These are functions that behave identically to @code{malloc} and
 @code{free}, except that they guarantee that the returned pointer obeys
 any special alignment restrictions imposed by any algorithm in FFTW
-(e.g. for SIMD acceleration).  @xref{Data Alignment}.
+(e.g. for SIMD acceleration).  @xref{SIMD alignment and fftw_malloc}.
 @cindex alignment
 
 

diff --git a/doc/tutorial.texi b/doc/tutorial.texi
@@ -851,6 +851,11 @@ most useful.
 @node The Discrete Hartley Transform,  , Real even/odd DFTs (cosine/sine transforms), More DFTs of Real Data
 @subsection The Discrete Hartley Transform
 
+If you are planning to use the DHT because you've heard that it is
+``faster'' than the DFT (FFT), @strong{stop here}.  The DHT is not
+faster than the DFT.  That story is an old but enduring misconception
+that was debunked in 1987.
+
 The discrete Hartley transform (DHT) is an invertible linear transform
 closely related to the DFT.  In the DFT, one multiplies each input by
 @math{cos - i * sin} (a complex exponential), whereas in the DHT each
@@ -862,20 +867,11 @@ positive @code{n}) can be specified by an r2r kind of @code{FFTW_DHT}.
 @cindex discrete Hartley transform
 @cindex DHT
 
-
-If you are planning to use the DHT because you've heard that it is
-``faster'' than the DFT (FFT), @strong{stop here}.  That story is an old
-but enduring misconception that was debunked in 1987: a properly
-designed real-input FFT (such as FFTW's) has no more operations in
-general than an FHT.  Moreover, in FFTW, the DHT is ordinarily
-@emph{slower} than the DFT for composite sizes (see below).
-
 Like the DFT, in FFTW the DHT is unnormalized, so computing a DHT of
 size @code{n} followed by another DHT of the same size will result in
 the original array multiplied by @code{n}.
 @cindex normalization
 
-
 The DHT was originally proposed as a more efficient alternative to the
 DFT for real data, but it was subsequently shown that a specialized DFT
 (such as FFTW's r2hc or r2c transforms) could be just as fast.  In FFTW,
diff --git a/TODO b/TODO
@@ -37,5 +37,3 @@ TODO before FFTW-$2\pi$:
   to allow SIMD to be used.
 
 * memoize triggen.
-
-* eliminate alignment hacks, which ought to be obsolete by now.