Skip to content

Commit

Permalink
eliminate the WITH_ALIGNED_STACK hack
Browse files Browse the repository at this point in the history
This is 2011 and I have no system with incorrect stack alignment.
  • Loading branch information
matteo-frigo committed Jun 24, 2011
1 parent 7e32fb6 commit ba838fa
Show file tree
Hide file tree
Showing 24 changed files with 64 additions and 172 deletions.
2 changes: 0 additions & 2 deletions TODO
Original file line number Diff line number Diff line change
Expand Up @@ -37,5 +37,3 @@ TODO before FFTW-$2\pi$:
to allow SIMD to be used.

* memoize triggen.

* eliminate alignment hacks, which ought to be obsolete by now.
13 changes: 4 additions & 9 deletions api/apiplan.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
static plan *mkplan0(planner *plnr, unsigned flags,
const problem *prb, int hash_info,
wisdom_state_t wisdom_state)
WITH_ALIGNED_STACK({
{
/* map API flags into FFTW flags */
X(mapflags)(plnr, flags);

Expand All @@ -32,12 +32,7 @@ WITH_ALIGNED_STACK({

/* create plan */
return plnr->adt->mkplan(plnr, prb);
})

static void aligned_awake(plan *ego, enum wakefulness wakefulness)
WITH_ALIGNED_STACK({
X(plan_awake)(ego, wakefulness);
})
}

static unsigned force_estimator(unsigned flags)
{
Expand Down Expand Up @@ -143,10 +138,10 @@ apiplan *X(mkapiplan)(int sign, unsigned flags, problem *prb)
if (sizeof(trigreal) > sizeof(R)) {
/* this is probably faster, and we have enough trigreal
bits to maintain accuracy */
aligned_awake(p->pln, AWAKE_SQRTN_TABLE);
X(plan_awake)(p->pln, AWAKE_SQRTN_TABLE);
} else {
/* more accurate */
aligned_awake(p->pln, AWAKE_SINCOS);
X(plan_awake)(p->pln, AWAKE_SINCOS);
}

/* we don't use pln for p->pln, above, since by re-creating the
Expand Down
4 changes: 2 additions & 2 deletions api/execute-dft-c2r.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@

/* guru interface: requires care in alignment, r - i, etcetera. */
void X(execute_dft_c2r)(const X(plan) p, C *in, R *out)
WITH_ALIGNED_STACK({
{
plan_rdft2 *pln = (plan_rdft2 *) p->pln;
problem_rdft2 *prb = (problem_rdft2 *) p->prb;
pln->apply((plan *) pln, out, out + (prb->r1 - prb->r0), in[0], in[0]+1);
})
}
4 changes: 2 additions & 2 deletions api/execute-dft-r2c.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@

/* guru interface: requires care in alignment, r - i, etcetera. */
void X(execute_dft_r2c)(const X(plan) p, R *in, C *out)
WITH_ALIGNED_STACK({
{
plan_rdft2 *pln = (plan_rdft2 *) p->pln;
problem_rdft2 *prb = (problem_rdft2 *) p->prb;
pln->apply((plan *) pln, in, in + (prb->r1 - prb->r0), out[0], out[0]+1);
})
}
4 changes: 2 additions & 2 deletions api/execute-dft.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@

/* guru interface: requires care in alignment etcetera. */
void X(execute_dft)(const X(plan) p, C *in, C *out)
WITH_ALIGNED_STACK({
{
plan_dft *pln = (plan_dft *) p->pln;
if (p->sign == FFT_SIGN)
pln->apply((plan *) pln, in[0], in[0]+1, out[0], out[0]+1);
else
pln->apply((plan *) pln, in[0]+1, in[0], out[0]+1, out[0]);
})
}
4 changes: 2 additions & 2 deletions api/execute-r2r.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

/* guru interface: requires care in alignment, etcetera. */
void X(execute_r2r)(const X(plan) p, R *in, R *out)
WITH_ALIGNED_STACK({
{
plan_rdft *pln = (plan_rdft *) p->pln;
pln->apply((plan *) pln, in, out);
})
}
4 changes: 2 additions & 2 deletions api/execute-split-dft-c2r.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@

/* guru interface: requires care in alignment, r - i, etcetera. */
void X(execute_split_dft_c2r)(const X(plan) p, R *ri, R *ii, R *out)
WITH_ALIGNED_STACK({
{
plan_rdft2 *pln = (plan_rdft2 *) p->pln;
problem_rdft2 *prb = (problem_rdft2 *) p->prb;
pln->apply((plan *) pln, out, out + (prb->r1 - prb->r0), ri, ii);
})
}
4 changes: 2 additions & 2 deletions api/execute-split-dft-r2c.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@

/* guru interface: requires care in alignment, r - i, etcetera. */
void X(execute_split_dft_r2c)(const X(plan) p, R *in, R *ro, R *io)
WITH_ALIGNED_STACK({
{
plan_rdft2 *pln = (plan_rdft2 *) p->pln;
problem_rdft2 *prb = (problem_rdft2 *) p->prb;
pln->apply((plan *) pln, in, in + (prb->r1 - prb->r0), ro, io);
})
}
4 changes: 2 additions & 2 deletions api/execute-split-dft.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

/* guru interface: requires care in alignment, r - i, etcetera. */
void X(execute_split_dft)(const X(plan) p, R *ri, R *ii, R *ro, R *io)
WITH_ALIGNED_STACK({
{
plan_dft *pln = (plan_dft *) p->pln;
pln->apply((plan *) pln, ri, ii, ro, io);
})
}
4 changes: 2 additions & 2 deletions api/execute.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
#include "api.h"

void X(execute)(const X(plan) p)
WITH_ALIGNED_STACK({
{
plan *pln = p->pln;
pln->adt->solve(pln, p->prb);
})
}
32 changes: 16 additions & 16 deletions api/f77funcs.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,10 @@
compiler manglings (via redefinition of F77). */

FFTW_VOIDFUNC F77(execute, EXECUTE)(X(plan) * const p)
WITH_ALIGNED_STACK({
{
plan *pln = (*p)->pln;
pln->adt->solve(pln, (*p)->prb);
})
}

FFTW_VOIDFUNC F77(destroy_plan, DESTROY_PLAN)(X(plan) *p)
{
Expand Down Expand Up @@ -167,20 +167,20 @@ FFTW_VOIDFUNC F77(plan_guru_split_dft, PLAN_GURU_SPLIT_DFT)(X(plan) *p, int *ran
}

FFTW_VOIDFUNC F77(execute_dft, EXECUTE_DFT)(X(plan) * const p, C *in, C *out)
WITH_ALIGNED_STACK({
{
plan_dft *pln = (plan_dft *) (*p)->pln;
if ((*p)->sign == FFT_SIGN)
pln->apply((plan *) pln, in[0], in[0]+1, out[0], out[0]+1);
else
pln->apply((plan *) pln, in[0]+1, in[0], out[0]+1, out[0]);
})
}

FFTW_VOIDFUNC F77(execute_split_dft, EXECUTE_SPLIT_DFT)(X(plan) * const p,
R *ri, R *ii, R *ro, R *io)
WITH_ALIGNED_STACK({
{
plan_dft *pln = (plan_dft *) (*p)->pln;
pln->apply((plan *) pln, ri, ii, ro, io);
})
}

/****************************** DFT r2c *********************************/

Expand Down Expand Up @@ -262,19 +262,19 @@ FFTW_VOIDFUNC F77(plan_guru_split_dft_r2c, PLAN_GURU_SPLIT_DFT_R2C)(
}

FFTW_VOIDFUNC F77(execute_dft_r2c, EXECUTE_DFT_R2C)(X(plan) * const p, R *in, C *out)
WITH_ALIGNED_STACK({
{
plan_rdft2 *pln = (plan_rdft2 *) (*p)->pln;
problem_rdft2 *prb = (problem_rdft2 *) (*p)->prb;
pln->apply((plan *) pln, in, in + (prb->r1 - prb->r0), out[0], out[0]+1);
})
}

FFTW_VOIDFUNC F77(execute_split_dft_r2c, EXECUTE_SPLIT_DFT_R2C)(X(plan) * const p,
R *in, R *ro, R *io)
WITH_ALIGNED_STACK({
{
plan_rdft2 *pln = (plan_rdft2 *) (*p)->pln;
problem_rdft2 *prb = (problem_rdft2 *) (*p)->prb;
pln->apply((plan *) pln, in, in + (prb->r1 - prb->r0), ro, io);
})
}

/****************************** DFT c2r *********************************/

Expand Down Expand Up @@ -356,19 +356,19 @@ FFTW_VOIDFUNC F77(plan_guru_split_dft_c2r, PLAN_GURU_SPLIT_DFT_C2R)(
}

FFTW_VOIDFUNC F77(execute_dft_c2r, EXECUTE_DFT_C2R)(X(plan) * const p, C *in, R *out)
WITH_ALIGNED_STACK({
{
plan_rdft2 *pln = (plan_rdft2 *) (*p)->pln;
problem_rdft2 *prb = (problem_rdft2 *) (*p)->prb;
pln->apply((plan *) pln, out, out + (prb->r1 - prb->r0), in[0], in[0]+1);
})
}

FFTW_VOIDFUNC F77(execute_split_dft_c2r, EXECUTE_SPLIT_DFT_C2R)(X(plan) * const p,
R *ri, R *ii, R *out)
WITH_ALIGNED_STACK({
{
plan_rdft2 *pln = (plan_rdft2 *) (*p)->pln;
problem_rdft2 *prb = (problem_rdft2 *) (*p)->prb;
pln->apply((plan *) pln, out, out + (prb->r1 - prb->r0), ri, ii);
})
}

/****************************** r2r *********************************/

Expand Down Expand Up @@ -447,7 +447,7 @@ FFTW_VOIDFUNC F77(plan_guru_r2r, PLAN_GURU_R2R)(
}

FFTW_VOIDFUNC F77(execute_r2r, EXECUTE_R2R)(X(plan) * const p, R *in, R *out)
WITH_ALIGNED_STACK({
{
plan_rdft *pln = (plan_rdft *) (*p)->pln;
pln->apply((plan *) pln, in, out);
})
}
4 changes: 2 additions & 2 deletions doc/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ fftw3_TEXINFOS = acknowledgements.texi cindex.texi fftw3.texi findex.texi instal

DVIPS = dvips -Pwww

fftw3.dvi: rfftwnd.eps
fftw3.pdf: rfftwnd.pdf
fftw3.dvi:: rfftwnd.eps
fftw3.pdf:: rfftwnd.pdf

rfftwnd.eps: rfftwnd.fig
fig2dev -L eps -m .7 ${srcdir}/rfftwnd.fig rfftwnd.eps
Expand Down
2 changes: 1 addition & 1 deletion doc/fftw3.texi
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
\input texinfo @c -*-texinfo-*-
@c Update by C-x C-e on: (texinfo-multiple-files-update "fftw3.texi")
@c Update by C-x C-e on: (texinfo-multiple-files-update "fftw3.texi" nil t)
@setfilename fftw3.info
@include version.texi
@settitle FFTW @value{VERSION}
Expand Down
7 changes: 4 additions & 3 deletions doc/intro.texi
Original file line number Diff line number Diff line change
Expand Up @@ -147,9 +147,10 @@ references.

The rest of this manual is organized as follows. We first discuss the
sequential (single-processor) implementation. We start by describing
the basic interface/features of FFTW in @ref{Tutorial}. The following
chapter discusses @ref{Other Important Topics}, including @ref{Data
Alignment}, the storage scheme of multi-dimensional arrays
the basic interface/features of FFTW in @ref{Tutorial}.
After that, @ref{Other Important Topics} discusses data alignment
(@pxref{SIMD alignment and fftw_malloc}),
the storage scheme of multi-dimensional arrays
(@pxref{Multi-dimensional Array Format}), and FFTW's mechanism for
storing plans on disk (@pxref{Words of Wisdom-Saving Plans}). Next,
@ref{FFTW Reference} provides comprehensive documentation of all
Expand Down
2 changes: 1 addition & 1 deletion doc/legacy-fortran.texi
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ Fortran?}.

@item
Legacy Fortran cannot use the @code{fftw_malloc} dynamic-allocation routine.
If you want to exploit the SIMD FFTW (@pxref{Data Alignment}), you'll
If you want to exploit the SIMD FFTW (@pxref{SIMD alignment and fftw_malloc}), you'll
need to figure out some other way to ensure that your arrays are at
least 16-byte aligned.

Expand Down
55 changes: 4 additions & 51 deletions doc/other.texi
Original file line number Diff line number Diff line change
@@ -1,30 +1,15 @@
@node Other Important Topics, FFTW Reference, Tutorial, Top
@chapter Other Important Topics
@menu
* Data Alignment::
* SIMD alignment and fftw_malloc::
* Multi-dimensional Array Format::
* Words of Wisdom-Saving Plans::
* Caveats in Using Wisdom::
@end menu

@c ------------------------------------------------------------
@node Data Alignment, Multi-dimensional Array Format, Other Important Topics, Other Important Topics
@section Data Alignment
@cindex alignment

@menu
* SIMD alignment and fftw_malloc::
* Stack alignment on x86::
@end menu

In order to get the best performance from FFTW, one needs to be
somewhat aware of two problems related to data alignment on x86
(Pentia) architectures: alignment of allocated arrays (for use with
SIMD acceleration), and alignment of the stack.

@c =========>
@node SIMD alignment and fftw_malloc, Stack alignment on x86, Data Alignment, Data Alignment
@subsection SIMD alignment and fftw_malloc
@node SIMD alignment and fftw_malloc, Multi-dimensional Array Format, Other Important Topics, Other Important Topics
@section SIMD alignment and fftw_malloc

SIMD, which stands for ``Single Instruction Multiple Data,'' is a set of
special operations supported by some processors to perform a single
Expand All @@ -41,7 +26,6 @@ SIMD instructions on any of these systems.
@cindex MIPS PS
@cindex precision


A program linking to an FFTW library compiled with SIMD support can
obtain a nonnegligible speedup for most complex and r2c/c2r
transforms. In order to obtain this speedup, however, the arrays of
Expand Down Expand Up @@ -69,39 +53,8 @@ happens not to be properly aligned, FFTW will not use the SIMD
extensions.
@cindex C++

@c =========>
@node Stack alignment on x86, , SIMD alignment and fftw_malloc, Data Alignment
@subsection Stack alignment on x86

On the Pentium and subsequent x86 processors, there is a substantial
performance penalty if double-precision variables are not stored
8-byte aligned; a factor of two or more is not unusual.
Unfortunately, the stack (the place that local variables and
subroutine arguments live) is not guaranteed by the Intel ABI to be
8-byte aligned.

Recent versions of @code{gcc} (as well as most other compilers, we are
told, such as Intel's, Metrowerks', and Microsoft's) are able to keep
the stack 8-byte aligned; @code{gcc} does this by default (see
@code{-mpreferred-stack-boundary} in the @code{gcc} documentation).
If you are not certain whether your compiler maintains stack alignment
by default, it is a good idea to make sure.

Unfortunately, @code{gcc} only @emph{preserves} the stack
alignment---as a result, if the stack starts off misaligned, it will
always be misaligned, with a disastrous effect on performance (in
double precision). To prevent this, FFTW includes hacks to align its
own stack if necessary, so it should perform well even if you call it
from a program with a misaligned stack. Currently, our hacks support
@code{gcc} and the Intel C compiler; if you use another compiler you
are on your own. Fortunately, recent versions of glibc (on GNU/Linux)
provide a properly-aligned starting stack, but this was not the case
with a number of older versions, and we are not certain of the
situation on other operating systems. Hopefully, as time goes by this
will become less of a concern.

@c ------------------------------------------------------------
@node Multi-dimensional Array Format, Words of Wisdom-Saving Plans, Data Alignment, Other Important Topics
@node Multi-dimensional Array Format, Words of Wisdom-Saving Plans, SIMD alignment and fftw_malloc, Other Important Topics
@section Multi-dimensional Array Format

This section describes the format in which multi-dimensional arrays
Expand Down
2 changes: 1 addition & 1 deletion doc/reference.texi
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ void fftw_free(void *p);
These are functions that behave identically to @code{malloc} and
@code{free}, except that they guarantee that the returned pointer obeys
any special alignment restrictions imposed by any algorithm in FFTW
(e.g. for SIMD acceleration). @xref{Data Alignment}.
(e.g. for SIMD acceleration). @xref{SIMD alignment and fftw_malloc}.
@cindex alignment


Expand Down
14 changes: 5 additions & 9 deletions doc/tutorial.texi
Original file line number Diff line number Diff line change
Expand Up @@ -851,6 +851,11 @@ most useful.
@node The Discrete Hartley Transform, , Real even/odd DFTs (cosine/sine transforms), More DFTs of Real Data
@subsection The Discrete Hartley Transform

If you are planning to use the DHT because you've heard that it is
``faster'' than the DFT (FFT), @strong{stop here}. The DHT is not
faster than the DFT. That story is an old but enduring misconception
that was debunked in 1987.

The discrete Hartley transform (DHT) is an invertible linear transform
closely related to the DFT. In the DFT, one multiplies each input by
@math{cos - i * sin} (a complex exponential), whereas in the DHT each
Expand All @@ -862,20 +867,11 @@ positive @code{n}) can be specified by an r2r kind of @code{FFTW_DHT}.
@cindex discrete Hartley transform
@cindex DHT


If you are planning to use the DHT because you've heard that it is
``faster'' than the DFT (FFT), @strong{stop here}. That story is an old
but enduring misconception that was debunked in 1987: a properly
designed real-input FFT (such as FFTW's) has no more operations in
general than an FHT. Moreover, in FFTW, the DHT is ordinarily
@emph{slower} than the DFT for composite sizes (see below).

Like the DFT, in FFTW the DHT is unnormalized, so computing a DHT of
size @code{n} followed by another DHT of the same size will result in
the original array multiplied by @code{n}.
@cindex normalization


The DHT was originally proposed as a more efficient alternative to the
DFT for real data, but it was subsequently shown that a specialized DFT
(such as FFTW's r2hc or r2c transforms) could be just as fast. In FFTW,
Expand Down
Loading

0 comments on commit ba838fa

Please sign in to comment.