From de904f19b230a114ead0b9580646689ab8519a29 Mon Sep 17 00:00:00 2001 From: stevenj Date: Tue, 19 Dec 2006 17:07:04 -0500 Subject: [PATCH] added memcpy-loop rank0 solver (it makes a 5-20% difference for transposes of large tuples) [empty commit message] --- rdft/rank0.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/rdft/rank0.c b/rdft/rank0.c index 9ad878d86..375dbaf30 100644 --- a/rdft/rank0.c +++ b/rdft/rank0.c @@ -209,6 +209,37 @@ static int applicable_memcpy(const P *pln, const problem_rdft *p) ); } +/**************************************************************/ +/* rank > 0 vecloop, out of place, using memcpy (e.g. out-of-place transposes of vl-tuples; for large vl it should be more efficient to use memcpy than the tiled stuff). + memcpy_loop walks the rnk nested loops described by d[0..rnk-1], outermost first, advancing I and O by each dimension's is and os (pointer steps in units of R), and copies cpysz bytes at every innermost step; it needs rnk >= 1, because rnk == 1 is the only base case of the recursion. + apply_memcpy_loop passes vl * sizeof(R) as the per-step byte count; applicable_memcpy_loop accepts only problems with p->I != p->O, rnk > 0 and vl > 2. NOTE(review): cpysz is declared INT but is computed from sizeof and handed to memcpy, which takes a size_t -- confirm INT cannot truncate or go negative here. */ + +static void memcpy_loop(INT cpysz, int rnk, const iodim *d, R *I, R *O) +{ + INT i, n = d->n, is = d->is, os = d->os; + if (rnk == 1) + for (i = 0; i < n; ++i, I += is, O += os) + memcpy(O, I, cpysz); + else { + --rnk; ++d; + for (i = 0; i < n; ++i, I += is, O += os) + memcpy_loop(cpysz, rnk, d, I, O); + } +} + +static void apply_memcpy_loop(const plan *ego_, R *I, R *O) +{ + const P *ego = (const P *) ego_; + memcpy_loop(ego->vl * sizeof(R), ego->rnk, ego->d, I, O); +} + +static int applicable_memcpy_loop(const P *pln, const problem_rdft *p) +{ + return (p->I != p->O + && pln->rnk > 0 + && pln->vl > 2 /* do not bother memcpy-ing complex numbers: anything with vl <= 2 is rejected here */); +} + /**************************************************************/ /* rank 2, in place, square transpose, iterative */ static void apply_ip_sq(const plan *ego_, R *I, R *O) @@ -319,6 +350,8 @@ void X(rdft_rank0_register)(planner *p) const char *nam; } tab[] = { { apply_memcpy, applicable_memcpy, "rdft-rank0-memcpy" }, + { apply_memcpy_loop, applicable_memcpy_loop, + "rdft-rank0-memcpy-loop" }, { apply_iter, applicable_iter, "rdft-rank0-iter-ci" }, { apply_cpy2dco, applicable_cpy2dco, "rdft-rank0-iter-co" }, { apply_tiled, 
applicable_tiled, "rdft-rank0-tiled" },