From 0feef43e989d8e8222704c8fbc0f383eabb4e94a Mon Sep 17 00:00:00 2001 From: John Bowman Date: Sat, 27 Aug 2022 00:32:17 -0600 Subject: [PATCH] Implement running median in statistics.h; use median time for optimization; optionally normalize within mult routine. --- statistics.h | 38 ++++++++++++++++++++++++++++++++++++-- tests/cconv.cc | 4 ++-- tests/conv.cc | 2 +- tests/convolve.cc | 27 ++++++++++++++------------- tests/convolve.h | 8 ++++---- tests/explicit.cc | 27 ++++++++++++++++++--------- tests/explicit.h | 6 +++++- tests/hybridconv.cc | 18 +++++++++++++++++- tests/hybridconvh.cc | 18 +++++++++++++++++- tests/timing.h | 21 +++++++++------------ tests/timing.py | 2 +- 11 files changed, 124 insertions(+), 47 deletions(-) diff --git a/statistics.h b/statistics.h index 4eed7f79..b8d00735 100644 --- a/statistics.h +++ b/statistics.h @@ -2,6 +2,7 @@ #define __statistics_h__ 1 #include +#include namespace utils { @@ -12,9 +13,11 @@ class statistics { double varH; double m; // min double M; // max + std::list T; // Only used if constructed with median=true + bool Median; public: - statistics() {clear();} - void clear() {N=0; A=varL=varH=0.0; m=DBL_MAX; M=-m;} + statistics(bool median=false) : Median(median) {clear();} + void clear() {N=0; A=varL=varH=0.0; m=DBL_MAX; M=-m; T.clear();} double count() {return N;} double mean() {return A;} double min() {return m;} @@ -30,6 +33,27 @@ class statistics { varL += v; else varH += v; + + if(Median) { + auto p=T.begin(); + if(N == 1 || t <= *p) T.push_front(t); + else { + size_t l=0; + size_t u=N-1; + ssize_t last=0; + while(l < u) { + ssize_t i=(l+u)/2; + std::advance(p,i-last); + last=i; + if(t < *p) u=i; + else { + ++p; + if(p == T.end() || t <= *p) {T.insert(p,t); break;} + last=l=i+1; + } + } + } + } } double stdev(double var, double f) { double factor=N > f ? f/(N-f) : 0.0; @@ -44,6 +68,16 @@ class statistics { double stdevH() { return stdev(varH,2.0); } + double median() { + if(!Median) { + std::cerr << "Constructor requires median=true" << std::endl; + exit(-1); + } + unsigned int h=N/2; + auto p=T.begin(); + std::advance(p,h-1); + return 2*h == N ? 0.5*(*p+*(++p)) : *(++p); + } void output(const char *text, unsigned int m) { std::cout << text << ":\n" << m << "\t" diff --git a/tests/cconv.cc b/tests/cconv.cc index fa34ebb0..4e06f82e 100644 --- a/tests/cconv.cc +++ b/tests/cconv.cc @@ -219,7 +219,7 @@ int main(int argc, char* argv[]) timings("Implicit",m,T,N,stats); - if(m < 100) { + if(m < 100 && false) { for(unsigned int b=0; b < B; ++b) { for(unsigned int i=0; i < m; i++) cout << F[b][i] << endl; @@ -252,7 +252,7 @@ int main(int argc, char* argv[]) cout << endl; timings("Explicit",m,T,N,stats); - if(m < 100) + if(m < 100 && false) for(unsigned int i=0; i < m; i++) cout << F[0][i] << endl; else cout << F[0][0] << endl; diff --git a/tests/conv.cc b/tests/conv.cc index 3b01d3dc..14839938 100644 --- a/tests/conv.cc +++ b/tests/conv.cc @@ -279,7 +279,7 @@ int main(int argc, char* argv[]) cout << endl; timings("Explicit",2*m-1,T,N,stats); - if(m < 100 && false) + if(m < 100) for(unsigned int i=0; i < m; i++) cout << f[i] << endl; else diff --git a/tests/convolve.cc b/tests/convolve.cc index f1814d7f..0b4505e4 100644 --- a/tests/convolve.cc +++ b/tests/convolve.cc @@ -227,6 +227,7 @@ void fftBase::OptBase::check(unsigned int L, unsigned int M, if(q == 1 || valid(D,p,S)) { // cout << "D=" << D << ", m=" << m << endl; double t=time(L,M,C,S,m,q,D,app); + // cout << p << " " << q << ": " << t << endl; if(t < T) { this->m=m; this->q=q; @@ -276,28 +277,28 @@ fftBase::~fftBase() deleteAlign(ZetaqmS0); } -double fftBase::mintime(Application& app, double *Stdev) +double fftBase::medianTime(Application& app, double *Stdev) { unsigned int K=1; double eps=0.1; - statistics Stats; + statistics Stats(true); app.init(*this); + static ratio r=chrono::steady_clock::period(); + double threshold=5000*r.num/r.den; while(true) { - Stats.add(app.time(*this,K)); - double mean=Stats.mean(); - double min=Stats.min(); + double t=app.time(*this,K)/K; + Stats.add(t); double stdev=Stats.stdev(); if(Stats.count() < 7) continue; - int threshold=5000; - ratio r=chrono::steady_clock::period(); - if(min*r.den < threshold*r.num || eps*mean < stdev) { + double value=Stats.median(); + if(value < threshold || eps*value < stdev) { K *= 2; Stats.clear(); } else { - if(Stdev) *Stdev=stdev/K; + if(Stdev) *Stdev=stdev; app.clear(); - return min/K; + return value; } } return 0.0; @@ -308,11 +309,11 @@ double fftBase::report(Application& app) double stdev; cout << endl; - double min=mintime(app,&stdev); + double median=medianTime(app,&stdev); - cout << "min=" << min << " stdev=" << stdev << endl; + cout << "median=" << median << " stdev=" << stdev << endl; - return min; + return median; } void fftBase::common() diff --git a/tests/convolve.h b/tests/convolve.h index 3a611eb5..a8bcbe65 100644 --- a/tests/convolve.h +++ b/tests/convolve.h @@ -387,7 +387,7 @@ class fftBase : public ThreadBase { return overwrite && A >= B; } - double mintime(Application& app, double *Stdev=NULL); + double medianTime(Application& app, double *Stdev=NULL); double report(Application& app); }; @@ -412,7 +412,7 @@ class fftPad : public fftBase { unsigned int m, unsigned int q,unsigned int D, Application &app) { fftPad fft(L,M,C,S,m,q,D,app.Threads(),false); - return fft.mintime(app); + return fft.medianTime(app); } }; @@ -512,7 +512,7 @@ class fftPadCentered : public fftPad { unsigned int m, unsigned int q, unsigned int D, Application &app) { fftPadCentered fft(L,M,C,S,m,q,D,app.Threads()); - return fft.mintime(app); + return fft.medianTime(app); } }; @@ -617,7 +617,7 @@ class fftPadHermitian : public fftBase { unsigned int m, unsigned int q, unsigned int D, Application &app) { fftPadHermitian fft(L,M,C,m,q,D,app.Threads()); - return fft.mintime(app); + return fft.medianTime(app); } }; diff --git a/tests/explicit.cc b/tests/explicit.cc index a8acc166..52a97591 100644 --- a/tests/explicit.cc +++ b/tests/explicit.cc @@ -4,6 +4,22 @@ namespace fftwpp { +// This multiplication routine is for binary convolutions and takes two inputs +// of size m. +// F[0][j] *= F[1][j]; +void multbinary(Complex **F, unsigned int m, unsigned int threads) +{ + Complex* F0=F[0]; + Complex* F1=F[1]; + + double ninv=1.0/m; + + PARALLEL( + for(unsigned int j=0; j < m; ++j) + F0[j] *= ninv*F1[j]; + ); +} + // This multiplication routine is for binary Hermitian convolutions and takes // two inputs. // F[0][j] *= F[1][j]; @@ -30,7 +46,7 @@ void ExplicitConvolution::forwards(Complex *f) Forwards->fft(f); } -void ExplicitConvolution::convolve(Complex **F, multiplier *mult) +void ExplicitConvolution::convolve(Complex **F, Multiplier *mult) { const unsigned int A=2; const unsigned int B=1; @@ -40,17 +56,10 @@ void ExplicitConvolution::convolve(Complex **F, multiplier *mult) backwards(F[a]); } - (*mult)(F,n,0,NULL,0,threads); + (*mult)(F,n,threads); for(unsigned int b=0; b < B; ++b) forwards(F[b]); - - double ninv=1.0/n; - for(unsigned int b=0; b < B; ++b) { - Complex *f=F[b]; - for(unsigned int j=0; j < m; ++j) - f[j] *= ninv; - } } void ExplicitConvolution::convolve(Complex *f, Complex *g) diff --git a/tests/explicit.h b/tests/explicit.h index bc38bfe2..84c486df 100644 --- a/tests/explicit.h +++ b/tests/explicit.h @@ -5,9 +5,13 @@ namespace fftwpp { +// Specialized mult routines for testing explicit convolutions: +typedef void Multiplier(Complex **, unsigned int m, + unsigned int threads); typedef void Realmultiplier(double **, unsigned int m, unsigned int threads); +Multiplier multbinary; Realmultiplier multbinary; class ExplicitPad : public ThreadBase { @@ -222,7 +226,7 @@ class ExplicitConvolution : public ExplicitPad { void forwards(Complex *f); // F is an array of pointers to distinct data blocks each of size n. - void convolve(Complex **F, multiplier *mult); + void convolve(Complex **F, Multiplier *mult); // Compute f (*) g. The distinct input arrays f and g are each of size n // (contents not preserved). The output is returned in f. diff --git a/tests/hybridconv.cc b/tests/hybridconv.cc index 56fab9d0..6bf4a911 100644 --- a/tests/hybridconv.cc +++ b/tests/hybridconv.cc @@ -14,6 +14,22 @@ unsigned int B=1; // number of outputs unsigned int L=512; // input data length unsigned int M=1024; // minimum padded length +// This multiplication routine is for binary convolutions and takes +// two Complex inputs of size n and outputs one Complex value. +// F0[j] *= F1[j]; +void multbinaryNormalized(Complex **F, unsigned int offset, unsigned int n, + unsigned int threads) +{ + Complex *F0=F[0]+offset; + Complex *F1=F[1]+offset; + + double ninv=1.0/n; + PARALLEL( + for(unsigned int j=0; j < n; ++j) + F0[j] *= ninv*F1[j]; + ); +} + int main(int argc, char* argv[]) { fftw::maxthreads=get_max_threads(); @@ -62,7 +78,7 @@ int main(int argc, char* argv[]) #endif for(unsigned int k=0; k < K; ++k) { seconds(); - Convolve.convolve(f,multbinary); + Convolve.convolveRaw(f,multbinaryNormalized); T[k]=seconds(); } diff --git a/tests/hybridconvh.cc b/tests/hybridconvh.cc index 823b9dc5..3db34a4b 100644 --- a/tests/hybridconvh.cc +++ b/tests/hybridconvh.cc @@ -13,6 +13,22 @@ unsigned int B=1; // number of outputs unsigned int L=512; // input data length unsigned int M=768; // minimum padded length +// This multiplication routine is for binary convolutions and takes +// two real inputs of size n. +// F0[j] *= F1[j]; +void realmultbinaryNormalized(Complex **F, unsigned int offset, unsigned int n, + unsigned int threads) +{ + double *F0=(double *) (F[0]+offset); + double *F1=(double *) (F[1]+offset); + + double ninv=1.0/n; + PARALLEL( + for(unsigned int j=0; j < n; ++j) + F0[j] *= ninv*F1[j]; + ); +} + int main(int argc, char* argv[]) { fftw::maxthreads=get_max_threads(); @@ -62,7 +78,7 @@ int main(int argc, char* argv[]) #endif for(unsigned int k=0; k < K; ++k) { seconds(); - Convolve.convolve(f,realmultbinary); + Convolve.convolveRaw(f,realmultbinaryNormalized); T[k]=seconds(); } diff --git a/tests/timing.h b/tests/timing.h index bbf60de4..8a1d7a6a 100644 --- a/tests/timing.h +++ b/tests/timing.h @@ -9,6 +9,13 @@ enum timing_algorithm {WRITETOFILE = -1, MEAN, MIN, MAX, MEDIAN, P90, P80, P50}; +inline double median(double *T, unsigned int N) // TODO: Replace with selection algorithm +{ + std::sort(T,T+N); + unsigned int h=N/2; + return 2*h == N ? 0.5*(T[h-1]+T[h]) : T[h]; +} + inline double value(double *T, unsigned int N, int algorithm) { switch(algorithm) { @@ -18,7 +25,6 @@ inline double value(double *T, unsigned int N, int algorithm) for(unsigned int i=0; i < N; ++i) sum += T[i]; return sum/N; - break; } case MIN: { double min=T[0]; @@ -27,7 +33,6 @@ inline double value(double *T, unsigned int N, int algorithm) min=T[i]; } return min; - break; } case MAX: { double max=T[0]; @@ -36,14 +41,9 @@ inline double value(double *T, unsigned int N, int algorithm) max=T[i]; } return max; - break; - } - case MEDIAN: { // TODO: Replace with selection algorithm - std::sort(T,T+N); - unsigned int h=N/2; - return 2*h == N ? (T[h-1]+T[h])/2 : T[h]; - break; } + case MEDIAN: + return median(T,N); case P90: { std::sort(T,T+N); unsigned int start=(int)ceil(N*0.05); @@ -53,7 +53,6 @@ inline double value(double *T, unsigned int N, int algorithm) for(unsigned int i=start; i < stop; ++i) sum += T[i]; return sum/(stop-start); - break; } case P80: { std::sort(T,T+N); @@ -64,7 +63,6 @@ inline double value(double *T, unsigned int N, int algorithm) for(unsigned int i=start; i < stop; ++i) sum += T[i]; return sum/(stop-start); - break; } case P50: { std::sort(T,T+N); @@ -75,7 +73,6 @@ inline double value(double *T, unsigned int N, int algorithm) for(unsigned int i=start; i < stop; ++i) sum += T[i]; return sum/(stop-start); - break; } default: std::cout << "Error: invalid algorithm choice: " diff --git a/tests/timing.py b/tests/timing.py index 6ccb4858..bfa85ebd 100755 --- a/tests/timing.py +++ b/tests/timing.py @@ -481,7 +481,7 @@ def main(argv): if dothism: if(hybrid): - mcmd=cmd+["-L"+str(L)]+["-M"+str(M)] #+["-I0"] + mcmd=cmd+["-L"+str(L)]+["-M"+str(M)] #+["-I1"] else: mcmd = cmd + ["-m" + str(m)]