From 0feef43e989d8e8222704c8fbc0f383eabb4e94a Mon Sep 17 00:00:00 2001
From: John Bowman <bowman@ualberta.ca>
Date: Sat, 27 Aug 2022 00:32:17 -0600
Subject: [PATCH] Implement running median in statistics.h; use median time for
 optimization; optionally normalize within mult routine.

---
 statistics.h         | 38 ++++++++++++++++++++++++++++++++++++--
 tests/cconv.cc       |  4 ++--
 tests/conv.cc        |  2 +-
 tests/convolve.cc    | 27 ++++++++++++++-------------
 tests/convolve.h     |  8 ++++----
 tests/explicit.cc    | 27 ++++++++++++++++++---------
 tests/explicit.h     |  6 +++++-
 tests/hybridconv.cc  | 18 +++++++++++++++++-
 tests/hybridconvh.cc | 18 +++++++++++++++++-
 tests/timing.h       | 21 +++++++++------------
 tests/timing.py      |  2 +-
 11 files changed, 124 insertions(+), 47 deletions(-)
diff --git a/statistics.h b/statistics.h
index 4eed7f79..b8d00735 100644
--- a/statistics.h
+++ b/statistics.h
@@ -2,6 +2,7 @@
 #define __statistics_h__ 1
 
 #include <cfloat>
+#include <list>
 
 namespace utils {
 
@@ -12,9 +13,11 @@ class statistics {
   double varH;
   double m; // min
   double M; // max
+  std::list<double> T; // Only used if constructed with median=true
+  bool Median;
 public:
-  statistics() {clear();}
-  void clear() {N=0; A=varL=varH=0.0; m=DBL_MAX; M=-m;}
+  statistics(bool median=false) : Median(median) {clear();}
+  void clear() {N=0; A=varL=varH=0.0; m=DBL_MAX; M=-m; T.clear();}
   double count() {return N;}
   double mean() {return A;}
   double min() {return m;}
@@ -30,6 +33,27 @@ class statistics {
       varL += v;
     else
       varH += v;
+
+    if(Median) {
+      auto p=T.begin();
+      if(N == 1 || t <= *p) T.push_front(t);
+      else {
+        size_t l=0;
+        size_t u=N-1;
+        ssize_t last=0;
+        while(l < u) {
+          ssize_t i=(l+u)/2;
+          std::advance(p,i-last);
+          last=i;
+          if(t < *p) u=i;
+          else {
+            ++p;
+            if(p == T.end() || t <= *p) {T.insert(p,t); break;}
+            last=l=i+1;
+          }
+        }
+      }
+    }
   }
   double stdev(double var, double f) {
     double factor=N > f ? f/(N-f) : 0.0;
@@ -44,6 +68,16 @@ class statistics {
   double stdevH() {
     return stdev(varH,2.0);
   }
+  double median() {
+    if(!Median) {
+      std::cerr << "Constructor requires median=true" << std::endl;
+      exit(-1);
+    }
+    unsigned int h=N/2;
+    auto p=T.begin();
+    std::advance(p,h-1);
+    return 2*h == N ? 0.5*(*p+*(++p)) : *(++p);
+  }
   void output(const char *text, unsigned int m) {
     std::cout << text << ":\n"
               << m << "\t"
diff --git a/tests/cconv.cc b/tests/cconv.cc
index fa34ebb0..4e06f82e 100644
--- a/tests/cconv.cc
+++ b/tests/cconv.cc
@@ -219,7 +219,7 @@ int main(int argc, char* argv[])
 
     timings("Implicit",m,T,N,stats);
 
-    if(m < 100) {
+    if(m < 100 && false) {
       for(unsigned int b=0; b < B; ++b) {
         for(unsigned int i=0; i < m; i++)
           cout << F[b][i] << endl;
@@ -252,7 +252,7 @@ int main(int argc, char* argv[])
     cout << endl;
     timings("Explicit",m,T,N,stats);
 
-    if(m < 100)
+    if(m < 100 && false)
       for(unsigned int i=0; i < m; i++)
         cout << F[0][i] << endl;
     else cout << F[0][0] << endl;
diff --git a/tests/conv.cc b/tests/conv.cc
index 3b01d3dc..14839938 100644
--- a/tests/conv.cc
+++ b/tests/conv.cc
@@ -279,7 +279,7 @@ int main(int argc, char* argv[])
     cout << endl;
     timings("Explicit",2*m-1,T,N,stats);
 
-    if(m < 100 && false)
+    if(m < 100)
       for(unsigned int i=0; i < m; i++)
         cout << f[i] << endl;
     else
diff --git a/tests/convolve.cc b/tests/convolve.cc
index f1814d7f..0b4505e4 100644
--- a/tests/convolve.cc
+++ b/tests/convolve.cc
@@ -227,6 +227,7 @@ void fftBase::OptBase::check(unsigned int L, unsigned int M,
   if(q == 1 || valid(D,p,S)) {
     // cout << "D=" << D << ", m=" << m << endl;
     double t=time(L,M,C,S,m,q,D,app);
+    // cout << p << " " << q << ": " << t << endl;
     if(t < T) {
       this->m=m;
       this->q=q;
@@ -276,28 +277,28 @@ fftBase::~fftBase()
     deleteAlign(ZetaqmS0);
 }
 
-double fftBase::mintime(Application& app, double *Stdev)
+double fftBase::medianTime(Application& app, double *Stdev)
 {
   unsigned int K=1;
   double eps=0.1;
 
-  statistics Stats;
+  statistics Stats(true);
   app.init(*this);
+  static ratio r=chrono::steady_clock::period();
+  double threshold=5000*r.num/r.den;
   while(true) {
-    Stats.add(app.time(*this,K));
-    double mean=Stats.mean();
-    double min=Stats.min();
+    double t=app.time(*this,K)/K;
+    Stats.add(t);
     double stdev=Stats.stdev();
     if(Stats.count() < 7) continue;
-    int threshold=5000;
-    ratio r=chrono::steady_clock::period();
-    if(min*r.den < threshold*r.num || eps*mean < stdev) {
+    double value=Stats.median();
+    if(value < threshold || eps*value < stdev) {
       K *= 2;
       Stats.clear();
     } else {
-      if(Stdev) *Stdev=stdev/K;
+      if(Stdev) *Stdev=stdev;
       app.clear();
-      return min/K;
+      return value;
     }
   }
   return 0.0;
@@ -308,11 +309,11 @@ double fftBase::report(Application& app)
   double stdev;
   cout << endl;
 
-  double min=mintime(app,&stdev);
+  double median=medianTime(app,&stdev);
 
-  cout << "min=" << min << " stdev=" << stdev << endl;
+  cout << "median=" << median << " stdev=" << stdev << endl;
 
-  return min;
+  return median;
 }
 
 void fftBase::common()
diff --git a/tests/convolve.h b/tests/convolve.h
index 3a611eb5..a8bcbe65 100644
--- a/tests/convolve.h
+++ b/tests/convolve.h
@@ -387,7 +387,7 @@ class fftBase : public ThreadBase {
     return overwrite && A >= B;
   }
 
-  double mintime(Application& app, double *Stdev=NULL);
+  double medianTime(Application& app, double *Stdev=NULL);
   double report(Application& app);
 };
 
@@ -412,7 +412,7 @@ class fftPad : public fftBase {
                 unsigned int m, unsigned int q,unsigned int D,
                 Application &app) {
       fftPad fft(L,M,C,S,m,q,D,app.Threads(),false);
-      return fft.mintime(app);
+      return fft.medianTime(app);
     }
   };
 
@@ -512,7 +512,7 @@ class fftPadCentered : public fftPad {
                 unsigned int m, unsigned int q, unsigned int D,
                 Application &app) {
       fftPadCentered fft(L,M,C,S,m,q,D,app.Threads());
-      return fft.mintime(app);
+      return fft.medianTime(app);
     }
   };
 
@@ -617,7 +617,7 @@ class fftPadHermitian : public fftBase {
                 unsigned int m, unsigned int q, unsigned int D,
                 Application &app) {
       fftPadHermitian fft(L,M,C,m,q,D,app.Threads());
-      return fft.mintime(app);
+      return fft.medianTime(app);
     }
   };
 
diff --git a/tests/explicit.cc b/tests/explicit.cc
index a8acc166..52a97591 100644
--- a/tests/explicit.cc
+++ b/tests/explicit.cc
@@ -4,6 +4,22 @@
 
 namespace fftwpp {
 
+// This multiplication routine is for binary convolutions and takes two inputs
+// of size m.
+// F[0][j] *= F[1][j];
+void multbinary(Complex **F, unsigned int m, unsigned int threads)
+{
+  Complex* F0=F[0];
+  Complex* F1=F[1];
+
+  double ninv=1.0/m;
+
+  PARALLEL(
+    for(unsigned int j=0; j < m; ++j)
+      F0[j] *= ninv*F1[j];
+    );
+}
+
 // This multiplication routine is for binary Hermitian convolutions and takes
 // two inputs.
 // F[0][j] *= F[1][j];
@@ -30,7 +46,7 @@ void ExplicitConvolution::forwards(Complex *f)
   Forwards->fft(f);
 }
 
-void ExplicitConvolution::convolve(Complex **F, multiplier *mult)
+void ExplicitConvolution::convolve(Complex **F, Multiplier *mult)
 {
   const unsigned int A=2;
   const unsigned int B=1;
@@ -40,17 +56,10 @@ void ExplicitConvolution::convolve(Complex **F, multiplier *mult)
     backwards(F[a]);
   }
 
-  (*mult)(F,n,0,NULL,0,threads);
+  (*mult)(F,n,threads);
 
   for(unsigned int b=0; b < B; ++b)
     forwards(F[b]);
-
-  double ninv=1.0/n;
-  for(unsigned int b=0; b < B; ++b) {
-    Complex *f=F[b];
-    for(unsigned int j=0; j < m; ++j)
-      f[j] *= ninv;
-  }
 }
 
 void ExplicitConvolution::convolve(Complex *f, Complex *g)
diff --git a/tests/explicit.h b/tests/explicit.h
index bc38bfe2..84c486df 100644
--- a/tests/explicit.h
+++ b/tests/explicit.h
@@ -5,9 +5,13 @@
 
 namespace fftwpp {
 
+// Specialized mult routines for testing explicit convolutions:
+typedef void Multiplier(Complex **, unsigned int m,
+                        unsigned int threads);
 typedef void Realmultiplier(double **, unsigned int m,
                             unsigned int threads);
 
+Multiplier multbinary;
 Realmultiplier multbinary;
 
 class ExplicitPad : public ThreadBase {
@@ -222,7 +226,7 @@ class ExplicitConvolution : public ExplicitPad {
   void forwards(Complex *f);
 
   // F is an array of pointers to distinct data blocks each of size n.
-  void convolve(Complex **F, multiplier *mult);
+  void convolve(Complex **F, Multiplier *mult);
 
   // Compute f (*) g. The distinct input arrays f and g are each of size n
   // (contents not preserved). The output is returned in f.
diff --git a/tests/hybridconv.cc b/tests/hybridconv.cc
index 56fab9d0..6bf4a911 100644
--- a/tests/hybridconv.cc
+++ b/tests/hybridconv.cc
@@ -14,6 +14,22 @@ unsigned int B=1; // number of outputs
 unsigned int L=512; // input data length
 unsigned int M=1024; // minimum padded length
 
+// This multiplication routine is for binary convolutions and takes
+// two Complex inputs of size n and outputs one Complex value.
+// F0[j] *= F1[j];
+void multbinaryNormalized(Complex **F, unsigned int offset, unsigned int n,
+                unsigned int threads)
+{
+  Complex *F0=F[0]+offset;
+  Complex *F1=F[1]+offset;
+
+  double ninv=1.0/n;
+  PARALLEL(
+    for(unsigned int j=0; j < n; ++j)
+      F0[j] *= ninv*F1[j];
+    );
+}
+
 int main(int argc, char* argv[])
 {
   fftw::maxthreads=get_max_threads();
@@ -62,7 +78,7 @@ int main(int argc, char* argv[])
 #endif
   for(unsigned int k=0; k < K; ++k) {
     seconds();
-    Convolve.convolve(f,multbinary);
+    Convolve.convolveRaw(f,multbinaryNormalized);
     T[k]=seconds();
   }
 
diff --git a/tests/hybridconvh.cc b/tests/hybridconvh.cc
index 823b9dc5..3db34a4b 100644
--- a/tests/hybridconvh.cc
+++ b/tests/hybridconvh.cc
@@ -13,6 +13,22 @@ unsigned int B=1; // number of outputs
 unsigned int L=512; // input data length
 unsigned int M=768; // minimum padded length
 
+// This multiplication routine is for binary convolutions and takes
+// two real inputs of size n.
+// F0[j] *= F1[j];
+void realmultbinaryNormalized(Complex **F, unsigned int offset, unsigned int n,
+                              unsigned int threads)
+{
+  double *F0=(double *) (F[0]+offset);
+  double *F1=(double *) (F[1]+offset);
+
+  double ninv=1.0/n;
+  PARALLEL(
+    for(unsigned int j=0; j < n; ++j)
+      F0[j] *= ninv*F1[j];
+    );
+}
+
 int main(int argc, char* argv[])
 {
   fftw::maxthreads=get_max_threads();
@@ -62,7 +78,7 @@ int main(int argc, char* argv[])
 #endif
   for(unsigned int k=0; k < K; ++k) {
     seconds();
-    Convolve.convolve(f,realmultbinary);
+    Convolve.convolveRaw(f,realmultbinaryNormalized);
     T[k]=seconds();
   }
 
diff --git a/tests/timing.h b/tests/timing.h
index bbf60de4..8a1d7a6a 100644
--- a/tests/timing.h
+++ b/tests/timing.h
@@ -9,6 +9,13 @@
 
 enum timing_algorithm {WRITETOFILE = -1, MEAN, MIN, MAX, MEDIAN, P90, P80, P50};
 
+inline double median(double *T, unsigned int N) // TODO: Replace with selection algorithm
+{
+  std::sort(T,T+N);
+  unsigned int h=N/2;
+  return 2*h == N ? 0.5*(T[h-1]+T[h]) : T[h];
+}
+
 inline double value(double *T, unsigned int N, int algorithm)
 {
   switch(algorithm) {
@@ -18,7 +25,6 @@ inline double value(double *T, unsigned int N, int algorithm)
       for(unsigned int i=0; i < N; ++i)
         sum += T[i];
       return sum/N;
-      break;
     }
     case MIN: {
       double min=T[0];
@@ -27,7 +33,6 @@ inline double value(double *T, unsigned int N, int algorithm)
           min=T[i];
       }
       return min;
-      break;
     }
     case MAX: {
       double max=T[0];
@@ -36,14 +41,9 @@ inline double value(double *T, unsigned int N, int algorithm)
           max=T[i];
       }
       return max;
-      break;
-    }
-    case MEDIAN: { // TODO: Replace with selection algorithm
-      std::sort(T,T+N);
-      unsigned int h=N/2;
-      return 2*h == N ? (T[h-1]+T[h])/2 : T[h];
-      break;
     }
+    case MEDIAN:
+      return median(T,N);
     case P90: {
       std::sort(T,T+N);
       unsigned int start=(int)ceil(N*0.05);
@@ -53,7 +53,6 @@ inline double value(double *T, unsigned int N, int algorithm)
       for(unsigned int i=start; i < stop; ++i)
         sum += T[i];
       return sum/(stop-start);
-      break;
     }
     case P80: {
       std::sort(T,T+N);
@@ -64,7 +63,6 @@ inline double value(double *T, unsigned int N, int algorithm)
       for(unsigned int i=start; i < stop; ++i)
         sum += T[i];
       return sum/(stop-start);
-      break;
     }
     case P50: {
       std::sort(T,T+N);
@@ -75,7 +73,6 @@ inline double value(double *T, unsigned int N, int algorithm)
       for(unsigned int i=start; i < stop; ++i)
         sum += T[i];
       return sum/(stop-start);
-      break;
     }
     default:
       std::cout << "Error: invalid algorithm choice: "
diff --git a/tests/timing.py b/tests/timing.py
index 6ccb4858..bfa85ebd 100755
--- a/tests/timing.py
+++ b/tests/timing.py
@@ -481,7 +481,7 @@ def main(argv):
 
             if dothism:
                 if(hybrid):
-                    mcmd=cmd+["-L"+str(L)]+["-M"+str(M)] #+["-I0"]
+                    mcmd=cmd+["-L"+str(L)]+["-M"+str(M)] #+["-I1"]
                 else:
                     mcmd = cmd + ["-m" + str(m)]