Remove OpenMP code from step3; step3 is only for single-core performance

sarahgorring · Apr 21, 2016 · bb8c6fb · bb8c6fb
1 parent 379064c
commit bb8c6fb
Show file tree

Hide file tree

Showing 7 changed files with 26 additions and 152 deletions.
diff --git a/step3/dgemm/bl_dgemm_ref.c b/step3/dgemm/bl_dgemm_ref.c
@@ -81,7 +81,6 @@ void bl_dgemm_ref(
     dgemm_( "N", "N", &m, &n, &k, &alpha,
             XA, &lda, XB, &ldb, &beta, XC, &ldc );
 #else
-    #pragma omp parallel for private( i, p )
     for ( j = 0; j < n; j ++ ) {
         for ( i = 0; i < m; i ++ ) {
             for ( p = 0; p < k; p ++ ) {

diff --git a/step3/dgemm/bl_dgemm_util.c b/step3/dgemm/bl_dgemm_util.c
@@ -176,99 +176,3 @@ double bl_clock_helper()
 #endif
 
 
-
-
-
-// Code for work assignments
-void bl_get_range( int n, int bf, int* start, int* end )
-{
-	//int      n_way      = thread->n_way;
-	//int      work_id    = thread->work_id;
-    int      n_way      = omp_get_num_threads();
-    int      work_id    = omp_get_thread_num();
-
-
-    //printf( "n: %d, bf: %d, start: %d, end: %d, n_way: %d, work_id: %d\n,", n, bf, *start, *end, n_way, work_id );
-
-	int      all_start  = 0;
-	int      all_end    = n;
-
-	int      size       = all_end - all_start;
-
-	int      n_bf_whole = size / bf;
-	int      n_bf_left  = size % bf;
-
-	int      n_bf_lo    = n_bf_whole / n_way;
-	int      n_bf_hi    = n_bf_whole / n_way;
-
-	// In this function, we partition the space between all_start and
-	// all_end into n_way partitions, each a multiple of block_factor
-	// with the exception of the one partition that receives the
-	// "edge" case (if applicable).
-	//
-	// Here are examples of various thread partitionings, in units of
-	// the block_factor, when n_way = 4. (A '+' indicates the thread
-	// that receives the leftover edge case, i.e., n_bf_left extra
-	// rows/columns in its sub-range.)
-	//                                        (all_start ... all_end)
-	// n_bf_whole  _left  hel  n_th_lo  _hi   thr0  thr1  thr2  thr3
-	//         12     =0    f        0    4      3     3     3     3
-	//         12     >0    f        0    4      3     3     3     3+
-	//         13     >0    f        1    3      4     3     3     3+
-	//         14     >0    f        2    2      4     4     3     3+
-	//         15     >0    f        3    1      4     4     4     3+
-	//         15     =0    f        3    1      4     4     4     3 
-	//
-	//         12     =0    t        4    0      3     3     3     3
-	//         12     >0    t        4    0      3+    3     3     3
-	//         13     >0    t        3    1      3+    3     3     4
-	//         14     >0    t        2    2      3+    3     4     4
-	//         15     >0    t        1    3      3+    4     4     4
-	//         15     =0    t        1    3      3     4     4     4
-
-	// As indicated by the table above, load is balanced as equally
-	// as possible, even in the presence of an edge case.
-
-	// First, we must differentiate between cases where the leftover
-	// "edge" case (n_bf_left) should be allocated to a thread partition
-	// at the low end of the index range or the high end.
-
-		// Notice that if all threads receive the same number of
-		// block_factors, those threads are considered "high" and
-		// the "low" thread group is empty.
-		int n_th_lo = n_bf_whole % n_way;
-		//int n_th_hi = n_way - n_th_lo;
-
-		// If some partitions must have more block_factors than others
-		// assign the slightly larger partitions to lower index threads.
-		if ( n_th_lo != 0 ) n_bf_lo += 1;
-
-		// Compute the actual widths (in units of rows/columns) of
-		// individual threads in the low and high groups.
-		int size_lo = n_bf_lo * bf;
-		int size_hi = n_bf_hi * bf;
-
-		// Precompute the starting indices of the low and high groups.
-		int lo_start = all_start;
-		int hi_start = all_start + n_th_lo * size_lo;
-
-		// Compute the start and end of individual threads' ranges
-		// as a function of their work_ids and also the group to which
-		// they belong (low or high).
-		if ( work_id < n_th_lo )
-		{
-			*start = lo_start + (work_id  ) * size_lo;
-			*end   = lo_start + (work_id+1) * size_lo;
-		}
-		else // if ( n_th_lo <= work_id )
-		{
-			*start = hi_start + (work_id-n_th_lo  ) * size_hi;
-			*end   = hi_start + (work_id-n_th_lo+1) * size_hi;
-
-			// Since the edge case is being allocated to the high
-			// end of the index range, we have to advance the last
-			// thread's end.
-			if ( work_id == n_way - 1 ) *end += n_bf_left;
-		}
-
-}
diff --git a/step3/dgemm/my_dgemm.c b/step3/dgemm/my_dgemm.c
@@ -44,13 +44,10 @@
  * */
 
 #include <stdio.h>
-#include <omp.h>
 
 #include "bl_dgemm_kernel.h"
 #include "bl_dgemm.h"
 
-#define min( i, j ) ( (i)<(j) ? (i): (j) )
-
 inline void packA_mcxkc_d(
         int    m,
         int    k,
@@ -132,15 +129,6 @@ void bl_macro_kernel(
 
     aux.b_next = packB;
 
-    // We can also parallelize with OMP here.
-    //// sequential is the default situation
-    //bl_ic_nt = 1;
-    //// check the environment variable
-    //str = getenv( "BLISLAB_IC_NT" );
-    //if ( str != NULL ) {
-    //    bl_ic_nt = (int)strtol( str, NULL, 10 );
-    //}
-    //#pragma omp parallel for num_threads( bl_ic_nt ) private( j, i, aux )
     for ( j = 0; j < n; j += DGEMM_NR ) {                        // 2-th loop around micro-kernel
         aux.n  = min( n - j, DGEMM_NR );
         for ( i = 0; i < m; i += DGEMM_MR ) {                    // 1-th loop around micro-kernel
@@ -203,7 +191,6 @@ void bl_dgemm(
         for ( pc = 0; pc < k; pc += DGEMM_KC ) {                                   // 4-th loop around micro-kernel
             pb = min( k - pc, DGEMM_KC );
 
-            #pragma omp parallel for num_threads( bl_ic_nt ) private( jr )
             for ( j = 0; j < jb; j += DGEMM_NR ) {
                 packB_kcxnc_d(
                         min( jb - j, DGEMM_NR ),
@@ -215,52 +202,36 @@ void bl_dgemm(
                         );
             }
 
-            //#pragma omp parallel for num_threads( bl_ic_nt ) private( ic, ib, i, ir )
-            #pragma omp parallel num_threads( bl_ic_nt ) private( ic, ib, i, ir )
-            {
-                int     tid      = omp_get_thread_num();
-                int     my_start;
-                int     my_end;
-
-                bl_get_range( m, DGEMM_MR, &my_start, &my_end );
-
-                for ( ic = my_start; ic < my_end; ic += DGEMM_MC ) {              // 3-rd loop around micro-kernel
 
-                    ib = min( my_end - ic, DGEMM_MC );
+            for ( ic = 0; ic < m; ic += DGEMM_MC ) {                               // 3-rd loop around micro-kernel
 
-                    for ( i = 0; i < ib; i += DGEMM_MR ) {
-                        packA_mcxkc_d(
-                                min( ib - i, DGEMM_MR ),
-                                pb,
-                                &XA[ pc * lda ],
-                                m,
-                                ic + i,
-                                &packA[ tid * DGEMM_MC * pb + i * pb ]
-                                );
-                    }
+                ib = min( m - ic, DGEMM_MC );
 
-                    bl_macro_kernel(
-                            ib,
-                            jb,
+                for ( i = 0; i < ib; i += DGEMM_MR ) {
+                    packA_mcxkc_d(
+                            min( ib - i, DGEMM_MR ),
                             pb,
-                            packA  + tid * DGEMM_MC * pb,
-                            packB,
-                            &C[ jc * ldc + ic ], 
-                            ldc
+                            &XA[ pc * lda ],
+                            m,
+                            ic + i,
+                            &packA[ 0 * DGEMM_MC * pb + i * pb ]
                             );
+                }
 
-                }                                                                // End 3.rd loop around micro-kernel
-
-            }
-        }                                                                        // End 4.th loop around micro-kernel
-    }                                                                            // End 5.th loop around micro-kernel
+                bl_macro_kernel(
+                        ib,
+                        jb,
+                        pb,
+                        packA  + 0 * DGEMM_MC * pb,
+                        packB,
+                        &C[ jc * ldc + ic ], 
+                        ldc
+                        );
+            }                                                                     // End 3.rd loop around micro-kernel
+        }                                                                         // End 4.th loop around micro-kernel
+    }                                                                             // End 5.th loop around micro-kernel
 
     free( packA );
     free( packB );
 }
 
-
-//bf = m_R = 8
-
-
-
diff --git a/step3/include/bl_dgemm.h b/step3/include/bl_dgemm.h
@@ -94,6 +94,8 @@ extern "C" {
 
 #include "bl_config.h"
 
+#define min( i, j ) ( (i)<(j) ? (i): (j) )
+
 #define A( i, j )     A[ (j)*lda + (i) ]
 #define B( i, j )     B[ (j)*ldb + (i) ]
 #define C( i, j )     C[ (j)*ldc + (i) ]

diff --git a/step3/test/test_bl_dgemm.c b/step3/test/test_bl_dgemm.c
@@ -44,8 +44,6 @@
  * */
 
 
-#include <omp.h>
-
 #include "bl_dgemm.h"
 
 #define USE_SET_DIFF 1

diff --git a/step4/dgemm/my_dgemm.c b/step4/dgemm/my_dgemm.c
@@ -49,8 +49,6 @@
 #include "bl_dgemm_kernel.h"
 #include "bl_dgemm.h"
 
-#define min( i, j ) ( (i)<(j) ? (i): (j) )
-
 inline void packA_mcxkc_d(
         int    m,
         int    k,

diff --git a/step4/include/bl_dgemm.h b/step4/include/bl_dgemm.h
@@ -94,6 +94,8 @@ extern "C" {
 
 #include "bl_config.h"
 
+#define min( i, j ) ( (i)<(j) ? (i): (j) )
+
 #define A( i, j )     A[ (j)*lda + (i) ]
 #define B( i, j )     B[ (j)*ldb + (i) ]
 #define C( i, j )     C[ (j)*ldc + (i) ]
-Original file line number
+Diff line change
@@ Expand Up / @@ -44,8 +44,6 @@ @@
      * */
-    #include <omp.h>
     #include "bl_dgemm.h"
     #define USE_SET_DIFF 1
@@ Expand Down @@