Skip to content
This repository has been archived by the owner on Mar 1, 2021. It is now read-only.

Commit

Permalink
Remove omp stuff in step3, step3 is only for single core performance
Browse files Browse the repository at this point in the history
  • Loading branch information
jianyuh committed Apr 21, 2016
1 parent 379064c commit bb8c6fb
Show file tree
Hide file tree
Showing 7 changed files with 26 additions and 152 deletions.
1 change: 0 additions & 1 deletion step3/dgemm/bl_dgemm_ref.c
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,6 @@ void bl_dgemm_ref(
dgemm_( "N", "N", &m, &n, &k, &alpha,
XA, &lda, XB, &ldb, &beta, XC, &ldc );
#else
#pragma omp parallel for private( i, p )
for ( j = 0; j < n; j ++ ) {
for ( i = 0; i < m; i ++ ) {
for ( p = 0; p < k; p ++ ) {
Expand Down
96 changes: 0 additions & 96 deletions step3/dgemm/bl_dgemm_util.c
Original file line number Diff line number Diff line change
Expand Up @@ -176,99 +176,3 @@ double bl_clock_helper()
#endif





// Work assignment helper.
//
// Splits the index range [0, n) among the threads of the current OpenMP
// team and stores the calling thread's half-open sub-range in
// [*start, *end).
//
//   n     : total number of rows/columns to distribute.
//   bf    : block factor; every sub-range is a multiple of bf wide, except
//           that the last thread additionally absorbs the n % bf leftover.
//   start : out, first index owned by the calling thread.
//   end   : out, one past the last index owned by the calling thread.
//
// The n / bf whole blocks are dealt out as evenly as possible. When they do
// not divide evenly, the lowest-numbered threads each take one extra block.
// Examples with 4 threads, in units of bf ('+' marks the thread that also
// receives the leftover edge, if there is one):
//
//   whole blocks   thr0  thr1  thr2  thr3
//        12          3     3     3     3+
//        13          4     3     3     3+
//        14          4     4     3     3+
//        15          4     4     4     3+
void bl_get_range( int n, int bf, int* start, int* end )
{
    const int num_threads = omp_get_num_threads();
    const int tid         = omp_get_thread_num();

    // Whole blocks of width bf, plus the partial "edge" that remains.
    const int whole_blocks = n / bf;
    const int edge         = n % bf;

    // Every thread gets at least base_blocks blocks; the first num_big
    // threads form the "low" group and get one more.
    const int base_blocks = whole_blocks / num_threads;
    const int num_big     = whole_blocks % num_threads;

    // Widths, in rows/columns, of a sub-range in the low and high groups.
    const int big_width   = ( num_big != 0 ? base_blocks + 1 : base_blocks ) * bf;
    const int small_width = base_blocks * bf;

    int first;
    int last;

    if ( tid < num_big ) {
        // Low group: wider sub-ranges packed from index 0.
        first = tid * big_width;
        last  = ( tid + 1 ) * big_width;
    }
    else {
        // High group: narrower sub-ranges that begin where the low group ends.
        const int high_base = num_big * big_width;
        const int rank      = tid - num_big;

        first = high_base + rank * small_width;
        last  = high_base + ( rank + 1 ) * small_width;

        // The last thread also takes the leftover edge at the top of
        // the index range.
        if ( tid == num_threads - 1 ) {
            last += edge;
        }
    }

    *start = first;
    *end   = last;
}
73 changes: 22 additions & 51 deletions step3/dgemm/my_dgemm.c
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,10 @@
* */

#include <stdio.h>
#include <omp.h>

#include "bl_dgemm_kernel.h"
#include "bl_dgemm.h"

#define min( i, j ) ( (i)<(j) ? (i): (j) )

inline void packA_mcxkc_d(
int m,
int k,
Expand Down Expand Up @@ -132,15 +129,6 @@ void bl_macro_kernel(

aux.b_next = packB;

// We can also parallelize with OMP here.
//// sequential is the default situation
//bl_ic_nt = 1;
//// check the environment variable
//str = getenv( "BLISLAB_IC_NT" );
//if ( str != NULL ) {
// bl_ic_nt = (int)strtol( str, NULL, 10 );
//}
//#pragma omp parallel for num_threads( bl_ic_nt ) private( j, i, aux )
for ( j = 0; j < n; j += DGEMM_NR ) { // 2-th loop around micro-kernel
aux.n = min( n - j, DGEMM_NR );
for ( i = 0; i < m; i += DGEMM_MR ) { // 1-th loop around micro-kernel
Expand Down Expand Up @@ -203,7 +191,6 @@ void bl_dgemm(
for ( pc = 0; pc < k; pc += DGEMM_KC ) { // 4-th loop around micro-kernel
pb = min( k - pc, DGEMM_KC );

#pragma omp parallel for num_threads( bl_ic_nt ) private( jr )
for ( j = 0; j < jb; j += DGEMM_NR ) {
packB_kcxnc_d(
min( jb - j, DGEMM_NR ),
Expand All @@ -215,52 +202,36 @@ void bl_dgemm(
);
}

//#pragma omp parallel for num_threads( bl_ic_nt ) private( ic, ib, i, ir )
#pragma omp parallel num_threads( bl_ic_nt ) private( ic, ib, i, ir )
{
int tid = omp_get_thread_num();
int my_start;
int my_end;

bl_get_range( m, DGEMM_MR, &my_start, &my_end );

for ( ic = my_start; ic < my_end; ic += DGEMM_MC ) { // 3-rd loop around micro-kernel

ib = min( my_end - ic, DGEMM_MC );
for ( ic = 0; ic < m; ic += DGEMM_MC ) { // 3-rd loop around micro-kernel

for ( i = 0; i < ib; i += DGEMM_MR ) {
packA_mcxkc_d(
min( ib - i, DGEMM_MR ),
pb,
&XA[ pc * lda ],
m,
ic + i,
&packA[ tid * DGEMM_MC * pb + i * pb ]
);
}
ib = min( m - ic, DGEMM_MC );

bl_macro_kernel(
ib,
jb,
for ( i = 0; i < ib; i += DGEMM_MR ) {
packA_mcxkc_d(
min( ib - i, DGEMM_MR ),
pb,
packA + tid * DGEMM_MC * pb,
packB,
&C[ jc * ldc + ic ],
ldc
&XA[ pc * lda ],
m,
ic + i,
&packA[ 0 * DGEMM_MC * pb + i * pb ]
);
}

} // End 3.rd loop around micro-kernel

}
} // End 4.th loop around micro-kernel
} // End 5.th loop around micro-kernel
bl_macro_kernel(
ib,
jb,
pb,
packA + 0 * DGEMM_MC * pb,
packB,
&C[ jc * ldc + ic ],
ldc
);
} // End 3.rd loop around micro-kernel
} // End 4.th loop around micro-kernel
} // End 5.th loop around micro-kernel

free( packA );
free( packB );
}


//bf = m_R = 8



2 changes: 2 additions & 0 deletions step3/include/bl_dgemm.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@ extern "C" {

#include "bl_config.h"

#define min( i, j ) ( (i)<(j) ? (i): (j) )

#define A( i, j ) A[ (j)*lda + (i) ]
#define B( i, j ) B[ (j)*ldb + (i) ]
#define C( i, j ) C[ (j)*ldc + (i) ]
Expand Down
2 changes: 0 additions & 2 deletions step3/test/test_bl_dgemm.c
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,6 @@
* */


#include <omp.h>

#include "bl_dgemm.h"

#define USE_SET_DIFF 1
Expand Down
2 changes: 0 additions & 2 deletions step4/dgemm/my_dgemm.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,6 @@
#include "bl_dgemm_kernel.h"
#include "bl_dgemm.h"

#define min( i, j ) ( (i)<(j) ? (i): (j) )

inline void packA_mcxkc_d(
int m,
int k,
Expand Down
2 changes: 2 additions & 0 deletions step4/include/bl_dgemm.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@ extern "C" {

#include "bl_config.h"

#define min( i, j ) ( (i)<(j) ? (i): (j) )

#define A( i, j ) A[ (j)*lda + (i) ]
#define B( i, j ) B[ (j)*ldb + (i) ]
#define C( i, j ) C[ (j)*ldc + (i) ]
Expand Down

0 comments on commit bb8c6fb

Please sign in to comment.