Skip to content

Commit

Permalink
Split adjoint sources array
Browse files Browse the repository at this point in the history
  • Loading branch information
Etienne Bachmann authored and Etienne Bachmann committed Jun 8, 2017
1 parent 8edda4e commit 30f6b22
Show file tree
Hide file tree
Showing 14 changed files with 246 additions and 621 deletions.
95 changes: 29 additions & 66 deletions src/cuda/compute_add_sources_acoustic_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ void FC_FUNC_(compute_add_sources_ac_cuda,
mp->d_kappastore,
NSOURCES);


#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("compute_add_sources_ac_cuda");
#endif
Expand All @@ -138,10 +139,9 @@ void FC_FUNC_(compute_add_sources_ac_s3_cuda,
if (mp->nsources_local == 0) return;

int NSOURCES = *NSOURCESf;

// copies source time factors onto GPU
print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
NSOURCES*sizeof(double),cudaMemcpyHostToDevice),55);

int num_blocks_x, num_blocks_y;
get_blocks_xy(NSOURCES,&num_blocks_x,&num_blocks_y);
Expand Down Expand Up @@ -174,7 +174,12 @@ void FC_FUNC_(compute_add_sources_ac_s3_cuda,

__global__ void add_sources_ac_SIM_TYPE_2_OR_3_kernel(realw* potential_dot_dot_acoustic,
int nrec,
realw* adj_sourcearrays,
int it,
int NSTEP_BETWEEN_ADJSRC,
realw* source_adjoint,
realw* xir_store,
realw* etar_store,
realw* gammar_store,
int* d_ibool,
int* ispec_is_acoustic,
int* ispec_selected_rec,
Expand All @@ -197,8 +202,11 @@ __global__ void add_sources_ac_SIM_TYPE_2_OR_3_kernel(realw* potential_dot_dot_a

int iglob = d_ibool[INDEX4_PADDED(NGLLX,NGLLX,NGLLX,i,j,k,ispec)]-1;

//kappal = kappastore[INDEX4(5,5,5,i,j,k,ispec)];

realw kappal = kappastore[INDEX4(5,5,5,i,j,k,ispec)];
realw xir = xir_store[INDEX2(nadj_rec_local,irec_local,i)];
realw etar = etar_store[INDEX2(nadj_rec_local,irec_local,j)];
realw gammar = gammar_store[INDEX2(nadj_rec_local,irec_local,k)];
realw source_adj = source_adjoint[INDEX3(nadj_rec_local,NSTEP_BETWEEN_ADJSRC,irec_local,it,0)];
//potential_dot_dot_acoustic[iglob] += adj_sourcearrays[INDEX6(nadj_rec_local,NTSTEP_BETWEEN_ADJSRC,3,5,5,
// pre_computed_irec_local_index[irec],
// pre_computed_index,
Expand All @@ -207,11 +215,10 @@ __global__ void add_sources_ac_SIM_TYPE_2_OR_3_kernel(realw* potential_dot_dot_a

// beware, for acoustic medium, a pressure source would be taking the negative
// and divide by Kappa of the fluid;
// this would have to be done when constructing the adjoint source.
//
// note: we take the first component of the adj_sourcearrays
// the idea is to have e.g. a pressure source, where all 3 components would be the same
realw stf = adj_sourcearrays[INDEX5(NGLLX,NGLLX,NGLLX,NDIM,i,j,k,0,irec_local)]; // / kappal

realw stf = - source_adj * xir * etar * gammar / kappal;

atomicAdd(&potential_dot_dot_acoustic[iglob],stf);

Expand All @@ -229,14 +236,11 @@ __global__ void add_sources_ac_SIM_TYPE_2_OR_3_kernel(realw* potential_dot_dot_a
extern "C"
void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda,
ADD_SOURCES_AC_SIM_2_OR_3_CUDA)(long* Mesh_pointer,
realw* h_adj_sourcearrays,
int* h_ispec_is_acoustic,
int* h_ispec_selected_rec,
int* nrec,
int* time_index,
int* h_islice_selected_rec,
int* nadj_rec_local,
int* NTSTEP_BETWEEN_READ_ADJSRC) {
realw* h_source_adjoint,
int* nrec,
int* nadj_rec_local,
int* NTSTEP_BETWEEN_READ_ADJSRC,
int* it) {

TRACE("add_sources_ac_sim_2_or_3_cuda");

Expand All @@ -250,67 +254,26 @@ void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda,

dim3 grid(num_blocks_x,num_blocks_y,1);
dim3 threads(5,5,5);

// build slice of adj_sourcearrays because full array is *very* large.
// note: this extracts array values for local adjoint sources at given time step "time_index"
// from large adj_sourcearrays array into h_adj_sourcearrays_slice
int ispec,i,j,k,irec_local,it_index;

it_index = (*time_index) - 1;
irec_local = 0;

for(int irec = 0; irec < *nrec; irec++) {
if (mp->myrank == h_islice_selected_rec[irec]) {
// takes only acoustic sources
ispec = h_ispec_selected_rec[irec] - 1;

// only for acoustic elements
if (h_ispec_is_acoustic[ispec]){
for(k=0;k<5;k++) {
for(j=0;j<5;j++) {
for(i=0;i<5;i++) {

mp->h_adj_sourcearrays_slice[INDEX5(NGLLX,NGLLX,NGLLX,NDIM,i,j,k,0,irec_local)]
= h_adj_sourcearrays[INDEX6(mp->nadj_rec_local,
*NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLX,
irec_local,it_index,0,i,j,k)];

mp->h_adj_sourcearrays_slice[INDEX5(NGLLX,NGLLX,NGLLX,NDIM,i,j,k,1,irec_local)]
= h_adj_sourcearrays[INDEX6(mp->nadj_rec_local,
*NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLX,
irec_local,it_index,1,i,j,k)];

mp->h_adj_sourcearrays_slice[INDEX5(NGLLX,NGLLX,NGLLX,NDIM,i,j,k,2,irec_local)]
= h_adj_sourcearrays[INDEX6(mp->nadj_rec_local,
*NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLX,
irec_local,it_index,2,i,j,k)];
}
}
}
} // h_ispec_is_acoustic

// increases local receivers counter
irec_local++;
}
}
// check all local sources were added
if (irec_local != mp->nadj_rec_local) exit_on_error("irec_local not equal to nadj_rec_local\n");

int it_index = *NTSTEP_BETWEEN_READ_ADJSRC - (*it-1) % *NTSTEP_BETWEEN_READ_ADJSRC - 1 ;
// copies extracted array values onto GPU
print_CUDA_error_if_any(cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,
(mp->nadj_rec_local)*3*NGLL3*sizeof(realw),cudaMemcpyHostToDevice),99099);
if ( (*it-1) % *NTSTEP_BETWEEN_READ_ADJSRC==0) print_CUDA_error_if_any(cudaMemcpy(mp->d_source_adjoint,h_source_adjoint,
mp->nadj_rec_local*3*sizeof(realw)*(*NTSTEP_BETWEEN_READ_ADJSRC),cudaMemcpyHostToDevice),99099);

// launches cuda kernel for acoustic adjoint sources
add_sources_ac_SIM_TYPE_2_OR_3_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_potential_dot_dot_acoustic,
*nrec,
mp->d_adj_sourcearrays,
*nrec,it_index,*NTSTEP_BETWEEN_READ_ADJSRC,
mp->d_source_adjoint,
mp->d_hxir,
mp->d_hetar,
mp->d_hgammar,
mp->d_ibool,
mp->d_ispec_is_acoustic,
mp->d_ispec_selected_rec,
mp->d_pre_computed_irec,
mp->nadj_rec_local,
mp->d_kappastore);


#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("add_sources_acoustic_SIM_TYPE_2_OR_3_kernel");
#endif
Expand Down
106 changes: 35 additions & 71 deletions src/cuda/compute_add_sources_viscoelastic_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -224,13 +224,18 @@ TRACE("\tadd_source_master_rec_noise_cu");
/* ----------------------------------------------------------------------------------------------- */

__global__ void add_sources_el_SIM_TYPE_2_OR_3_kernel(realw* accel,
int nrec,
realw* adj_sourcearrays,
int* d_ibool,
int* ispec_is_elastic,
int* ispec_selected_rec,
int* pre_computed_irec,
int nadj_rec_local) {
int nrec,
int it,
int NSTEP_BETWEEN_ADJSRC,
realw* source_adjoint,
realw* xir_store,
realw* etar_store,
realw* gammar_store,
int* d_ibool,
int* ispec_is_elastic,
int* ispec_selected_rec,
int* pre_computed_irec,
int nadj_rec_local) {

int irec_local = blockIdx.x + gridDim.x*blockIdx.y;

Expand All @@ -245,11 +250,17 @@ __global__ void add_sources_el_SIM_TYPE_2_OR_3_kernel(realw* accel,
int j = threadIdx.y;
int k = threadIdx.z;
int iglob = d_ibool[INDEX4_PADDED(NGLLX,NGLLX,NGLLX,i,j,k,ispec)]-1;
realw xir = xir_store[INDEX2(nadj_rec_local,irec_local,i)];
realw etar = etar_store[INDEX2(nadj_rec_local,irec_local,j)];
realw gammar = gammar_store[INDEX2(nadj_rec_local,irec_local,k)];
realw source_adj = source_adjoint[INDEX3(nadj_rec_local,NSTEP_BETWEEN_ADJSRC,irec_local,it,0)];

realw lagrange = xir * etar * gammar ;

// atomic operations are absolutely necessary for correctness!
atomicAdd(&accel[3*iglob],adj_sourcearrays[INDEX5(NGLLX,NGLLX,NGLLX,NDIM,i,j,k,0,irec_local)]);
atomicAdd(&accel[1+3*iglob], adj_sourcearrays[INDEX5(NGLLX,NGLLX,NGLLX,NDIM,i,j,k,1,irec_local)]);
atomicAdd(&accel[2+3*iglob],adj_sourcearrays[INDEX5(NGLLX,NGLLX,NGLLX,NDIM,i,j,k,2,irec_local)]);
atomicAdd(&accel[3*iglob],source_adjoint[INDEX3(nadj_rec_local,NSTEP_BETWEEN_ADJSRC,irec_local,it,0)]*lagrange);
atomicAdd(&accel[1+3*iglob],source_adjoint[INDEX3(nadj_rec_local,NSTEP_BETWEEN_ADJSRC,irec_local,it,1)]*lagrange);
atomicAdd(&accel[2+3*iglob],source_adjoint[INDEX3(nadj_rec_local,NSTEP_BETWEEN_ADJSRC,irec_local,it,2)]*lagrange);
} // ispec_is_elastic
}

Expand All @@ -260,14 +271,11 @@ __global__ void add_sources_el_SIM_TYPE_2_OR_3_kernel(realw* accel,
extern "C"
void FC_FUNC_(add_sources_el_sim_type_2_or_3,
ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
realw* h_adj_sourcearrays,
int* h_ispec_is_elastic,
int* h_ispec_selected_rec,
int* nrec,
int* time_index,
int* h_islice_selected_rec,
int* nadj_rec_local,
int* NTSTEP_BETWEEN_READ_ADJSRC) {
realw* h_source_adjoint,
int* nrec,
int* nadj_rec_local,
int* NTSTEP_BETWEEN_READ_ADJSRC,
int* it) {

TRACE("\tadd_sources_el_sim_type_2_or_3");

Expand All @@ -282,64 +290,20 @@ void FC_FUNC_(add_sources_el_sim_type_2_or_3,
dim3 grid(num_blocks_x,num_blocks_y,1);
dim3 threads(5,5,5);

// build slice of adj_sourcearrays because full array is *very* large.
// note: this extracts array values for local adjoint sources at given time step "time_index"
// from large adj_sourcearrays array into h_adj_sourcearrays_slice
int ispec,i,j,k,irec_local,it_index;

it_index = (*time_index) - 1;
irec_local = 0;

for(int irec = 0; irec < *nrec; irec++) {
if (mp->myrank == h_islice_selected_rec[irec]) {
// takes only elastic sources
ispec = h_ispec_selected_rec[irec] - 1;

// only for elastic elements
if (h_ispec_is_elastic[ispec]){

for(k=0;k<NGLLX;k++) {
for(j=0;j<NGLLX;j++) {
for(i=0;i<NGLLX;i++) {

mp->h_adj_sourcearrays_slice[INDEX5(NGLLX,NGLLX,NGLLX,NDIM,i,j,k,0,irec_local)]
= h_adj_sourcearrays[INDEX6(*nadj_rec_local,
*NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLX,
irec_local,it_index,0,i,j,k)];

mp->h_adj_sourcearrays_slice[INDEX5(NGLLX,NGLLX,NGLLX,NDIM,i,j,k,1,irec_local)]
= h_adj_sourcearrays[INDEX6(*nadj_rec_local,
*NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLX,
irec_local,it_index,1,i,j,k)];

mp->h_adj_sourcearrays_slice[INDEX5(NGLLX,NGLLX,NGLLX,NDIM,i,j,k,2,irec_local)]
= h_adj_sourcearrays[INDEX6(*nadj_rec_local,
*NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLX,
irec_local,it_index,2,i,j,k)];
}
}
}
} // h_ispec_is_elastic

// increases local receivers counter
irec_local++;
}
}
// check all local sources were added
if (irec_local != mp->nadj_rec_local) exit_on_error("irec_local not equal to nadj_rec_local\n");

int it_index = *NTSTEP_BETWEEN_READ_ADJSRC - (*it-1) % *NTSTEP_BETWEEN_READ_ADJSRC - 1 ;
// copies extracted array values onto GPU
print_CUDA_error_if_any(cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,
(mp->nadj_rec_local)*3*NGLL3*sizeof(realw),cudaMemcpyHostToDevice),98001);
if ( (*it-1) % *NTSTEP_BETWEEN_READ_ADJSRC==0) print_CUDA_error_if_any(cudaMemcpy(mp->d_source_adjoint,h_source_adjoint,
mp->nadj_rec_local*3*sizeof(realw)*(*NTSTEP_BETWEEN_READ_ADJSRC),cudaMemcpyHostToDevice),99099);



// the irec_local variable needs to be precomputed (as
// h_pre_comp..), because normally it is in the loop updating accel,
// and due to how it's incremented, it cannot be parallelized

add_sources_el_SIM_TYPE_2_OR_3_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,
*nrec,
mp->d_adj_sourcearrays,
*nrec,it_index,*NTSTEP_BETWEEN_READ_ADJSRC,
mp->d_source_adjoint,
mp->d_hxir,
mp->d_hetar,
mp->d_hgammar,
mp->d_ibool,
mp->d_ispec_is_elastic,
mp->d_ispec_selected_rec,
Expand Down
13 changes: 7 additions & 6 deletions src/cuda/mesh_constants_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -234,14 +234,14 @@ typedef texture<float, cudaTextureType1D, cudaReadModeElementType> realw_texture
// however, compiler tends to use texture loads for restricted memory arrays, which might slow down performance
//
// non-restricted (default)
typedef const realw* realw_const_p;
//typedef const realw* realw_const_p;
// restricted
//typedef const realw* __restrict__ realw_const_p;
typedef const realw* __restrict__ realw_const_p;
//
// non-restricted (default)
typedef realw* realw_p;
//typedef realw* realw_p;
// restricted
//typedef realw* __restrict__ realw_p;
typedef realw* __restrict__ realw_p;

// wrapper for global memory load function
// usage: val = get_global_cr( &A[index] );
Expand Down Expand Up @@ -408,11 +408,11 @@ typedef struct mesh_ {
realw* d_station_seismo_field;
realw* h_station_seismo_field;

double* d_hxir, *d_hetar, *d_hgammar;
realw* d_hxir, *d_hetar, *d_hgammar;
double* d_dxd, *d_dyd, *d_dzd;
double* d_vxd, *d_vyd, *d_vzd;
double* d_axd, *d_ayd, *d_azd;
realw* d_seismograms_d, *d_seismograms_v, *d_seismograms_a;
realw* d_seismograms_d, *d_seismograms_v, *d_seismograms_a, *d_seismograms_p;
double* d_nu;

realw* h_seismograms_d_it;
Expand All @@ -421,6 +421,7 @@ typedef struct mesh_ {

// adjoint receivers/sources
int nadj_rec_local;
realw* d_source_adjoint;
realw* d_adj_sourcearrays;
realw* h_adj_sourcearrays_slice;
int* d_pre_computed_irec;
Expand Down
Loading

0 comments on commit 30f6b22

Please sign in to comment.