Split adjoint sources array

proteus2 · Jun 8, 2017 · 30f6b22 · 30f6b22
1 parent 8edda4e
commit 30f6b22
Show file tree

Hide file tree

Showing 14 changed files with 246 additions and 621 deletions.
diff --git a/src/cuda/compute_add_sources_acoustic_cuda.cu b/src/cuda/compute_add_sources_acoustic_cuda.cu
@@ -117,6 +117,7 @@ void FC_FUNC_(compute_add_sources_ac_cuda,
                                                                               mp->d_kappastore,
                                                                               NSOURCES);
 
+
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("compute_add_sources_ac_cuda");
 #endif
@@ -138,10 +139,9 @@ void FC_FUNC_(compute_add_sources_ac_s3_cuda,
   if (mp->nsources_local == 0) return;
 
   int NSOURCES = *NSOURCESf;
-
   // copies source time factors onto GPU
   print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
-                                     NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
+                                     NSOURCES*sizeof(double),cudaMemcpyHostToDevice),55);
 
   int num_blocks_x, num_blocks_y;
   get_blocks_xy(NSOURCES,&num_blocks_x,&num_blocks_y);
@@ -174,7 +174,12 @@ void FC_FUNC_(compute_add_sources_ac_s3_cuda,
 
 __global__ void add_sources_ac_SIM_TYPE_2_OR_3_kernel(realw* potential_dot_dot_acoustic,
                                                       int nrec,
-                                                      realw* adj_sourcearrays,
+                                                      int it,
+                                                      int NSTEP_BETWEEN_ADJSRC,
+                                                      realw* source_adjoint,
+                                                      realw* xir_store,
+                                                      realw* etar_store,
+                                                      realw* gammar_store,
                                                       int* d_ibool,
                                                       int* ispec_is_acoustic,
                                                       int* ispec_selected_rec,
@@ -197,8 +202,11 @@ __global__ void add_sources_ac_SIM_TYPE_2_OR_3_kernel(realw* potential_dot_dot_a
 
       int iglob = d_ibool[INDEX4_PADDED(NGLLX,NGLLX,NGLLX,i,j,k,ispec)]-1;
 
-      //kappal = kappastore[INDEX4(5,5,5,i,j,k,ispec)];
-
+      realw kappal = kappastore[INDEX4(5,5,5,i,j,k,ispec)];
+      realw xir    = xir_store[INDEX2(nadj_rec_local,irec_local,i)];
+      realw etar   = etar_store[INDEX2(nadj_rec_local,irec_local,j)];
+      realw gammar = gammar_store[INDEX2(nadj_rec_local,irec_local,k)];
+      realw source_adj = source_adjoint[INDEX3(nadj_rec_local,NSTEP_BETWEEN_ADJSRC,irec_local,it,0)];
       //potential_dot_dot_acoustic[iglob] += adj_sourcearrays[INDEX6(nadj_rec_local,NTSTEP_BETWEEN_ADJSRC,3,5,5,
       //                                            pre_computed_irec_local_index[irec],
       //                                            pre_computed_index,
@@ -207,11 +215,10 @@ __global__ void add_sources_ac_SIM_TYPE_2_OR_3_kernel(realw* potential_dot_dot_a
 
       // beware, for acoustic medium, a pressure source would be taking the negative
       // and divide by Kappa of the fluid;
-      // this would have to be done when constructing the adjoint source.
       //
       // note: we take the first component of the adj_sourcearrays
-      //          the idea is to have e.g. a pressure source, where all 3 components would be the same
-      realw stf = adj_sourcearrays[INDEX5(NGLLX,NGLLX,NGLLX,NDIM,i,j,k,0,irec_local)]; // / kappal
+
+      realw stf = - source_adj * xir * etar * gammar / kappal;
 
       atomicAdd(&potential_dot_dot_acoustic[iglob],stf);
 
@@ -229,14 +236,11 @@ __global__ void add_sources_ac_SIM_TYPE_2_OR_3_kernel(realw* potential_dot_dot_a
 extern "C"
 void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda,
               ADD_SOURCES_AC_SIM_2_OR_3_CUDA)(long* Mesh_pointer,
-                                               realw* h_adj_sourcearrays,
-                                               int* h_ispec_is_acoustic,
-                                               int* h_ispec_selected_rec,
-                                               int* nrec,
-                                               int* time_index,
-                                               int* h_islice_selected_rec,
-                                               int* nadj_rec_local,
-                                               int* NTSTEP_BETWEEN_READ_ADJSRC) {
+                                              realw* h_source_adjoint,
+                                              int* nrec,
+                                              int* nadj_rec_local,
+                                              int* NTSTEP_BETWEEN_READ_ADJSRC,
+                                              int* it) {
 
   TRACE("add_sources_ac_sim_2_or_3_cuda");
 
@@ -250,67 +254,26 @@ void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda,
 
   dim3 grid(num_blocks_x,num_blocks_y,1);
   dim3 threads(5,5,5);
-
-  // build slice of adj_sourcearrays because full array is *very* large.
-  // note: this extracts array values for local adjoint sources at given time step "time_index"
-  //          from large adj_sourcearrays array into h_adj_sourcearrays_slice
-  int ispec,i,j,k,irec_local,it_index;
-
-  it_index = (*time_index) - 1;
-  irec_local = 0;
-
-  for(int irec = 0; irec < *nrec; irec++) {
-    if (mp->myrank == h_islice_selected_rec[irec]) {
-      // takes only acoustic sources
-      ispec = h_ispec_selected_rec[irec] - 1;
-
-      // only for acoustic elements
-      if (h_ispec_is_acoustic[ispec]){
-        for(k=0;k<5;k++) {
-          for(j=0;j<5;j++) {
-            for(i=0;i<5;i++) {
-
-              mp->h_adj_sourcearrays_slice[INDEX5(NGLLX,NGLLX,NGLLX,NDIM,i,j,k,0,irec_local)]
-                = h_adj_sourcearrays[INDEX6(mp->nadj_rec_local,
-                                          *NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLX,
-                                          irec_local,it_index,0,i,j,k)];
-
-              mp->h_adj_sourcearrays_slice[INDEX5(NGLLX,NGLLX,NGLLX,NDIM,i,j,k,1,irec_local)]
-                = h_adj_sourcearrays[INDEX6(mp->nadj_rec_local,
-                                          *NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLX,
-                                          irec_local,it_index,1,i,j,k)];
-
-              mp->h_adj_sourcearrays_slice[INDEX5(NGLLX,NGLLX,NGLLX,NDIM,i,j,k,2,irec_local)]
-                = h_adj_sourcearrays[INDEX6(mp->nadj_rec_local,
-                                          *NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLX,
-                                          irec_local,it_index,2,i,j,k)];
-            }
-          }
-        }
-      } // h_ispec_is_acoustic
-
-      // increases local receivers counter
-      irec_local++;
-    }
-  }
-  // check all local sources were added
-  if (irec_local != mp->nadj_rec_local) exit_on_error("irec_local not equal to nadj_rec_local\n");
-
+  int it_index = *NTSTEP_BETWEEN_READ_ADJSRC - (*it-1) % *NTSTEP_BETWEEN_READ_ADJSRC - 1 ;
   // copies extracted array values onto GPU
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,
-                                    (mp->nadj_rec_local)*3*NGLL3*sizeof(realw),cudaMemcpyHostToDevice),99099);
+  if ( (*it-1) % *NTSTEP_BETWEEN_READ_ADJSRC==0) print_CUDA_error_if_any(cudaMemcpy(mp->d_source_adjoint,h_source_adjoint,
+                                                                mp->nadj_rec_local*3*sizeof(realw)*(*NTSTEP_BETWEEN_READ_ADJSRC),cudaMemcpyHostToDevice),99099);
 
   // launches cuda kernel for acoustic adjoint sources
   add_sources_ac_SIM_TYPE_2_OR_3_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_potential_dot_dot_acoustic,
-                                                                                *nrec,
-                                                                                mp->d_adj_sourcearrays,
+                                                                                *nrec,it_index,*NTSTEP_BETWEEN_READ_ADJSRC,
+                                                                                mp->d_source_adjoint,
+                                                                                mp->d_hxir,
+                                                                                mp->d_hetar,
+                                                                                mp->d_hgammar,
                                                                                 mp->d_ibool,
                                                                                 mp->d_ispec_is_acoustic,
                                                                                 mp->d_ispec_selected_rec,
                                                                                 mp->d_pre_computed_irec,
                                                                                 mp->nadj_rec_local,
                                                                                 mp->d_kappastore);
 
+
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("add_sources_acoustic_SIM_TYPE_2_OR_3_kernel");
 #endif

diff --git a/src/cuda/compute_add_sources_viscoelastic_cuda.cu b/src/cuda/compute_add_sources_viscoelastic_cuda.cu
@@ -224,13 +224,18 @@ TRACE("\tadd_source_master_rec_noise_cu");
 /* ----------------------------------------------------------------------------------------------- */
 
 __global__ void add_sources_el_SIM_TYPE_2_OR_3_kernel(realw* accel,
-                                                     int nrec,
-                                                     realw* adj_sourcearrays,
-                                                     int* d_ibool,
-                                                     int* ispec_is_elastic,
-                                                     int* ispec_selected_rec,
-                                                     int* pre_computed_irec,
-                                                     int nadj_rec_local) {
+                                                      int nrec,
+                                                      int it,
+                                                      int NSTEP_BETWEEN_ADJSRC,
+                                                      realw* source_adjoint,
+                                                      realw* xir_store,
+                                                      realw* etar_store,
+                                                      realw* gammar_store,                                                    
+                                                      int* d_ibool,
+                                                      int* ispec_is_elastic,
+                                                      int* ispec_selected_rec,
+                                                      int* pre_computed_irec,
+                                                      int nadj_rec_local) {
 
   int irec_local = blockIdx.x + gridDim.x*blockIdx.y;
 
@@ -245,11 +250,17 @@ __global__ void add_sources_el_SIM_TYPE_2_OR_3_kernel(realw* accel,
       int j = threadIdx.y;
       int k = threadIdx.z;
       int iglob = d_ibool[INDEX4_PADDED(NGLLX,NGLLX,NGLLX,i,j,k,ispec)]-1;
+      realw xir    = xir_store[INDEX2(nadj_rec_local,irec_local,i)];
+      realw etar   = etar_store[INDEX2(nadj_rec_local,irec_local,j)];
+      realw gammar = gammar_store[INDEX2(nadj_rec_local,irec_local,k)];
+      realw source_adj = source_adjoint[INDEX3(nadj_rec_local,NSTEP_BETWEEN_ADJSRC,irec_local,it,0)];
+
+      realw lagrange =   xir * etar * gammar ;
 
       // atomic operations are absolutely necessary for correctness!
-      atomicAdd(&accel[3*iglob],adj_sourcearrays[INDEX5(NGLLX,NGLLX,NGLLX,NDIM,i,j,k,0,irec_local)]);
-      atomicAdd(&accel[1+3*iglob], adj_sourcearrays[INDEX5(NGLLX,NGLLX,NGLLX,NDIM,i,j,k,1,irec_local)]);
-      atomicAdd(&accel[2+3*iglob],adj_sourcearrays[INDEX5(NGLLX,NGLLX,NGLLX,NDIM,i,j,k,2,irec_local)]);
+      atomicAdd(&accel[3*iglob],source_adjoint[INDEX3(nadj_rec_local,NSTEP_BETWEEN_ADJSRC,irec_local,it,0)]*lagrange);
+      atomicAdd(&accel[1+3*iglob],source_adjoint[INDEX3(nadj_rec_local,NSTEP_BETWEEN_ADJSRC,irec_local,it,1)]*lagrange);
+      atomicAdd(&accel[2+3*iglob],source_adjoint[INDEX3(nadj_rec_local,NSTEP_BETWEEN_ADJSRC,irec_local,it,2)]*lagrange);
     } // ispec_is_elastic
   }
 
@@ -260,14 +271,11 @@ __global__ void add_sources_el_SIM_TYPE_2_OR_3_kernel(realw* accel,
 extern "C"
 void FC_FUNC_(add_sources_el_sim_type_2_or_3,
               ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
-                                               realw* h_adj_sourcearrays,
-                                               int* h_ispec_is_elastic,
-                                               int* h_ispec_selected_rec,
-                                               int* nrec,
-                                               int* time_index,
-                                               int* h_islice_selected_rec,
-                                               int* nadj_rec_local,
-                                               int* NTSTEP_BETWEEN_READ_ADJSRC) {
+                                              realw* h_source_adjoint,
+                                              int* nrec,
+                                              int* nadj_rec_local,
+                                              int* NTSTEP_BETWEEN_READ_ADJSRC, 
+                                              int* it) {
 
   TRACE("\tadd_sources_el_sim_type_2_or_3");
 
@@ -282,64 +290,20 @@ void FC_FUNC_(add_sources_el_sim_type_2_or_3,
   dim3 grid(num_blocks_x,num_blocks_y,1);
   dim3 threads(5,5,5);
 
-  // build slice of adj_sourcearrays because full array is *very* large.
-  // note: this extracts array values for local adjoint sources at given time step "time_index"
-  //          from large adj_sourcearrays array into h_adj_sourcearrays_slice
-  int ispec,i,j,k,irec_local,it_index;
-
-  it_index = (*time_index) - 1;
-  irec_local = 0;
-
-  for(int irec = 0; irec < *nrec; irec++) {
-    if (mp->myrank == h_islice_selected_rec[irec]) {
-      // takes only elastic sources
-      ispec = h_ispec_selected_rec[irec] - 1;
-
-      // only for elastic elements
-      if (h_ispec_is_elastic[ispec]){
-
-        for(k=0;k<NGLLX;k++) {
-          for(j=0;j<NGLLX;j++) {
-            for(i=0;i<NGLLX;i++) {
-
-              mp->h_adj_sourcearrays_slice[INDEX5(NGLLX,NGLLX,NGLLX,NDIM,i,j,k,0,irec_local)]
-                      = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
-                                                  *NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLX,
-                                                  irec_local,it_index,0,i,j,k)];
-
-              mp->h_adj_sourcearrays_slice[INDEX5(NGLLX,NGLLX,NGLLX,NDIM,i,j,k,1,irec_local)]
-                      = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
-                                                  *NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLX,
-                                                  irec_local,it_index,1,i,j,k)];
-
-              mp->h_adj_sourcearrays_slice[INDEX5(NGLLX,NGLLX,NGLLX,NDIM,i,j,k,2,irec_local)]
-                      = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
-                                                  *NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLX,
-                                                  irec_local,it_index,2,i,j,k)];
-            }
-          }
-        }
-      } // h_ispec_is_elastic
-
-      // increases local receivers counter
-      irec_local++;
-    }
-  }
-  // check all local sources were added
-  if (irec_local != mp->nadj_rec_local) exit_on_error("irec_local not equal to nadj_rec_local\n");
-
+  int it_index = *NTSTEP_BETWEEN_READ_ADJSRC - (*it-1) % *NTSTEP_BETWEEN_READ_ADJSRC - 1 ;
   // copies extracted array values onto GPU
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,
-                                    (mp->nadj_rec_local)*3*NGLL3*sizeof(realw),cudaMemcpyHostToDevice),98001);
+  if ( (*it-1) % *NTSTEP_BETWEEN_READ_ADJSRC==0) print_CUDA_error_if_any(cudaMemcpy(mp->d_source_adjoint,h_source_adjoint,
+                                                                mp->nadj_rec_local*3*sizeof(realw)*(*NTSTEP_BETWEEN_READ_ADJSRC),cudaMemcpyHostToDevice),99099);
+
 
 
-  // the irec_local variable needs to be precomputed (as
-  // h_pre_comp..), because normally it is in the loop updating accel,
-  // and due to how it's incremented, it cannot be parallelized
 
   add_sources_el_SIM_TYPE_2_OR_3_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,
-                                                                               *nrec,
-                                                                               mp->d_adj_sourcearrays,
+                                                                               *nrec,it_index,*NTSTEP_BETWEEN_READ_ADJSRC,
+                                                                               mp->d_source_adjoint,
+                                                                               mp->d_hxir,
+                                                                               mp->d_hetar,
+                                                                               mp->d_hgammar,
                                                                                mp->d_ibool,
                                                                                mp->d_ispec_is_elastic,
                                                                                mp->d_ispec_selected_rec,

diff --git a/src/cuda/mesh_constants_cuda.h b/src/cuda/mesh_constants_cuda.h
@@ -234,14 +234,14 @@ typedef texture<float, cudaTextureType1D, cudaReadModeElementType> realw_texture
 //   however, compiler tends to use texture loads for restricted memory arrays, which might slow down performance
 //
 // non-restricted (default)
-typedef const realw* realw_const_p;
+//typedef const realw* realw_const_p;
 // restricted
-//typedef const realw* __restrict__ realw_const_p;
+typedef const realw* __restrict__ realw_const_p;
 //
 // non-restricted (default)
-typedef realw* realw_p;
+//typedef realw* realw_p;
 // restricted
-//typedef realw* __restrict__ realw_p;
+typedef realw* __restrict__ realw_p;
 
 // wrapper for global memory load function
 // usage:  val = get_global_cr( &A[index] );
@@ -408,11 +408,11 @@ typedef struct mesh_ {
   realw* d_station_seismo_field;
   realw* h_station_seismo_field;
 
-  double* d_hxir, *d_hetar, *d_hgammar;
+  realw* d_hxir, *d_hetar, *d_hgammar;
   double* d_dxd, *d_dyd, *d_dzd;
   double* d_vxd, *d_vyd, *d_vzd;
   double* d_axd, *d_ayd, *d_azd;
-  realw* d_seismograms_d, *d_seismograms_v, *d_seismograms_a;
+  realw* d_seismograms_d, *d_seismograms_v, *d_seismograms_a, *d_seismograms_p;
   double* d_nu;
 
   realw* h_seismograms_d_it;
@@ -421,6 +421,7 @@ typedef struct mesh_ {
 
   // adjoint receivers/sources
   int nadj_rec_local;
+  realw* d_source_adjoint;
   realw* d_adj_sourcearrays;
   realw* h_adj_sourcearrays_slice;
   int* d_pre_computed_irec;