Skip to content
Snippets Groups Projects
Commit 3835bd76 authored by Giovanni Lacopo's avatar Giovanni Lacopo
Browse files

Checking GPU scalability

parent 889e3c95
No related branches found
No related tags found
No related merge requests found
......@@ -190,14 +190,15 @@ void gridding_data(){
printf("Processing sector %ld\n",isector);
#endif
start = CPU_TIME_wt;
double *stacking_target_array;
if ( size > 1 )
stacking_target_array = gridss;
else
stacking_target_array = grid;
start = CPU_TIME_wt;
//We have to call different GPUs per MPI task!!! [GL]
wstack(param.num_w_planes,
Nsec,
......
......@@ -234,13 +234,15 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
}
#else
omp_set_default_device(rank % omp_get_num_devices());
#if !defined(__clang__)
#pragma omp target teams distribute parallel for collapse(2) simd private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis]) device(rank % omp_get_num_devices())
#pragma omp target teams distribute parallel for collapse(2) simd private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis])
#else
#pragma omp target teams distribute parallel for collapse(2) private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis]) device(rank % omp_get_num_devices())
#pragma omp target teams distribute parallel for collapse(2) private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis])
#endif
for (int iw=0; iw<num_w_planes; iw++)
......
......@@ -5,12 +5,9 @@
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include "errcodes.h"
#ifdef __CUDACC__
#include "allvars_nccl.h"
#else
#include "allvars.h"
#endif
#include "proto.h"
......@@ -209,7 +206,7 @@ void wstack(
int rank)
{
uint i;
uint index;
//uint index;
uint visindex;
// initialize the convolution kernel
......@@ -220,13 +217,14 @@ void wstack(
double std22 = 1.0/(2.0*std*std);
double norm = std22/PI;
double * convkernel = (double*)malloc(increaseprecision*w_support*sizeof(*convkernel));
double overSamplingFactor = 1.0;
int withSinc = 0;
double alpha = 8.6;
#ifdef GAUSS
makeGaussKernel(convkernel,w_support,increaseprecision,std22);
#endif
#ifdef KAISERBESSEL
double overSamplingFactor = 1.0;
int withSinc = 0;
double alpha = 8.6;
makeKaiserBesselKernel(convkernel, w_support, increaseprecision, alpha, overSamplingFactor, withSinc);
#endif
......@@ -328,9 +326,9 @@ void wstack(
#endif
#ifdef ACCOMP
omp_set_default_device(rank % omp_get_num_devices());
uint Nvis = num_points*freq_per_chan*polarizations;
// #pragma omp target data map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan])
// #pragma omp target teams distribute parallel for map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom: grid[0:2*num_w_planes*grid_size_x*grid_size_y])
#pragma omp target teams distribute parallel for private(visindex) map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom: grid[0:2*num_w_planes*grid_size_x*grid_size_y])
#else
#pragma omp parallel for private(visindex)
#endif
......@@ -422,6 +420,8 @@ void wstack(
}
}
#pragma omp target exit data map(delete:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan],grid[0:2*num_w_planes*grid_size_x*grid_size_y])
// End switch between CUDA and CPU versions
#endif
//for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
......
Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment