From 3835bd76c5a7cdce320458c2076ef10c0bb93a1f Mon Sep 17 00:00:00 2001
From: "giovanni.lacopo"
Date: Wed, 13 Sep 2023 16:23:36 +0200
Subject: [PATCH] Checking GPU scalability

---
 gridding_nccl.cpp   |  7 ++++---
 phase_correction.cu |  6 ++++--
 w-stacking.cu       | 18 +++++++++---------
 3 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/gridding_nccl.cpp b/gridding_nccl.cpp
index c1a246b..556ee6d 100644
--- a/gridding_nccl.cpp
+++ b/gridding_nccl.cpp
@@ -190,14 +190,15 @@ void gridding_data(){
 	printf("Processing sector %ld\n",isector);
       #endif
 
-	start = CPU_TIME_wt;
-
+
 	double *stacking_target_array;
 	if ( size > 1 )
 	  stacking_target_array = gridss;
 	else
 	  stacking_target_array = grid;
-
+
+	start = CPU_TIME_wt;
+
 	//We have to call different GPUs per MPI task!!! [GL]
 	wstack(param.num_w_planes,
 	       Nsec,
diff --git a/phase_correction.cu b/phase_correction.cu
index 644f59d..1e882fe 100644
--- a/phase_correction.cu
+++ b/phase_correction.cu
@@ -234,13 +234,15 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
 	}
 #else
 
+	omp_set_default_device(rank % omp_get_num_devices());
+
 #if !defined(__clang__)
-	#pragma omp target teams distribute parallel for collapse(2) simd private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis]) device(rank % omp_get_num_devices())
+	#pragma omp target teams distribute parallel for collapse(2) simd private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis])
 #else
-	#pragma omp target teams distribute parallel for collapse(2) private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis]) device(rank % omp_get_num_devices())
+	#pragma omp target teams distribute parallel for collapse(2) private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis])
 #endif
 	for (int iw=0; iw<num_w_planes; iw++)
diff --git a/w-stacking.cu b/w-stacking.cu
--- a/w-stacking.cu
+++ b/w-stacking.cu
@@ -1,10 +1,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <math.h>
-#include "errcodes.h"
 #ifdef __CUDACC__
 #include "allvars_nccl.h"
-#else
-#include "allvars.h"
 #endif
 #include "proto.h"
@@ -209,7 +206,7 @@ void wstack(
 	int rank)
 {
 	uint i;
-	uint index;
+	//uint index;
 	uint visindex;
 
 	// initialize the convolution kernel
@@ -220,13 +217,14 @@ void wstack(
 	double std22 = 1.0/(2.0*std*std);
 	double norm = std22/PI;
 	double * convkernel = (double*)malloc(increaseprecision*w_support*sizeof(*convkernel));
-	double overSamplingFactor = 1.0;
-	int withSinc = 0;
-	double alpha = 8.6;
+
 #ifdef GAUSS
 	makeGaussKernel(convkernel,w_support,increaseprecision,std22);
 #endif
 #ifdef KAISERBESSEL
+	double overSamplingFactor = 1.0;
+	int withSinc = 0;
+	double alpha = 8.6;
 	makeKaiserBesselKernel(convkernel, w_support, increaseprecision, alpha, overSamplingFactor, withSinc);
 #endif
 
@@ -328,9 +326,9 @@ void wstack(
 #endif
 
 #ifdef ACCOMP
+	omp_set_default_device(rank % omp_get_num_devices());
 	uint Nvis = num_points*freq_per_chan*polarizations;
-	// #pragma omp target data map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan])
-	// #pragma omp target teams distribute parallel for map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom: grid[0:2*num_w_planes*grid_size_x*grid_size_y])
+	#pragma omp target teams distribute parallel for private(visindex) map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom: grid[0:2*num_w_planes*grid_size_x*grid_size_y])
 #else
 	#pragma omp parallel for private(visindex)
 #endif
@@ -422,6 +420,8 @@ void wstack(
 		}
 	}
+
+	#pragma omp target exit data map(delete:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan],grid[0:2*num_w_planes*grid_size_x*grid_size_y])
 // End switch between CUDA and CPU versions
 #endif
 
 	//for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
--
GitLab
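
Note on the device-selection change: instead of repeating a device(rank % omp_get_num_devices()) clause on every target pragma, the patch calls omp_set_default_device(rank % omp_get_num_devices()) once per MPI task, after which every target region offloads to that task's GPU. The standalone sketch below illustrates the pattern; it is not part of the patched sources, and the compile line is only an example (offload flags vary by toolchain).

/* Minimal sketch of rank-to-GPU binding with OpenMP offload + MPI.
 * Build (example): mpicc -fopenmp <offload flags for your compiler> bind_gpu.c
 */
#include <mpi.h>
#include <omp.h>
#include <stdio.h>

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int ndev = omp_get_num_devices();
    if (ndev > 0)                          /* fall back to the host if no GPU */
        omp_set_default_device(rank % ndev);

    int on_host = 1;
    /* With the default device set once, no device() clause is needed here. */
    #pragma omp target map(from:on_host)
    {
        on_host = omp_is_initial_device();
    }

    printf("rank %d -> device %d of %d (offloaded: %s)\n",
           rank, ndev > 0 ? rank % ndev : -1, ndev, on_host ? "no" : "yes");

    MPI_Finalize();
    return 0;
}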
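
Note on the new target exit data pragma: map(delete:...) is the releasing half of OpenMP's unstructured data mapping, which frees device copies without a transfer back. The sketch below shows the enter/exit pairing that construct belongs to; the names n and buf are illustrative, not taken from the patch.

#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    const int n = 1 << 20;
    double *buf = (double *)malloc(n * sizeof(*buf));
    for (int i = 0; i < n; i++) buf[i] = 1.0;

    /* Allocate and copy the array to the default device once. */
    #pragma omp target enter data map(to:buf[0:n])

    /* Several kernels can now reuse the device copy without remapping. */
    #pragma omp target teams distribute parallel for
    for (int i = 0; i < n; i++)
        buf[i] *= 2.0;

    /* Copy results back explicitly... */
    #pragma omp target update from(buf[0:n])

    /* ...then free the device copy with no further transfer. */
    #pragma omp target exit data map(delete:buf[0:n])

    printf("buf[0] = %f\n", buf[0]);
    free(buf);
    return 0;
}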