From 404e0296d3a267344dec9bd0e4f5832b78f77386 Mon Sep 17 00:00:00 2001
From: "giovanni.lacopo"
Date: Mon, 18 Sep 2023 11:25:47 +0200
Subject: [PATCH] Choose whether to perform stacking on CPUs or GPUs

Add a GPU_STACKING build option. The OpenMP target-offload code in
wstack() is now compiled only when both ACCOMP and GPU_STACKING are
defined, so the offloaded stacking has to be requested explicitly.

The guards use defined(GPU_STACKING) rather than the macro's value, so
they behave the same for -DGPU_STACKING and for an empty #define.

Makefile changes: select GAUSS_HI_PRECISION instead of GAUSS, enable
ACCOMP, build the ACCOMP objects from w-stacking.{h,c} instead of
w-stacking_omp.{h,c}, and fix the HIBRYD_FFTW typo in the FFTW check.

---
 Makefile      | 15 +++++++++------
 w-stacking.cu |  4 ++--
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/Makefile b/Makefile
index 1be9de6..fde90c6 100644
--- a/Makefile
+++ b/Makefile
@@ -59,9 +59,9 @@ OPT += -DPHASE_ON
 #OPT += -DNORMALIZE_UVW
 
 # Gridding kernel: GAUSS, GAUSS_HI_PRECISION, KAISERBESSEL
-#OPT += -DGAUSS_HI_PRECISION
+OPT += -DGAUSS_HI_PRECISION
 
-OPT += -DGAUSS
+#OPT += -DGAUSS
 
 #OPT += -DKAISERBESSEL
 
@@ -77,7 +77,10 @@ OPT += -DGAUSS
 #OPT += -DCUDACC
 
 # use GPU acceleration via OMP
-#OPT += -DACCOMP
+OPT += -DACCOMP
+
+# perform stacking on GPUs (only effective together with ACCOMP)
+#OPT += -DGPU_STACKING
 
 # use NVIDIA GPU to perform the reduce
 #OPT += -DNCCL_REDUCE
@@ -135,8 +138,8 @@ OBJ_ACC_CUDA = phase_correction.o w-stacking.o
 
 # ----- define which files will be compiled by NVC with OMP offloading for wither Nvidia or AMD
 #
-DEPS_ACC_OMP = w-stacking_omp.h phase_correction.c w-stacking_omp.c
-OBJ_ACC_OMP = phase_correction.o w-stacking_omp.o
+DEPS_ACC_OMP = w-stacking.h phase_correction.c w-stacking.c
+OBJ_ACC_OMP = phase_correction.o w-stacking.o
 
 
 # ----- define what files will be compiled by NVC with OMP offloading when the stacking reduce is
@@ -174,7 +177,7 @@ endif
 
 ifeq (USE_FFTW,$(findstring USE_FFTW,$(OPT)))
 CFLAGS += $(FFTW_MPI_INC)
-ifeq (HIBRYD_FFTW,$(findstring HYBRID_FFTW,$(OPT)))
+ifeq (HYBRID_FFTW,$(findstring HYBRID_FFTW,$(OPT)))
 FFTWLIBS = $(FFTW_MPI_LIB) -lfftw3_omp -lfftw3_mpi -lfftw3 -lm
 else
 FFTWLIBS = $(FFTW_MPI_LIB) -lfftw3_mpi -lfftw3 -lm
diff --git a/w-stacking.cu b/w-stacking.cu
index 309fd6e..a01344c 100644
--- a/w-stacking.cu
+++ b/w-stacking.cu
@@ -325,7 +325,7 @@ void wstack(
 omp_set_num_threads(num_threads);
 #endif
 
-#ifdef ACCOMP
+ #if defined(ACCOMP) && defined(GPU_STACKING)
 omp_set_default_device(rank % omp_get_num_devices());
 uint Nvis = num_points*freq_per_chan*polarizations;
 #pragma omp target teams distribute parallel for private(visindex) map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom:grid[0:2*num_w_planes*grid_size_x*grid_size_y])
@@ -420,7 +420,7 @@ void wstack(
 }
 }
 
- #ifdef ACCOMP
+ #if defined(ACCOMP) && defined(GPU_STACKING)
 #pragma omp target exit data map(delete:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan], grid[0:2*num_w_planes*grid_size_x*grid_size_y])
 #endif
 // End switch between CUDA and CPU versions
-- 
GitLab