From 9e323f267f2fb8d81e177470ece8a8eebefc5fa9 Mon Sep 17 00:00:00 2001 From: "giovanni.lacopo" Date: Thu, 9 Nov 2023 10:55:16 +0100 Subject: [PATCH] Stacking/Reduce on GPU with reduced communication --- Makefile | 61 ++++++-- allvars_nccl.h | 2 - gridding_nccl.cpp => gridding_nccl.cu | 48 ++++-- w-stacking.cu | 216 ++++++++++++++++---------- w-stacking.h | 11 +- 5 files changed, 221 insertions(+), 117 deletions(-) rename gridding_nccl.cpp => gridding_nccl.cu (90%) diff --git a/Makefile b/Makefile index fde90c6..f6c49b1 100644 --- a/Makefile +++ b/Makefile @@ -59,9 +59,9 @@ OPT += -DPHASE_ON #OPT += -DNORMALIZE_UVW # Gridding kernel: GAUSS, GAUSS_HI_PRECISION, KAISERBESSEL -OPT += -DGAUSS_HI_PRECISION +#OPT += -DGAUSS_HI_PRECISION -#OPT += -DGAUSS +OPT += -DGAUSS #OPT += -DKAISERBESSEL @@ -77,7 +77,7 @@ OPT += -DGAUSS_HI_PRECISION #OPT += -DCUDACC # use GPU acceleration via OMP -OPT += -DACCOMP +#OPT += -DACCOMP # perform stacking on GPUs #OPT += -DGPU_STACKING @@ -125,7 +125,7 @@ DEPS = w-stacking.h main.c allvars.h # ----- define which files will be compiled by MPICC # # these are the OBJS that will be compiled by C compiler if no acceleration (neither with CUDA nor with OpenMP) is provided -CC_OBJ_NOACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o +CC_OBJ_NOACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform_3d.o result.o numa.o reduce.o w-stacking.o phase_correction.o # these are the OBJs that will be compiled by the normal MPICC compiler if GPU acceleration is switched on CC_OBJ_ACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o @@ -138,17 +138,26 @@ OBJ_ACC_CUDA = phase_correction.o w-stacking.o # ----- define which files will be compiled by NVC with OMP offloading for wither Nvidia or AMD # -DEPS_ACC_OMP = w-stacking.h phase_correction.c w-stacking.c -OBJ_ACC_OMP = phase_correction.o w-stacking.o +DEPS_ACC_OMP = w-stacking.h phase_correction.c w-stacking.c +OBJ_ACC_OMP = phase_correction.o w-stacking.o + # ----- define what files will be compiled by NVC with OMP offloading when the stacking reduce is # offloaded on GPU +ifeq (CUDACC,$(findstring CUDACC,$(OPT))) +DEPS_NCCL_REDUCE = gridding_nccl.cu +OBJ_NCCL_REDUCE = gridding_nccl.o + +DEPS_RCCL_REDUCE = gridding_rccl.cu +OBJ_RCCL_REDUCE = gridding_rccl.o +else DEPS_NCCL_REDUCE = gridding_nccl.cpp OBJ_NCCL_REDUCE = gridding_nccl.o DEPS_RCCL_REDUCE = gridding_rccl.cpp OBJ_RCCL_REDUCE = gridding_rccl.o +endif # ----- define what files will be compiled by NVCC for Nvidia cufftMP implementation of FFT # @@ -196,6 +205,12 @@ phase_correction.c: phase_correction.cu cuda_fft.cpp: cuda_fft.cu cp cuda_fft.cu cuda_fft.cpp + +gridding_nccl.cpp: gridding_nccl.cu + cp gridding_nccl.cu gridding_nccl.cpp + +gridding_rccl.cpp: gridding_rccl.cu + cp gridding_rccl.cu gridding_rccl.cpp else w-stacking.c: w-stacking.cu rm -f w-stacking.c @@ -206,6 +221,12 @@ phase_correction.c: phase_correction.cu cuda_fft.cpp: cuda_fft.cu rm -f cuda_fft.cpp touch cuda_fft.cpp +gridding_nccl.cpp: gridding_nccl.cu + rm -f gridding_nccl.cpp + touch gridding_nccl.cpp +gridding_rccl.cpp: gridding_rccl.cu + rm -f gridding_rccl.cpp + touch gridding_rccl.cpp endif @@ -221,7 +242,7 @@ LINKER=$(MPIC++) FLAGS=$(OPTIMIZE) LIBS=$(NVLIB) $(OBJ_ACC_CUDA): $(DEPS_ACC_CUDA) - $(NVCC) $(OPT) $(OPT_NVCC) $(CFLAGS) -c *.cu $(LIBS) + $(NVCC) $(OPT) $(OPT_NVCC) $(CFLAGS) -c w-stacking.cu phase_correction.cu $(LIBS) OBJ += $(OBJ_ACC_CUDA) endif @@ -256,6 +277,17 @@ endif ifeq (NCCL_REDUCE,$(findstring NCCL_REDUCE,$(OPT))) + +ifeq (CUDACC,$(findstring CUDACC,$(OPT))) +EXEC_EXT := $(EXEC_EXT)_acc-reduce +LINKER=$(MPIC++) +FLAGS=$(OPTIMIZE) +LIBS=$(NVLIB_3) +$(OBJ_NCCL_REDUCE): $(DEPS_NCCL_REDUCE) + $(NVCC) $(OPT_NVCC) $(OPT) -c $^ $(LIBS) +OBJ += $(OBJ_NCCL_REDUCE) +else + EXEC_EXT := $(EXEC_EXT)_acc-reduce LINKER=$(NVC++) FLAGS=$(NVFLAGS) $(CFLAGS) @@ -264,6 +296,7 @@ $(OBJ_NCCL_REDUCE): $(DEPS_NCCL_REDUCE) $(NVC++) $(FLAGS) $(OPT) -c $^ $(LIBS) OBJ += $(OBJ_NCCL_REDUCE) endif +endif ifeq (RCCL_REDUCE,$(findstring RCCL_REDUCE,$(OPT))) EXEC_EXT := $(EXEC_EXT)_acc-reduce @@ -319,9 +352,11 @@ clean: rm -f w-stacking.c rm -f phase_correction.c rm -f cuda_fft.cpp - -cleanall: - rm -f $(EXEC)$(EXT) - rm -f *.o - rm -f w-stacking.c - rm -f phase_correction.c + rm -f gridding_nccl.cpp + rm -f gridding_rccl.cpp + +#cleanall: +# rm -f $(EXEC)$(EXT) +# rm -f *.o +# rm -f w-stacking.c +# rm -f phase_correction.c diff --git a/allvars_nccl.h b/allvars_nccl.h index 032a26e..67b5ed6 100644 --- a/allvars_nccl.h +++ b/allvars_nccl.h @@ -23,8 +23,6 @@ #include #endif - - #if defined(USE_FFTW) && !defined(CUFFTMP) // use MPI fftw #include #endif diff --git a/gridding_nccl.cpp b/gridding_nccl.cu similarity index 90% rename from gridding_nccl.cpp rename to gridding_nccl.cu index 556ee6d..67c8897 100644 --- a/gridding_nccl.cpp +++ b/gridding_nccl.cu @@ -1,4 +1,3 @@ - #include "allvars_nccl.h" #include "proto.h" #include @@ -72,7 +71,7 @@ void gridding_data(){ //Initialize nccl - double * grid_gpu, *gridss_gpu; + double *grid_gpu, *gridss_gpu; int local_rank = 0; uint64_t hostHashs[size]; @@ -87,7 +86,8 @@ void gridding_data(){ ncclUniqueId id; ncclComm_t comm; - cudaStream_t stream_reduce; + cudaError_t nnn; + cudaStream_t stream_reduce, stream_stacking; if (rank == 0) ncclGetUniqueId(&id); MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); @@ -97,6 +97,7 @@ void gridding_data(){ cudaMalloc(&grid_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double)); cudaMalloc(&gridss_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double)); cudaStreamCreate(&stream_reduce); + cudaStreamCreate(&stream_stacking); ncclCommInitRank(&comm, size, id, rank); @@ -189,17 +190,38 @@ void gridding_data(){ #ifdef VERBOSE printf("Processing sector %ld\n",isector); #endif - - + double *stacking_target_array; if ( size > 1 ) stacking_target_array = gridss; else stacking_target_array = grid; + start = CPU_TIME_wt; //We have to call different GPUs per MPI task!!! [GL] + #ifdef CUDACC + wstack(param.num_w_planes, + Nsec, + metaData.freq_per_chan, + metaData.polarisations, + uus, + vvs, + wws, + visreals, + visimgs, + weightss, + dx, + dw, + param.w_support, + xaxis, + yaxis, + gridss_gpu, + param.num_threads, + rank, + stream_stacking); + #else wstack(param.num_w_planes, Nsec, metaData.freq_per_chan, @@ -218,13 +240,12 @@ void gridding_data(){ stacking_target_array, param.num_threads, rank); + #endif //Allocate memory on devices non-blocking for the host /////////////////////////////////////////////////////// timing_wt.kernel += CPU_TIME_wt - start; - - cudaMemcpyAsync(gridss_gpu, gridss, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyHostToDevice, stream_reduce); #ifdef VERBOSE printf("Processed sector %ld\n",isector); @@ -239,8 +260,6 @@ void gridding_data(){ // int target_rank = (int)isector; it implied that size >= nsectors int target_rank = (int)(isector % size); - cudaStreamSynchronize(stream_reduce); - start = CPU_TIME_wt; ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce); @@ -248,26 +267,25 @@ void gridding_data(){ timing_wt.reduce += CPU_TIME_wt - start; - // Go to next sector - memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) ); - } + nnn = cudaMemset( gridss_gpu, 0.0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) ); + if (nnn != cudaSuccess) {printf("!!! w-stacking.cu cudaMemset ERROR %d !!!\n", nnn);} + } free(memory); } //Copy data back from device to host (to be deleted in next steps) - cudaMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost, stream_reduce); MPI_Barrier(MPI_COMM_WORLD); cudaStreamSynchronize(stream_reduce); - cudaFree(gridss_gpu); cudaFree(grid_gpu); - + cudaFree(gridss_gpu); cudaStreamDestroy(stream_reduce); + cudaStreamDestroy(stream_stacking); ncclCommDestroy(comm); diff --git a/w-stacking.cu b/w-stacking.cu index a01344c..98804ca 100644 --- a/w-stacking.cu +++ b/w-stacking.cu @@ -98,88 +98,101 @@ void makeKaiserBesselKernel(double * kernel, //} __global__ void convolve_g( - int num_w_planes, - uint num_points, - uint freq_per_chan, - uint polarizations, - double* uu, - double* vv, - double* ww, - float* vis_real, - float* vis_img, - float* weight, - double dx, - double dw, - int KernelLen, - int grid_size_x, - int grid_size_y, - double* grid, - double std22) + int num_w_planes, + uint num_points, + uint freq_per_chan, + uint polarizations, + double* uu, + double* vv, + double* ww, + float* vis_real, + float* vis_img, + float* weight, + double dx, + double dw, + int KernelLen, + int grid_size_x, + int grid_size_y, + double* grid, + double std22) + + { - //printf("DENTRO AL KERNEL\n"); - uint gid = blockIdx.x*blockDim.x + threadIdx.x; - if(gid < num_points) - { - uint i = gid; - uint visindex = i*freq_per_chan*polarizations; - double norm = std22/PI; + //printf("DENTRO AL KERNEL\n"); + uint gid = blockIdx.x*blockDim.x + threadIdx.x; + if(gid < num_points) + { + uint i = gid; + uint visindex = i*freq_per_chan*polarizations; + double norm = std22/PI; - int j, k; + int j, k; - /* Convert UV coordinates to grid coordinates. */ - double pos_u = uu[i] / dx; - double pos_v = vv[i] / dx; - double ww_i = ww[i] / dw; - int grid_w = (int)ww_i; - int grid_u = (int)pos_u; - int grid_v = (int)pos_v; + /* Convert UV coordinates to grid coordinates. */ + double pos_u = uu[i] / dx; + double pos_v = vv[i] / dx; + double ww_i = ww[i] / dw; - // check the boundaries - uint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; - uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; - uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; - uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1; - //printf("%ld, %ld, %ld, %ld\n",jmin,jmax,kmin,kmax); + int grid_w = (int)ww_i; + int grid_u = (int)pos_u; + int grid_v = (int)pos_v; + // check the boundaries + uint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; + uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; + uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; + uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1; - // Convolve this point onto the grid. - for (k = kmin; k <= kmax; k++) - { - double v_dist = (double)k+0.5 - pos_v; + // Convolve this point onto the grid. + for (k = kmin; k <= kmax; k++) + { - for (j = jmin; j <= jmax; j++) + double v_dist = (double)k - pos_v; + int increaseprecision = 5; + + for (j = jmin; j <= jmax; j++) { - double u_dist = (double)j+0.5 - pos_u; - uint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); - //printf("--> %ld %d %d %d\n",iKer,j,k,grid_w); - - double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist); - // Loops over frequencies and polarizations - double add_term_real = 0.0; - double add_term_img = 0.0; - uint ifine = visindex; - for (uint ifreq=0; ifreq>> ( + convolve_g <<>> ( num_w_planes, num_points, freq_per_chan, @@ -304,20 +337,33 @@ void wstack( KernelLen, grid_size_x, grid_size_y, - grid_g, - std22); + grid, + std22 + ); - mmm = cudaMemcpy(grid, grid_g, 2*num_w_planes*grid_size_x*grid_size_y*sizeof(double), cudaMemcpyDeviceToHost); + mmm=cudaStreamSynchronize(stream_stacking); + //Record the event + //mmm=cudaEventRecord(event_kernel,stream_stacking); + + //Wait until the kernel ends + //mmm=cudaStreamWaitEvent(stream_stacking,event_kernel); + //for (int i=0; i<100000; i++)printf("%f\n",grid[i]); + printf("CUDA ERROR %s\n",cudaGetErrorString(mmm)); + mmm=cudaFree(uu_g); mmm=cudaFree(vv_g); mmm=cudaFree(ww_g); mmm=cudaFree(vis_real_g); mmm=cudaFree(vis_img_g); mmm=cudaFree(weight_g); - mmm=cudaFree(grid_g); - + //mmm=cudaFree(grid_g); + /* + #if !defined(GAUSS_HI_PRECISION) + mmm=cudaFree(convkernel_g); + #endif + */ // Switch between CUDA and GPU versions # else diff --git a/w-stacking.h b/w-stacking.h index c953ee6..c4b3aaa 100644 --- a/w-stacking.h +++ b/w-stacking.h @@ -32,10 +32,11 @@ void wstack( int, double*, int, - int); + int, + cudaStream_t); } -#else +#else void wstack( int, unsigned int, @@ -58,6 +59,7 @@ void wstack( #endif + #ifdef __CUDACC__ extern "C" #endif @@ -81,6 +83,11 @@ void phase_correction( int, int); +double gauss_kernel_norm( + double norm, + double std22, + double u_dist, + double v_dist); #ifdef ACCOMP #pragma omp declare target (gauss_kernel_norm) -- GitLab