diff --git a/Makefile b/Makefile index 61afb16b9a7132c84da5e7c269204bd52c4904b0..5384ac45203b7884b42f0e7be71d23f886dfca99 100644 --- a/Makefile +++ b/Makefile @@ -37,7 +37,7 @@ FFTWLIBS = OPT += -DUSE_FFTW # use omp-ized version of fftw routines -OPT += -DHYBRID_FFTW +#OPT += -DHYBRID_FFTW # write the full 3D cube of gridded visibilities and its FFT transform #OPT += -DWRITE_DATA @@ -73,10 +73,10 @@ OPT += -DPHASE_ON #OPT += -DNVIDIA #use cuda for GPUs -#OPT += -DCUDACC +OPT += -DCUDACC # use GPU acceleration via OMP -OPT += -DACCOMP +#OPT += -DACCOMP # use NVIDIA GPU to perform the reduce #OPT += -DNCCL_REDUCE @@ -85,7 +85,7 @@ OPT += -DACCOMP #OPT += -DRCCL_REDUCE # use GPU to perform FFT -#OPT += -DCUFFTMP +OPT += -DCUFFTMP #support for AMD GPUs #OPT += __HIP_PLATFORM_AMD__ @@ -141,8 +141,13 @@ OBJ_RCCL_REDUCE = gridding_rccl.o # ----- define what files will be compiled by NVCC for Nvidia cufftMP implementation of FFT # -DEPS_ACC_CUFFTMP = w-stacking_omp.h cuda_fft.cpp +ifeq (CUDACC,$(findstring CUDACC,$(OPT))) +DEPS_ACC_CUFFTMP = cuda_fft.cu +OBJ_ACC_CUFFTMP = cuda_fft.o +else +DEPS_ACC_CUFFTMP = cuda_fft.cpp OBJ_ACC_CUFFTMP = cuda_fft.o +endif # ----------------------------------------------------- @@ -173,13 +178,19 @@ w-stacking.c: w-stacking.cu phase_correction.c: phase_correction.cu cp phase_correction.cu phase_correction.c + +cuda_fft.cpp: cuda_fft.cu + cp cuda_fft.cu cuda_fft.cpp else w-stacking.c: w-stacking.cu - rm -f w-stacking.cun + rm -f w-stacking.c touch w-stacking.c phase_correction.c: phase_correction.cu rm -f phase_correction.c touch phase_correction.c +cuda_fft.cpp: cuda_fft.cu + rm -f cuda_fft.cpp + touch cuda_fft.cpp endif @@ -192,7 +203,7 @@ endif ifeq (CUDACC,$(findstring CUDACC,$(OPT))) EXEC_EXT := $(EXEC_EXT)_acc-cuda LINKER=$(MPIC++) -FLAGS=$(OPTIMIZE) +FLAGS=$(OPTIMIZE) LIBS=$(NVLIB) $(OBJ_ACC_CUDA): $(DEPS_ACC_CUDA) $(NVCC) $(OPT) $(OPT_NVCC) $(CFLAGS) -c *.cu $(LIBS) @@ -250,15 +261,28 @@ OBJ += $(OBJ_RCCL_REDUCE) endif ifeq (CUFFTMP,$(findstring CUFFTMP,$(OPT))) + +ifeq (CUDACC,$(findstring CUDACC,$(OPT))) +EXEC_EXT := $(EXEC_EXT)_acc-fft +LINKER=$(MPIC++) +FLAGS=$(OPTIMIZE) +LIBS=$(NVLIB_2) +$(OBJ_ACC_CUFFTMP): $(DEPS_ACC_CUFFTMP) + $(NVCC) $(OPT_NVCC) $(OPT) -c $^ $(LIBS) +OBJ += $(OBJ_ACC_CUFFTMP) + +else + EXEC_EXT := $(EXEC_EXT)_acc-fft LINKER=$(NVC++) FLAGS=$(NVFLAGS) $(CFLAGS) -LIBS=$(NVLIB) $(NVLIB_2) +LIBS=$(NVLIB_2) $(OBJ_ACC_CUFFTMP): $(DEPS_ACC_CUFFTMP) $(NVC++) $(FLAGS) $(OPT) -c $^ $(LIBS) OBJ += $(OBJ_ACC_CUFFTMP) endif +endif ################################################################################### @@ -270,11 +294,11 @@ w-stacking: $(OBJ) $(DEPS) Makefile %.o: %.c $(DEPS) $(MPICC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS) - clean: rm -f *.o rm -f w-stacking.c rm -f phase_correction.c + rm -f cuda_fft.cpp cleanall: rm -f $(EXEC)$(EXT) diff --git a/allvars.h b/allvars.h index 8c8372257bcf1b552347914cd3371c1c93cbca26..9ebf91ad6d69f5d1a02c823485b7aecfa3d31b4e 100644 --- a/allvars.h +++ b/allvars.h @@ -34,9 +34,6 @@ #include "w-stacking.h" #endif -#if defined(CUDACC) -#include -#endif #if defined(NVIDIA) #include diff --git a/allvars_nccl.h b/allvars_nccl.h index b48763ff98c19dd49faf674c481d0720a2eb0b60..032a26ef4f6b0f57d058827b9ee72310dfc6c999 100644 --- a/allvars_nccl.h +++ b/allvars_nccl.h @@ -13,7 +13,7 @@ #include -#if !defined( NCCL_REDUCE ) +#if !defined( NCCL_REDUCE ) && !defined(__CUDACC__) #include #endif diff --git a/main.c b/main.c index 4643c6345d954201ca25aafe05956766b0d213f4..d94e0b9c1b4f3b217e16149b6be51da8eec46842 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,6 @@ #include "allvars.h" #include "proto.h" - void shutdown_wstacking( int errcode, char *message, char *fname, int linenum ) { if ( ( rank == 0 ) && @@ -77,25 +76,6 @@ int main(int argc, char * argv[]) FFT_INIT; - #if defined(CUDACC) - int ndevices; - cudaGetDeviceCount(&ndevices); - cudaSetDevice(rank % ndevices); - - if ( rank == 0 ) { - if (0 == ndevices) { - - shutdown_wstacking(NO_ACCELERATORS_FOUND, "No accelerators found", __FILE__, __LINE__ ); - } - - - printf("Running rank %d/%d using GPU %d\n", rank, size, rank % ndevices); - #ifdef NVIDIA - prtAccelInfo(); - #endif - } - #endif - #ifdef ACCOMP if ( rank == 0 ) { if (0 == omp_get_num_devices()) { diff --git a/phase_correction.cu b/phase_correction.cu index 37974d4f0027296c83578c0d22af674118928239..644f59d67ef8bb60b4dcbf974ba3be1f0905b149 100644 --- a/phase_correction.cu +++ b/phase_correction.cu @@ -11,6 +11,8 @@ #include #include #include +#include "errcodes.h" +#include "proto.h" #ifdef __CUDACC__ @@ -106,7 +108,22 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in int Nth = NTHREADS; long Nbl = (long)((num_w_planes*xaxis*yaxis)/Nth/nbucket) + 1; if(NWORKERS == 1) {Nbl = 1; Nth = 1;}; - + + int ndevices; + cudaGetDeviceCount(&ndevices); + cudaSetDevice(rank % ndevices); + + if ( rank == 0 ) { + if (0 == ndevices) { + + shutdown_wstacking(NO_ACCELERATORS_FOUND, "No accelerators found", __FILE__, __LINE__ ); + } + + } + printf("Running rank %d using GPU %d\n", rank, rank % ndevices); + #ifdef NVIDIA + prtAccelInfo(); + #endif cudaError_t mmm; double * image_real_g; diff --git a/w-stacking.cu b/w-stacking.cu index d5aae633623201675006e897b5bbc612998f2957..c11f622614bbc1d49b37ef63f741a21584757fff 100644 --- a/w-stacking.cu +++ b/w-stacking.cu @@ -5,7 +5,15 @@ #include #include #include +#include "errcodes.h" + +#ifdef __CUDACC__ +#include "allvars_nccl.h" +#else #include "allvars.h" +#endif + +#include "proto.h" #ifdef ACCOMP #pragma omp declare target @@ -109,8 +117,7 @@ __global__ void convolve_g( int grid_size_x, int grid_size_y, double* grid, - double std22, - int rank) + double std22) { //printf("DENTRO AL KERNEL\n"); @@ -226,12 +233,29 @@ void wstack( // Loop over visibilities. // Switch between CUDA and GPU versions -#ifdef __CUDACC__ + #ifdef __CUDACC__ // Define the CUDA set up int Nth = NTHREADS; uint Nbl = (uint)(num_points/Nth) + 1; if(NWORKERS == 1) {Nbl = 1; Nth = 1;}; uint Nvis = num_points*freq_per_chan*polarizations; + + int ndevices; + cudaGetDeviceCount(&ndevices); + cudaSetDevice(rank % ndevices); + + if ( rank == 0 ) { + if (0 == ndevices) { + + shutdown_wstacking(NO_ACCELERATORS_FOUND, "No accelerators found", __FILE__, __LINE__ ); + } + } + + printf("Running rank %d/%d using GPU %d\n", rank, size, rank % ndevices); + #ifdef NVIDIA + prtAccelInfo(); + #endif + printf("Running on GPU with %d threads and %d blocks\n",Nth,Nbl); // Create GPU arrays and offload them @@ -264,7 +288,7 @@ void wstack( mmm=cudaMemcpy(vis_img_g, vis_img, Nvis*sizeof(float), cudaMemcpyHostToDevice); mmm=cudaMemcpy(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), cudaMemcpyHostToDevice); if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMemcpy ERROR %d !!!\n", mmm);} - + // Call main GPU Kernel convolve_g <<>> ( num_w_planes, @@ -396,12 +420,13 @@ void wstack( grid[iKer+1] += add_term_img; } } - + } -// End switch between CUDA and CPU versions + // End switch between CUDA and CPU versions #endif //for (int i=0; i<100000; i++)printf("%f\n",grid[i]); } + #ifdef ACCOMP #pragma omp end declare target #endif diff --git a/w-stacking.h b/w-stacking.h index c74fb06f770359e89c04346459c87329096ac2a9..f06bbbc0ae51dbea747afdb4d805d0c1cb6b8ece 100644 --- a/w-stacking.h +++ b/w-stacking.h @@ -8,8 +8,6 @@ #include - - #ifdef __CUDACC__ extern "C" #endif