diff --git a/Build/Makefile.leo b/Build/Makefile.leo index ce350a18f3de19fa7b3f91c9238bfd7d6f0a3d1a..2f19b24f5d6c5d177261cb52bb46e5a143874425 100644 --- a/Build/Makefile.leo +++ b/Build/Makefile.leo @@ -9,8 +9,14 @@ OPT_PURE_MPI = -O4 -march=native -mavx -mavx2 OMP_GPU = -mp=multicore,gpu -gpu=cuda11.8 -gpu=cc80 -CUDA_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/include -CUDA_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/lib64 -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/targets/x86_64-linux/lib/stubs +###CUDA_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/include +###CUDA_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/lib64 -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/targets/x86_64-linux/lib/stubs + + +CUDA_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/cuda/12.3/include +CUDA_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/cuda/12.3/lib -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/cuda/12.3/targets/x86_64-linux/lib/stubs + + FFTW_INCL= FFTW_LIB= @@ -19,18 +25,33 @@ FFTW_LIB= ########################################################## #NVIDIA CUFFTMP -CUFFTMP_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/lib64 -CUFFTMP_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/include/cufftmp +###CUFFTMP_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/lib64 +###CUFFTMP_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/include/cufftmp + +CUFFTMP_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/math_libs/12.3/include/cufftmp +CUFFTMP_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/math_libs/12.3/lib64 + + ########################################################## -NVSHMEM_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/include/ -NVSHMEM_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/lib/ +###NVSHMEM_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/include/ +###NVSHMEM_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/lib/ + +NVSHMEM_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/comm_libs/12.3/nvshmem/include +NVSHMEM_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/comm_libs/12.3/nvshmem/lib + ########################################################## #NVIDIA NCCL REDUCE -NCCL_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nccl/include -NCCL_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nccl/lib +###NCCL_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nccl/include +###NCCL_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nccl/lib + + +NCCL_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/comm_libs/12.3/nccl/include +NCCL_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/comm_libs/12.3/nccl/lib + + ########################################################## NVC = nvc diff --git a/allvars.h b/allvars.h index 5ee68b1374734496aeb08814bb9729737205ecd2..88e062e0cf96e9777acf0ef06c53b039492121a7 100755 --- a/allvars.h +++ b/allvars.h @@ -175,4 +175,3 @@ extern double_t *grid_pointers, *grid, *gridss, *gridss_real, *gridss_img, *grid extern MPI_Comm MYMPI_COMM_WORLD; extern MPI_Win slabwin; - diff --git a/allvars_nccl.h b/allvars_nccl.h index b67b03295fb2c04b1e7f995634447d09552f8e40..d102d6835b548d11e3569e340cedd3321a115039 100755 --- a/allvars_nccl.h +++ b/allvars_nccl.h @@ -173,4 +173,3 @@ extern double_t *grid_pointers, *grid, *gridss, *gridss_real, *gridss_img, *grid extern MPI_Comm MYMPI_COMM_WORLD; extern MPI_Win slabwin; - diff --git a/cuda_fft.cu b/cuda_fft.cu index 8efae0a334105942caecba4c556eb4afdff442f8..3a28d999365d823faa6efcc8179e3ebc49555f00 100755 --- a/cuda_fft.cu +++ b/cuda_fft.cu @@ -89,7 +89,10 @@ void cuda_fft( // Alloco fftwgrid su GPU utilizzando cudaMalloc - mmm=cudaMalloc(&fftwgrid, sizeof(cufftDoubleComplex)*yaxis*xaxis); + + long long unsigned size_finta_fft = (long long unsigned)((long long unsigned)xaxis*(long long unsigned)yaxis); + + mmm=cudaMalloc(&fftwgrid, (size_t)(size_finta_fft*sizeof(cufftDoubleComplex))); if (mmm != cudaSuccess) {printf("!!! cuda_fft.cu cudaMalloc ERROR %d !!!\n", mmm);} int Nth = 32; @@ -149,7 +152,7 @@ void cuda_fft( cudaStreamSynchronize(stream); //Copy the array to be transformed onto the descriptor structure array - mmm = cudaMemcpy(fftwgrid_g->descriptor->data[0], fftwgrid, xaxis*yaxis*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToDevice); + mmm = cudaMemcpy(fftwgrid_g->descriptor->data[0], fftwgrid, (size_t)(size_finta_fft*sizeof(cufftDoubleComplex)), cudaMemcpyDeviceToDevice); if (mmm != cudaSuccess) {printf("!!! cudaMemcpy 1 ERROR %d !!!\n", mmm);} //Perform the FFT @@ -166,7 +169,7 @@ void cuda_fft( if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy dtd fftwgrid ERROR %d !!!\n", status);} //Copy the result descriptor structure array again onto the original fftwgrid - mmm = cudaMemcpy(fftwgrid, fftwgrid_g2->descriptor->data[0], xaxis*yaxis*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToDevice); + mmm = cudaMemcpy(fftwgrid, fftwgrid_g2->descriptor->data[0], (size_t)(size_finta_fft*sizeof(cufftDoubleComplex)), cudaMemcpyDeviceToDevice); if (mmm != cudaSuccess) {printf("!!! cudaMemcpy 2 ERROR %d !!!\n", mmm);} //Write gridss starting from fftwgrid diff --git a/gridding_nccl.cu b/gridding_nccl.cu index b10cb5cc513a0e8f40b514060b722349d7f935a9..36b1d2cd4322b12c77efc4d59ac90ee144b4440f 100755 --- a/gridding_nccl.cu +++ b/gridding_nccl.cu @@ -95,10 +95,12 @@ void gridding_data(){ cudaSetDevice(local_rank); - nnn = cudaMalloc(&grid_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double)); + long long unsigned size_finta = (long long unsigned)(2*(long long unsigned)param.num_w_planes*(long long unsigned)xaxis*(long long unsigned)yaxis); + + nnn = cudaMalloc(&grid_gpu, (size_t)(size_finta*sizeof(double))); if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaMalloc &grid_gpu ERROR %d !!!\n", nnn);} - nnn = cudaMalloc(&gridss_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double)); + nnn = cudaMalloc(&gridss_gpu, (size_t)(size_finta*sizeof(double))); if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaMalloc &gridss_gpu ERROR %d !!!\n", nnn);} nnn = cudaStreamCreate(&stream_reduce); @@ -202,27 +204,27 @@ void gridding_data(){ start = CPU_TIME_wt; //We have to call different GPUs per MPI task!!! [GL] - #ifdef CUDACC - wstack(param.num_w_planes, - Nsec, - metaData.freq_per_chan, - metaData.polarisations, - uus, - vvs, - wws, - visreals, - visimgs, - weightss, - dx, - dw, - param.w_support, - xaxis, - yaxis, - gridss_gpu, - param.num_threads, - rank, - stream_stacking); - #else +#ifdef CUDACC + wstack((long long unsigned)param.num_w_planes, + Nsec, + metaData.freq_per_chan, + metaData.polarisations, + uus, + vvs, + wws, + visreals, + visimgs, + weightss, + dx, + dw, + param.w_support, + (long long unsigned)xaxis, + (long long unsigned)yaxis, + gridss_gpu, + param.num_threads, + rank, + stream_stacking); +#else wstack(param.num_w_planes, Nsec, metaData.freq_per_chan, @@ -241,7 +243,7 @@ void gridding_data(){ gridss, param.num_threads, rank); - #endif +#endif //Allocate memory on devices non-blocking for the host /////////////////////////////////////////////////////// @@ -269,7 +271,7 @@ void gridding_data(){ timing_wt.reduce += CPU_TIME_wt - start; // Go to next sector - nnn = cudaMemset( gridss_gpu, 0.0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) ); + nnn = cudaMemset( gridss_gpu, 0.0, (size_t)(size_finta*sizeof(double)) ); if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaMemset ERROR %d !!!\n", nnn);} } @@ -282,7 +284,7 @@ void gridding_data(){ //cudaMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost, stream_reduce); #if !defined(CUFFTMP) - cudaMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost, stream_reduce); + cudaMemcpyAsync(grid, grid_gpu, (size_t)(size_finta*sizeof(double)), cudaMemcpyDeviceToHost, stream_reduce); cudaStreamSynchronize(stream_reduce); #endif diff --git a/phase_correction.cu b/phase_correction.cu index 5ed1345a90294e2646984a5e67628fb0a5d05b96..9dcee918de696e9f0e30349e78d6da175078833e 100755 --- a/phase_correction.cu +++ b/phase_correction.cu @@ -30,65 +30,65 @@ __global__ void phase_g(int xaxis, double resolution, int nbucket) { - long gid = blockIdx.x*blockDim.x + threadIdx.x; - double add_term_real; - double add_term_img; - double wterm; - long arraysize = (long)((xaxis*yaxis*num_w_planes)/nbucket); - - if(gid < arraysize) + long gid = blockIdx.x*blockDim.x + threadIdx.x; + double add_term_real; + double add_term_img; + double wterm; + long arraysize = (long)((xaxis*yaxis*num_w_planes)/nbucket); + + if(gid < arraysize) + { + long gid_aux = nbucket*gid; + for(int iaux=0; iaux<nbucket; iaux++) { - long gid_aux = nbucket*gid; - for(int iaux=0; iaux<nbucket; iaux++) - { - int iw = gid_aux/(xaxis*yaxis); - int ivaux = gid_aux%(xaxis*yaxis); - int iv = ivaux/xaxis; - int iu = ivaux%xaxis; - long index = 2*gid_aux; - long img_index = iu+iv*xaxis; + int iw = gid_aux/(xaxis*yaxis); + int ivaux = gid_aux%(xaxis*yaxis); + int iv = ivaux/xaxis; + int iu = ivaux%xaxis; + long index = 2*gid_aux; + long img_index = iu+iv*xaxis; - wterm = wmin + iw*dw; + wterm = wmin + iw*dw; #ifdef PHASE_ON - if (num_w_planes > 1) - { - double xcoord = (double)(iu-xaxistot/2); - if(xcoord < 0.0)xcoord = (double)(iu+xaxistot/2); - xcoord = sin(xcoord*resolution); - double ycoord = (double)(iv-yaxistot/2); - if(ycoord < 0.0)ycoord = (double)(iv+yaxistot/2); - ycoord = sin(ycoord*resolution); - - double preal, pimag; - double radius2 = (xcoord*xcoord+ycoord*ycoord); - - preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0)); - pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0)); - - double p,q,r,s; - p = gridss[index]; - q = gridss[index+1]; - r = preal; - s = pimag; - - //printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index); - - add_term_real = (p*r-q*s)*dwnorm*sqrt(1-radius2); - add_term_img = (p*s+q*r)*dwnorm*sqrt(1-radius2); - atomicAdd(&(image_real[img_index]),add_term_real); - atomicAdd(&(image_imag[img_index]),add_term_img); - } else { - atomicAdd(&(image_real[img_index]),gridss[index]); - atomicAdd(&(image_imag[img_index]),gridss[index+1]); - } + if (num_w_planes > 1) + { + double xcoord = (double)(iu-xaxistot/2); + if(xcoord < 0.0)xcoord = (double)(iu+xaxistot/2); + xcoord = sin(xcoord*resolution); + double ycoord = (double)(iv-yaxistot/2); + if(ycoord < 0.0)ycoord = (double)(iv+yaxistot/2); + ycoord = sin(ycoord*resolution); + + double preal, pimag; + double radius2 = (xcoord*xcoord+ycoord*ycoord); + + preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0)); + pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0)); + + double p,q,r,s; + p = gridss[index]; + q = gridss[index+1]; + r = preal; + s = pimag; + + //printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index); + + add_term_real = (p*r-q*s)*dwnorm*sqrt(1-radius2); + add_term_img = (p*s+q*r)*dwnorm*sqrt(1-radius2); + atomicAdd(&(image_real[img_index]),add_term_real); + atomicAdd(&(image_imag[img_index]),add_term_img); + } else { + atomicAdd(&(image_real[img_index]),gridss[index]); + atomicAdd(&(image_imag[img_index]),gridss[index+1]); + } #else - atomicAdd(&(image_real[img_index]),gridss[index]); - atomicAdd(&(image_imag[img_index]),gridss[index+1]); + atomicAdd(&(image_real[img_index]),gridss[index]); + atomicAdd(&(image_imag[img_index]),gridss[index+1]); #endif // end of PHASE_ON - gid_aux++; - } + gid_aux++; } + } } @@ -97,227 +97,229 @@ __global__ void phase_g(int xaxis, void phase_correction(double* gridss, double* image_real, double* image_imag, int xaxis, int yaxis, int num_w_planes, int xaxistot, int yaxistot, double resolution, double wmin, double wmax, int num_threads, int rank) { - double dw = (wmax-wmin)/(double)num_w_planes; - double wterm = wmin+0.5*dw; - double dwnorm = dw/(wmax-wmin); - + double dw = (wmax-wmin)/(double)num_w_planes; + double wterm = wmin+0.5*dw; + double dwnorm = dw/(wmax-wmin); + #ifdef CUDACC - - // WARNING: nbucket MUST be chosen such that xaxis*yaxis*num_w_planes is a multiple of nbucket - int nbucket = 1; - int Nth = NTHREADS; - long Nbl = (long)((num_w_planes*xaxis*yaxis)/Nth/nbucket) + 1; - if(NWORKERS == 1) {Nbl = 1; Nth = 1;}; - - int ndevices; - cudaGetDeviceCount(&ndevices); - cudaSetDevice(rank % ndevices); - - if ( rank == 0 ) { - if (0 == ndevices) { - - shutdown_wstacking(NO_ACCELERATORS_FOUND, "No accelerators found", __FILE__, __LINE__ ); - } - - } - printf("Running rank %d using GPU %d\n", rank, rank % ndevices); - #ifdef NVIDIA - prtAccelInfo(); - #endif - - cudaError_t mmm; - double * image_real_g; - double * image_imag_g; - + + // WARNING: nbucket MUST be chosen such that xaxis*yaxis*num_w_planes is a multiple of nbucket + int nbucket = 1; + int Nth = NTHREADS; + long Nbl = (long)((num_w_planes*xaxis*yaxis)/Nth/nbucket) + 1; + if(NWORKERS == 1) {Nbl = 1; Nth = 1;}; + + int ndevices; + cudaGetDeviceCount(&ndevices); + cudaSetDevice(rank % ndevices); + + if ( rank == 0 ) { + if (0 == ndevices) { + + shutdown_wstacking(NO_ACCELERATORS_FOUND, "No accelerators found", __FILE__, __LINE__ ); + } + + } + printf("Running rank %d using GPU %d\n", rank, rank % ndevices); +#ifdef NVIDIA + prtAccelInfo(); +#endif + + cudaError_t mmm; + double * image_real_g; + double * image_imag_g; + #if !defined(CUFFTMP) - double * gridss_g; - - mmm = cudaMalloc(&gridss_g, 2*num_w_planes*xaxis*yaxis*sizeof(double)); - mmm = cudaMemcpy(gridss_g, gridss, 2*num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyHostToDevice); + double * gridss_g; + + long long unsigned size_finta = (long long unsigned)(2*(long long unsigned)num_w_planes*(long long unsigned)xaxis*(long long unsigned)yaxis); + + mmm = cudaMalloc(&gridss_g, (size_t)(size_finta*sizeof(double))); + mmm = cudaMemcpy(gridss_g, gridss, (size_t)(size_finta*sizeof(double)), cudaMemcpyHostToDevice); #endif - - mmm=cudaMalloc(&image_real_g, xaxis*yaxis*sizeof(double)); - //printf("CUDA ERROR 2 %s\n",cudaGetErrorString(mmm)); - mmm=cudaMalloc(&image_imag_g, xaxis*yaxis*sizeof(double)); - //printf("CUDA ERROR 3 %s\n",cudaGetErrorString(mmm)); - - //printf("CUDA ERROR 4 %s\n",cudaGetErrorString(mmm)); - mmm=cudaMemset(image_real_g, 0.0, xaxis*yaxis*sizeof(double)); - //printf("CUDA ERROR 5 %s\n",cudaGetErrorString(mmm)); - mmm=cudaMemset(image_imag_g, 0.0, xaxis*yaxis*sizeof(double)); - //printf("CUDA ERROR 6 %s\n",cudaGetErrorString(mmm)); - - // call the phase correction kernel - phase_g <<<Nbl,Nth>>> (xaxis, - yaxis, - num_w_planes, + + mmm=cudaMalloc(&image_real_g, xaxis*yaxis*sizeof(double)); + //printf("CUDA ERROR 2 %s\n",cudaGetErrorString(mmm)); + mmm=cudaMalloc(&image_imag_g, xaxis*yaxis*sizeof(double)); + //printf("CUDA ERROR 3 %s\n",cudaGetErrorString(mmm)); + + //printf("CUDA ERROR 4 %s\n",cudaGetErrorString(mmm)); + mmm=cudaMemset(image_real_g, 0.0, xaxis*yaxis*sizeof(double)); + //printf("CUDA ERROR 5 %s\n",cudaGetErrorString(mmm)); + mmm=cudaMemset(image_imag_g, 0.0, xaxis*yaxis*sizeof(double)); + //printf("CUDA ERROR 6 %s\n",cudaGetErrorString(mmm)); + + // call the phase correction kernel + phase_g <<<Nbl,Nth>>> (xaxis, + yaxis, + num_w_planes, #if defined(CUFFTMP) - gridss, + gridss, #else - gridss_g, + gridss_g, #endif - image_real_g, - image_imag_g, - wmin, - dw, - dwnorm, - xaxistot, - yaxistot, - resolution, - nbucket); - - mmm = cudaMemcpy(image_real, image_real_g, xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost); - //printf("CUDA ERROR 7 %s\n",cudaGetErrorString(mmm)); - mmm = cudaMemcpy(image_imag, image_imag_g, xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost); - //printf("CUDA ERROR 8 %s\n",cudaGetErrorString(mmm)); - + image_real_g, + image_imag_g, + wmin, + dw, + dwnorm, + xaxistot, + yaxistot, + resolution, + nbucket); + + mmm = cudaMemcpy(image_real, image_real_g, xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost); + //printf("CUDA ERROR 7 %s\n",cudaGetErrorString(mmm)); + mmm = cudaMemcpy(image_imag, image_imag_g, xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost); + //printf("CUDA ERROR 8 %s\n",cudaGetErrorString(mmm)); + #if !defined(CUFFTMP) - cudaFree(gridss_g); + cudaFree(gridss_g); #else - cudaFree(gridss); + cudaFree(gridss); #endif - - + + #else - + #ifndef ACCOMP - + #ifdef _OPENMP - omp_set_num_threads(num_threads); + omp_set_num_threads(num_threads); #endif - - #pragma omp parallel for collapse(2) private(wterm) - for (int iw=0; iw<num_w_planes; iw++) - { - for (int iv=0; iv<yaxis; iv++) - for (int iu=0; iu<xaxis; iu++) - { - - unsigned int index = 2*(iu+iv*xaxis+xaxis*yaxis*iw); - unsigned int img_index = iu+iv*xaxis; - wterm = wmin + iw*dw; + +#pragma omp parallel for collapse(2) private(wterm) + for (int iw=0; iw<num_w_planes; iw++) + { + for (int iv=0; iv<yaxis; iv++) + for (int iu=0; iu<xaxis; iu++) + { + + unsigned int index = 2*(iu+iv*xaxis+xaxis*yaxis*iw); + unsigned int img_index = iu+iv*xaxis; + wterm = wmin + iw*dw; #ifdef PHASE_ON - if (num_w_planes > 1) - { - double xcoord = (double)(iu-xaxistot/2); - if(xcoord < 0.0)xcoord = (double)(iu+xaxistot/2); - xcoord = sin(xcoord*resolution); - double ycoord = (double)(iv-yaxistot/2); - if(ycoord < 0.0)ycoord = (double)(iv+yaxistot/2); - ycoord = sin(ycoord*resolution); - - double preal, pimag; - double radius2 = (xcoord*xcoord+ycoord*ycoord); - if(xcoord <= 1.0) - { - preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0)); - pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0)); - } else { - preal = cos(-2.0*PI*wterm*(sqrt(radius2-1.0)-1)); - pimag = 0.0; - } - + if (num_w_planes > 1) + { + double xcoord = (double)(iu-xaxistot/2); + if(xcoord < 0.0)xcoord = (double)(iu+xaxistot/2); + xcoord = sin(xcoord*resolution); + double ycoord = (double)(iv-yaxistot/2); + if(ycoord < 0.0)ycoord = (double)(iv+yaxistot/2); + ycoord = sin(ycoord*resolution); + + double preal, pimag; + double radius2 = (xcoord*xcoord+ycoord*ycoord); + if(xcoord <= 1.0) + { preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0)); pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0)); - - double p,q,r,s; - p = gridss[index]; - q = gridss[index+1]; - r = preal; - s = pimag; - - //printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index); - #pragma omp atomic - image_real[img_index] += (p*r-q*s)*dwnorm*sqrt(1-radius2); - #pragma omp atomic - image_imag[img_index] += (p*s+q*r)*dwnorm*sqrt(1-radius2); - } else { - #pragma omp atomic - image_real[img_index] += gridss[index]; - #pragma omp atomic - image_imag[img_index] += gridss[index+1]; + } else { + preal = cos(-2.0*PI*wterm*(sqrt(radius2-1.0)-1)); + pimag = 0.0; } + + preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0)); + pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0)); + + double p,q,r,s; + p = gridss[index]; + q = gridss[index+1]; + r = preal; + s = pimag; + + //printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index); +#pragma omp atomic + image_real[img_index] += (p*r-q*s)*dwnorm*sqrt(1-radius2); +#pragma omp atomic + image_imag[img_index] += (p*s+q*r)*dwnorm*sqrt(1-radius2); + } else { +#pragma omp atomic + image_real[img_index] += gridss[index]; +#pragma omp atomic + image_imag[img_index] += gridss[index+1]; + } #else - #pragma omp atomic - image_real[img_index] += gridss[index]; - #pragma omp atomic - image_imag[img_index] += gridss[index+1]; +#pragma omp atomic + image_real[img_index] += gridss[index]; +#pragma omp atomic + image_imag[img_index] += gridss[index+1]; #endif // end of PHASE_ON - } - } + } + } #else - omp_set_default_device(rank % omp_get_num_devices()); + omp_set_default_device(rank % omp_get_num_devices()); - #if !defined(__clang__) +#if !defined(__clang__) - #pragma omp target teams distribute parallel for collapse(2) simd private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis]) +#pragma omp target teams distribute parallel for collapse(2) simd private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis]) - #else +#else - #pragma omp target teams distribute parallel for collapse(2) private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis]) - #endif +#pragma omp target teams distribute parallel for collapse(2) private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis]) +#endif - for (int iw=0; iw<num_w_planes; iw++) - { - for (int iv=0; iv<yaxis; iv++) - for (int iu=0; iu<xaxis; iu++) - { - - long index = 2*(iu+iv*xaxis+xaxis*yaxis*iw); - long img_index = iu+iv*xaxis; - wterm = wmin + iw*dw; + for (int iw=0; iw<num_w_planes; iw++) + { + for (int iv=0; iv<yaxis; iv++) + for (int iu=0; iu<xaxis; iu++) + { + + long index = 2*(iu+iv*xaxis+xaxis*yaxis*iw); + long img_index = iu+iv*xaxis; + wterm = wmin + iw*dw; #ifdef PHASE_ON - if (num_w_planes > 1) - { - double xcoord = (double)(iu-xaxistot/2); - if(xcoord < 0.0)xcoord = (double)(iu+xaxistot/2); - xcoord = sin(xcoord*resolution); - double ycoord = (double)(iv-yaxistot/2); - if(ycoord < 0.0)ycoord = (double)(iv+yaxistot/2); - ycoord = sin(ycoord*resolution); - - double preal, pimag; - double radius2 = (xcoord*xcoord+ycoord*ycoord); - if(xcoord <= 1.0) - { - preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0)); - pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0)); - } else { - preal = cos(-2.0*PI*wterm*(sqrt(radius2-1.0)-1)); - pimag = 0.0; - } - + if (num_w_planes > 1) + { + double xcoord = (double)(iu-xaxistot/2); + if(xcoord < 0.0)xcoord = (double)(iu+xaxistot/2); + xcoord = sin(xcoord*resolution); + double ycoord = (double)(iv-yaxistot/2); + if(ycoord < 0.0)ycoord = (double)(iv+yaxistot/2); + ycoord = sin(ycoord*resolution); + + double preal, pimag; + double radius2 = (xcoord*xcoord+ycoord*ycoord); + if(xcoord <= 1.0) + { preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0)); pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0)); - - double p,q,r,s; - p = gridss[index]; - q = gridss[index+1]; - r = preal; - s = pimag; - - //printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index); - #pragma omp atomic - image_real[img_index] += (p*r-q*s)*dwnorm*sqrt(1-radius2); - #pragma omp atomic - image_imag[img_index] += (p*s+q*r)*dwnorm*sqrt(1-radius2); - } else { - #pragma omp atomic - image_real[img_index] += gridss[index]; - #pragma omp atomic - image_imag[img_index] += gridss[index+1]; + } else { + preal = cos(-2.0*PI*wterm*(sqrt(radius2-1.0)-1)); + pimag = 0.0; } + + preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0)); + pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0)); + + double p,q,r,s; + p = gridss[index]; + q = gridss[index+1]; + r = preal; + s = pimag; + + //printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index); +#pragma omp atomic + image_real[img_index] += (p*r-q*s)*dwnorm*sqrt(1-radius2); +#pragma omp atomic + image_imag[img_index] += (p*s+q*r)*dwnorm*sqrt(1-radius2); + } else { +#pragma omp atomic + image_real[img_index] += gridss[index]; +#pragma omp atomic + image_imag[img_index] += gridss[index+1]; + } #else - #pragma omp atomic - image_real[img_index] += gridss[index]; - #pragma omp atomic - image_imag[img_index] += gridss[index+1]; +#pragma omp atomic + image_real[img_index] += gridss[index]; +#pragma omp atomic + image_imag[img_index] += gridss[index+1]; #endif // end of PHASE_ON - } - } + } + } #endif #endif // end of __CUDACC__ diff --git a/w-stacking.cu b/w-stacking.cu index 22136113a182c58491db811739376bdb567d1f76..f52889a37fd597ba08328a00c9c75ad7c5efd13c 100755 --- a/w-stacking.cu +++ b/w-stacking.cu @@ -209,329 +209,328 @@ __global__ void convolve_g( #pragma omp declare target #endif void wstack( - int num_w_planes, - uint num_points, - uint freq_per_chan, - uint polarizations, - double* uu, - double* vv, - double* ww, - float* vis_real, - float* vis_img, - float* weight, - double dx, - double dw, - int w_support, - int grid_size_x, - int grid_size_y, - double* grid, - int num_threads, - #ifdef CUDACC - int rank, - cudaStream_t stream_stacking - #else - int rank - #endif - ) - + long long unsigned num_w_planes, + uint num_points, + uint freq_per_chan, + uint polarizations, + double* uu, + double* vv, + double* ww, + float* vis_real, + float* vis_img, + float* weight, + double dx, + double dw, + int w_support, + long long unsigned grid_size_x, + long long unsigned grid_size_y, + double* grid, + int num_threads, +#ifdef NCCL_REDUCE + int rank, + cudaStream_t stream_stacking +#else + int rank +#endif + ) { - uint i; - //uint index; - uint visindex; - - // initialize the convolution kernel - // gaussian: - int KernelLen = (w_support-1)/2; - int increaseprecision = 5; // this number must be odd: increaseprecison*w_support must be odd (w_support must be odd) - double std = 1.0; - double std22 = 1.0/(2.0*std*std); - double norm = std22/PI; - double * convkernel = (double*)malloc(increaseprecision*w_support*sizeof(*convkernel)); - - #ifdef GAUSS - makeGaussKernel(convkernel,w_support,increaseprecision,std22); - #endif - #ifdef KAISERBESSEL - double overSamplingFactor = 1.0; - int withSinc = 0; - double alpha = 8.6; - makeKaiserBesselKernel(convkernel, w_support, increaseprecision, alpha, overSamplingFactor, withSinc); - #endif - - - // Loop over visibilities. -// Switch between CUDA and GPU versions - #ifdef __CUDACC__ - // Define the CUDA set up - int Nth = NTHREADS; - uint Nbl = (uint)(num_points/Nth) + 1; - if(NWORKERS == 1) {Nbl = 1; Nth = 1;}; - uint Nvis = num_points*freq_per_chan*polarizations; - - int ndevices; - cudaGetDeviceCount(&ndevices); - cudaSetDevice(rank % ndevices); - - if ( rank == 0 ) { - if (0 == ndevices) { - - shutdown_wstacking(NO_ACCELERATORS_FOUND, "No accelerators found", __FILE__, __LINE__ ); - } + uint i; + //uint index; + uint visindex; + + // initialize the convolution kernel + // gaussian: + int KernelLen = (w_support-1)/2; + int increaseprecision = 5; // this number must be odd: increaseprecison*w_support must be odd (w_support must be odd) + double std = 1.0; + double std22 = 1.0/(2.0*std*std); + double norm = std22/PI; + double * convkernel = (double*)malloc(increaseprecision*w_support*sizeof(*convkernel)); + +#ifdef GAUSS + makeGaussKernel(convkernel,w_support,increaseprecision,std22); +#endif +#ifdef KAISERBESSEL + double overSamplingFactor = 1.0; + int withSinc = 0; + double alpha = 8.6; + makeKaiserBesselKernel(convkernel, w_support, increaseprecision, alpha, overSamplingFactor, withSinc); +#endif + + + // Loop over visibilities. + // Switch between CUDA and GPU versions +#ifdef __CUDACC__ + // Define the CUDA set up + int Nth = NTHREADS; + uint Nbl = (uint)(num_points/Nth) + 1; + if(NWORKERS == 1) {Nbl = 1; Nth = 1;}; + uint Nvis = num_points*freq_per_chan*polarizations; + + int ndevices; + cudaGetDeviceCount(&ndevices); + cudaSetDevice(rank % ndevices); + + if ( rank == 0 ) { + if (0 == ndevices) { + + shutdown_wstacking(NO_ACCELERATORS_FOUND, "No accelerators found", __FILE__, __LINE__ ); } - - #ifdef NVIDIA - prtAccelInfo(); - #endif - - // Create GPU arrays and offload them - double * uu_g; - double * vv_g; - double * ww_g; - float * vis_real_g; - float * vis_img_g; - float * weight_g; - double * convkernel_g; + } + +#ifdef NVIDIA + prtAccelInfo(); +#endif + + // Create GPU arrays and offload them + double * uu_g; + double * vv_g; + double * ww_g; + float * vis_real_g; + float * vis_img_g; + float * weight_g; + double * convkernel_g; #if !defined(NCCL_REDUCE) - double * grid_g; + double * grid_g; #endif #if !defined(NCCL_REDUCE) - cudaStream_t stream_stacking; - cudaStreamCreate(&stream_stacking); + cudaStream_t stream_stacking; + cudaStreamCreate(&stream_stacking); #endif - - //Create the event inside stream stacking - //cudaEvent_t event_kernel; - - //for (int i=0; i<100000; i++)grid[i]=23.0; - cudaError_t mmm; - //mmm=cudaEventCreate(&event_kernel); - mmm=cudaMalloc(&uu_g,num_points*sizeof(double)); - mmm=cudaMalloc(&vv_g,num_points*sizeof(double)); - mmm=cudaMalloc(&ww_g,num_points*sizeof(double)); - mmm=cudaMalloc(&vis_real_g,Nvis*sizeof(float)); - mmm=cudaMalloc(&vis_img_g,Nvis*sizeof(float)); - mmm=cudaMalloc(&weight_g,(Nvis/freq_per_chan)*sizeof(float)); - //mmm=cudaMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double)); + + //Create the event inside stream stacking + //cudaEvent_t event_kernel; + + //for (int i=0; i<100000; i++)grid[i]=23.0; + cudaError_t mmm; + //mmm=cudaEventCreate(&event_kernel); + mmm=cudaMalloc(&uu_g,num_points*sizeof(double)); + mmm=cudaMalloc(&vv_g,num_points*sizeof(double)); + mmm=cudaMalloc(&ww_g,num_points*sizeof(double)); + mmm=cudaMalloc(&vis_real_g,Nvis*sizeof(float)); + mmm=cudaMalloc(&vis_img_g,Nvis*sizeof(float)); + mmm=cudaMalloc(&weight_g,(Nvis/freq_per_chan)*sizeof(float)); + //mmm=cudaMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double)); #if !defined(NCCL_REDUCE) - mmm = cudaMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double)); + mmm = cudaMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double)); #endif #if !defined(GAUSS_HI_PRECISION) - mmm=cudaMalloc(&convkernel_g,increaseprecision*w_support*sizeof(double)); + mmm=cudaMalloc(&convkernel_g,increaseprecision*w_support*sizeof(double)); #endif - if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMalloc ERROR %d !!!\n", mmm);} + if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMalloc ERROR %d !!!\n", mmm);} #if !defined(NCCL_REDUCE) - mmm=cudaMemset(grid_g,0.0,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double)); - if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMemset ERROR %d !!!\n", mmm);} + mmm=cudaMemset(grid_g,0.0,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double)); + if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMemset ERROR %d !!!\n", mmm);} #endif - mmm=cudaMemcpyAsync(uu_g, uu, num_points*sizeof(double), cudaMemcpyHostToDevice, stream_stacking); - mmm=cudaMemcpyAsync(vv_g, vv, num_points*sizeof(double), cudaMemcpyHostToDevice, stream_stacking); - mmm=cudaMemcpyAsync(ww_g, ww, num_points*sizeof(double), cudaMemcpyHostToDevice, stream_stacking); - mmm=cudaMemcpyAsync(vis_real_g, vis_real, Nvis*sizeof(float), cudaMemcpyHostToDevice, stream_stacking); - mmm=cudaMemcpyAsync(vis_img_g, vis_img, Nvis*sizeof(float), cudaMemcpyHostToDevice, stream_stacking); - mmm=cudaMemcpyAsync(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), cudaMemcpyHostToDevice, stream_stacking); + mmm=cudaMemcpyAsync(uu_g, uu, num_points*sizeof(double), cudaMemcpyHostToDevice, stream_stacking); + mmm=cudaMemcpyAsync(vv_g, vv, num_points*sizeof(double), cudaMemcpyHostToDevice, stream_stacking); + mmm=cudaMemcpyAsync(ww_g, ww, num_points*sizeof(double), cudaMemcpyHostToDevice, stream_stacking); + mmm=cudaMemcpyAsync(vis_real_g, vis_real, Nvis*sizeof(float), cudaMemcpyHostToDevice, stream_stacking); + mmm=cudaMemcpyAsync(vis_img_g, vis_img, Nvis*sizeof(float), cudaMemcpyHostToDevice, stream_stacking); + mmm=cudaMemcpyAsync(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), cudaMemcpyHostToDevice, stream_stacking); - #if !defined(GAUSS_HI_PRECISION) - mmm=cudaMemcpyAsync(convkernel_g, convkernel, increaseprecision*w_support*sizeof(double), cudaMemcpyHostToDevice, stream_stacking); - #endif +#if !defined(GAUSS_HI_PRECISION) + mmm=cudaMemcpyAsync(convkernel_g, convkernel, increaseprecision*w_support*sizeof(double), cudaMemcpyHostToDevice, stream_stacking); +#endif - if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMemcpyAsync ERROR %d !!!\n", mmm);} + if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMemcpyAsync ERROR %d !!!\n", mmm);} - // Call main GPU Kernel + // Call main GPU Kernel #if defined(GAUSS_HI_PRECISION) - convolve_g <<<Nbl,Nth,0,stream_stacking>>> ( - num_w_planes, - num_points, - freq_per_chan, - polarizations, - uu_g, - vv_g, - ww_g, - vis_real_g, - vis_img_g, - weight_g, - dx, - dw, - KernelLen, - grid_size_x, - grid_size_y, + convolve_g <<<Nbl,Nth,0,stream_stacking>>> ( + num_w_planes, + num_points, + freq_per_chan, + polarizations, + uu_g, + vv_g, + ww_g, + vis_real_g, + vis_img_g, + weight_g, + dx, + dw, + KernelLen, + grid_size_x, + grid_size_y, #if !defined(NCCL_REDUCE) - grid_g, + grid_g, #else - grid, + grid, #endif - std22 - ); + std22 + ); #else - convolve_g <<<Nbl,Nth,0,stream_stacking>>> ( - num_w_planes, - num_points, - freq_per_chan, - polarizations, - uu_g, - vv_g, - ww_g, - vis_real_g, - vis_img_g, - weight_g, - dx, - dw, - KernelLen, - grid_size_x, - grid_size_y, + convolve_g <<<Nbl,Nth,0,stream_stacking>>> ( + num_w_planes, + num_points, + freq_per_chan, + polarizations, + uu_g, + vv_g, + ww_g, + vis_real_g, + vis_img_g, + weight_g, + dx, + dw, + KernelLen, + grid_size_x, + grid_size_y, #if !defined(NCCL_REDUCE) - grid_g, + grid_g, #else - grid, + grid, #endif - std22, - convkernel_g - ); + std22, + convkernel_g + ); #endif - mmm=cudaStreamSynchronize(stream_stacking); - //Record the event - //mmm=cudaEventRecord(event_kernel,stream_stacking); + mmm=cudaStreamSynchronize(stream_stacking); + //Record the event + //mmm=cudaEventRecord(event_kernel,stream_stacking); - //Wait until the kernel ends - //mmm=cudaStreamWaitEvent(stream_stacking,event_kernel); + //Wait until the kernel ends + //mmm=cudaStreamWaitEvent(stream_stacking,event_kernel); - //for (int i=0; i<100000; i++)printf("%f\n",grid[i]); + //for (int i=0; i<100000; i++)printf("%f\n",grid[i]); #if !defined(NCCL_REDUCE) - mmm=cudaMemcpy(grid, grid_g, 2*num_w_planes*grid_size_x*grid_size_y*sizeof(double), cudaMemcpyDeviceToHost); + mmm=cudaMemcpy(grid, grid_g, 2*num_w_planes*grid_size_x*grid_size_y*sizeof(double), cudaMemcpyDeviceToHost); #endif - if (mmm != cudaSuccess) - printf("CUDA ERROR %s\n",cudaGetErrorString(mmm)); + if (mmm != cudaSuccess) + printf("CUDA ERROR %s\n",cudaGetErrorString(mmm)); - mmm=cudaFree(uu_g); - mmm=cudaFree(vv_g); - mmm=cudaFree(ww_g); - mmm=cudaFree(vis_real_g); - mmm=cudaFree(vis_img_g); - mmm=cudaFree(weight_g); + mmm=cudaFree(uu_g); + mmm=cudaFree(vv_g); + mmm=cudaFree(ww_g); + mmm=cudaFree(vis_real_g); + mmm=cudaFree(vis_img_g); + mmm=cudaFree(weight_g); #if !defined(NCCL_REDUCE) - mmm=cudaFree(grid_g); + mmm=cudaFree(grid_g); #endif #if !defined(GAUSS_HI_PRECISION) - mmm=cudaFree(convkernel_g); + mmm=cudaFree(convkernel_g); #endif -// Switch between CUDA and GPU versions + // Switch between CUDA and GPU versions # else #ifdef _OPENMP - omp_set_num_threads(num_threads); + omp_set_num_threads(num_threads); #endif - #if defined(ACCOMP) && (GPU_STACKING) - omp_set_default_device(rank % omp_get_num_devices()); - uint Nvis = num_points*freq_per_chan*polarizations; - #pragma omp target teams distribute parallel for private(visindex) map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom:grid[0:2*num_w_planes*grid_size_x*grid_size_y]) +#if defined(ACCOMP) && (GPU_STACKING) + omp_set_default_device(rank % omp_get_num_devices()); + uint Nvis = num_points*freq_per_chan*polarizations; +#pragma omp target teams distribute parallel for private(visindex) map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom:grid[0:2*num_w_planes*grid_size_x*grid_size_y]) #else - #pragma omp parallel for private(visindex) +#pragma omp parallel for private(visindex) #endif - for (i = 0; i < num_points; i++) + for (i = 0; i < num_points; i++) { #ifdef _OPENMP - //int tid; - //tid = omp_get_thread_num(); - //printf("%d\n",tid); + //int tid; + //tid = omp_get_thread_num(); + //printf("%d\n",tid); #endif - visindex = i*freq_per_chan*polarizations; + visindex = i*freq_per_chan*polarizations; - double sum = 0.0; - int j, k; - //if (i%1000 == 0)printf("%ld\n",i); + double sum = 0.0; + int j, k; + //if (i%1000 == 0)printf("%ld\n",i); - /* Convert UV coordinates to grid coordinates. */ - double pos_u = uu[i] / dx; - double pos_v = vv[i] / dx; - double ww_i = ww[i] / dw; + /* Convert UV coordinates to grid coordinates. */ + double pos_u = uu[i] / dx; + double pos_v = vv[i] / dx; + double ww_i = ww[i] / dw; - int grid_w = (int)ww_i; - int grid_u = (int)pos_u; - int grid_v = (int)pos_v; + int grid_w = (int)ww_i; + int grid_u = (int)pos_u; + int grid_v = (int)pos_v; - // check the boundaries - uint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; - uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; - uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; - uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1; - //printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax); + // check the boundaries + uint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; + uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; + uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; + uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1; + //printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax); - // Convolve this point onto the grid. - for (k = kmin; k <= kmax; k++) + // Convolve this point onto the grid. + for (k = kmin; k <= kmax; k++) { - //double v_dist = (double)k+0.5 - pos_v; - double v_dist = (double)k - pos_v; + //double v_dist = (double)k+0.5 - pos_v; + double v_dist = (double)k - pos_v; - for (j = jmin; j <= jmax; j++) + for (j = jmin; j <= jmax; j++) { - //double u_dist = (double)j+0.5 - pos_u; - double u_dist = (double)j - pos_u; - uint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); - int jKer = (int)(increaseprecision * (fabs(u_dist+(double)KernelLen))); - int kKer = (int)(increaseprecision * (fabs(v_dist+(double)KernelLen))); - - #ifdef GAUSS_HI_PRECISION - double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist); - #endif - #ifdef GAUSS - double conv_weight = convkernel[jKer]*convkernel[kKer]; - //if(jKer < 0 || jKer >= 35 || kKer < 0 || kKer >= 35) - // printf("%f %d %f %d\n",fabs(u_dist+(double)KernelLen),jKer,fabs(v_dist+(double)KernelLen),kKer); - //printf("%d %d %d %d %f %f %f %f %f\n",jKer, j, kKer, k, pos_u, pos_v, u_dist,v_dist,conv_weight); - #endif - #ifdef KAISERBESSEL - double conv_weight = convkernel[jKer]*convkernel[kKer]; - #endif - // Loops over frequencies and polarizations - double add_term_real = 0.0; - double add_term_img = 0.0; - uint ifine = visindex; - // DAV: the following two loops are performend by each thread separately: no problems of race conditions - for (uint ifreq=0; ifreq<freq_per_chan; ifreq++) + //double u_dist = (double)j+0.5 - pos_u; + double u_dist = (double)j - pos_u; + uint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); + int jKer = (int)(increaseprecision * (fabs(u_dist+(double)KernelLen))); + int kKer = (int)(increaseprecision * (fabs(v_dist+(double)KernelLen))); + +#ifdef GAUSS_HI_PRECISION + double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist); +#endif +#ifdef GAUSS + double conv_weight = convkernel[jKer]*convkernel[kKer]; + //if(jKer < 0 || jKer >= 35 || kKer < 0 || kKer >= 35) + // printf("%f %d %f %d\n",fabs(u_dist+(double)KernelLen),jKer,fabs(v_dist+(double)KernelLen),kKer); + //printf("%d %d %d %d %f %f %f %f %f\n",jKer, j, kKer, k, pos_u, pos_v, u_dist,v_dist,conv_weight); +#endif +#ifdef KAISERBESSEL + double conv_weight = convkernel[jKer]*convkernel[kKer]; +#endif + // Loops over frequencies and polarizations + double add_term_real = 0.0; + double add_term_img = 0.0; + uint ifine = visindex; + // DAV: the following two loops are performend by each thread separately: no problems of race conditions + for (uint ifreq=0; ifreq<freq_per_chan; ifreq++) { - uint iweight = visindex/freq_per_chan; - for (uint ipol=0; ipol<polarizations; ipol++) - { + uint iweight = visindex/freq_per_chan; + for (uint ipol=0; ipol<polarizations; ipol++) + { if (!isnan(vis_real[ifine])) - { - //printf("%f %ld\n",weight[iweight],iweight); - add_term_real += weight[iweight] * vis_real[ifine] * conv_weight; - add_term_img += weight[iweight] * vis_img[ifine] * conv_weight; - //if(vis_img[ifine]>1e10 || vis_img[ifine]<-1e10)printf("%f %f %f %f %ld %ld\n",vis_real[ifine],vis_img[ifine],weight[iweight],conv_weight,ifine,num_points*freq_per_chan*polarizations); - } + { + //printf("%f %ld\n",weight[iweight],iweight); + add_term_real += weight[iweight] * vis_real[ifine] * conv_weight; + add_term_img += weight[iweight] * vis_img[ifine] * conv_weight; + //if(vis_img[ifine]>1e10 || vis_img[ifine]<-1e10)printf("%f %f %f %f %ld %ld\n",vis_real[ifine],vis_img[ifine],weight[iweight],conv_weight,ifine,num_points*freq_per_chan*polarizations); + } ifine++; iweight++; - } + } } - // DAV: this is the critical call in terms of correctness of the results and of performance - #pragma omp atomic - grid[iKer] += add_term_real; - #pragma omp atomic - grid[iKer+1] += add_term_img; + // DAV: this is the critical call in terms of correctness of the results and of performance +#pragma omp atomic + grid[iKer] += add_term_real; +#pragma omp atomic + grid[iKer+1] += add_term_img; } } } - #if defined(ACCOMP) && (GPU_STACKING) - #pragma omp target exit data map(delete:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan], grid[0:2*num_w_planes*grid_size_x*grid_size_y]) - #endif - // End switch between CUDA and CPU versions +#if defined(ACCOMP) && (GPU_STACKING) +#pragma omp target exit data map(delete:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan], grid[0:2*num_w_planes*grid_size_x*grid_size_y]) +#endif + // End switch between CUDA and CPU versions #endif - //for (int i=0; i<100000; i++)printf("%f\n",grid[i]); + //for (int i=0; i<100000; i++)printf("%f\n",grid[i]); } #ifdef ACCOMP @@ -540,8 +539,8 @@ void wstack( int test(int nnn) { - int mmm; + int mmm; - mmm = nnn+1; - return mmm; + mmm = nnn+1; + return mmm; } diff --git a/w-stacking.h b/w-stacking.h index c4b3aaa8c50bfc4663dbd6da98c20ba365e76bfa..845e923562bb7444365462ba5ad3e92cff252863 100755 --- a/w-stacking.h +++ b/w-stacking.h @@ -15,7 +15,7 @@ extern "C" #ifdef __cplusplus extern "C" { void wstack( - int, + long long unsigned, unsigned int, unsigned int, unsigned int, @@ -28,14 +28,13 @@ void wstack( double, double, int, - int, - int, + long long unsigned, + long long unsigned, double*, int, int, cudaStream_t); } - #else void wstack( int,