diff --git a/Makefile b/Makefile index cfa29569c6eb59db4e077c4a4329e1719872cfd1..b083ad76df666a7315de05d6999796851d4dfcca 100755 --- a/Makefile +++ b/Makefile @@ -63,11 +63,11 @@ OPT += -DPHASE_ON # SELECT THE GRIDDING KERNEL: GAUSS, GAUSS_HI_PRECISION, KAISERBESSEL -OPT += -DGAUSS_HI_PRECISION +#OPT += -DGAUSS_HI_PRECISION #OPT += -DGAUSS -#OPT += -DKAISERBESSEL +OPT += -DKAISERBESSEL # ======================================================== @@ -77,16 +77,16 @@ OPT += -DGAUSS_HI_PRECISION #OPT += -DNVIDIA # use CUDA for GPUs -OPT += -DCUDACC +#OPT += -DCUDACC # use GPU acceleration via OMP #OPT += -DACCOMP # perform stacking on GPUs -OPT += -DGPU_STACKING +#OPT += -DGPU_STACKING # use NVIDIA GPU to perform the reduce -OPT += -DNCCL_REDUCE +#OPT += -DNCCL_REDUCE # use GPU to perform FFT #OPT += -DCUFFTMP diff --git a/allvars.c b/allvars.c index 0f43b70840905185b5938cc5512c0432ad99de5f..89ee345e28b0a8befaf042b4a40b10847a060282 100755 --- a/allvars.c +++ b/allvars.c @@ -18,12 +18,12 @@ char datapath[LONGNAME_LEN]; int xaxis, yaxis; int rank; int size; -uint nsectors; -uint startrow; +myuint nsectors; +myuint startrow; double resolution, dx, dw, w_supporth; -uint **sectorarray = NULL; -uint *histo_send = NULL; +myuint **sectorarray = NULL; +myuint *histo_send = NULL; int verbose_level = 0; timing_t timing_wt; @@ -31,12 +31,8 @@ double reduce_mpi_time; double reduce_shmem_time; -uint size_of_grid; +myuint size_of_grid; double *grid_pointers = NULL, *grid, *gridss, *gridss_real, *gridss_img, *gridss_w, *grid_gpu, *gridss_gpu; MPI_Comm MYMPI_COMM_WORLD; MPI_Win slabwin; - - - - diff --git a/allvars.h b/allvars.h index c434035ba89fcfe91c553c624c46e172854cb988..889403e210e0fd608f88dc8bc51ae342ceb8f887 100755 --- a/allvars.h +++ b/allvars.h @@ -73,8 +73,8 @@ typedef double float_t; typedef float float_t; #endif -typedef unsigned int uint; -typedef unsigned long long ull; +typedef unsigned int myuint; +typedef unsigned long long myull; extern struct io @@ -104,6 +104,8 @@ extern struct op char outfile2[NAME_LEN]; char outfile3[NAME_LEN]; char fftfile[NAME_LEN]; + char gridded_writedata1[NAME_LEN]; + char gridded_writedata2[NAME_LEN]; char fftfile_writedata1[NAME_LEN]; char fftfile_writedata2[NAME_LEN]; char fftfile2[NAME_LEN]; @@ -111,21 +113,21 @@ extern struct op char logfile[NAME_LEN]; char extension[NAME_LEN]; char timingfile[NAME_LEN]; - + } out, outparam; extern struct meta { - unsigned long Nmeasures; - unsigned long Nvis; - uint Nweights; - uint freq_per_chan; - uint polarisations; - uint Ntimes; + myuint Nmeasures; + myull Nvis; + myuint Nweights; + myuint freq_per_chan; + myuint polarisations; + myuint Ntimes; double dt; double thours; - uint baselines; + myuint baselines; double uvmin; double uvmax; double wmin; @@ -161,16 +163,16 @@ extern char datapath[LONGNAME_LEN]; extern int xaxis, yaxis; extern int rank; extern int size; -extern uint nsectors; -extern uint startrow; +extern myuint nsectors; +extern myuint startrow; extern double_t resolution, dx, dw, w_supporth; -extern uint **sectorarray; -extern uint *histo_send; +extern myuint **sectorarray; +extern myuint *histo_send; extern int verbose_level; -extern uint size_of_grid; +extern myuint size_of_grid; extern double_t *grid_pointers, *grid, *gridss, *gridss_real, *gridss_img, *gridss_w, *grid_gpu, *gridss_gpu; extern MPI_Comm MYMPI_COMM_WORLD; diff --git a/allvars_nccl.h b/allvars_nccl.h index 225968efd31c13bfbb6dd2219d19ca1117819e2e..bab1ebdea151972f87e6b1e9fa0ee91c9cc30118 100755 --- a/allvars_nccl.h +++ b/allvars_nccl.h @@ -71,8 
+71,8 @@ typedef double float_t; typedef float float_t; #endif -typedef unsigned int uint; -typedef unsigned long long ull; +typedef unsigned int myuint; +typedef unsigned long long myull; extern struct io @@ -102,6 +102,8 @@ extern struct op char outfile2[NAME_LEN]; char outfile3[NAME_LEN]; char fftfile[NAME_LEN]; + char gridded_writedata1[NAME_LEN]; + char gridded_writedata2[NAME_LEN]; char fftfile_writedata1[NAME_LEN]; char fftfile_writedata2[NAME_LEN]; char fftfile2[NAME_LEN]; @@ -109,21 +111,21 @@ extern struct op char logfile[NAME_LEN]; char extension[NAME_LEN]; char timingfile[NAME_LEN]; - + } out, outparam; extern struct meta { - unsigned long Nmeasures; - unsigned long Nvis; - uint Nweights; - uint freq_per_chan; - uint polarisations; - uint Ntimes; + myuint Nmeasures; + myull Nvis; + myuint Nweights; + myuint freq_per_chan; + myuint polarisations; + myuint Ntimes; double dt; double thours; - uint baselines; + myuint baselines; double uvmin; double uvmax; double wmin; @@ -159,16 +161,16 @@ extern char datapath[LONGNAME_LEN]; extern int xaxis, yaxis; extern int rank; extern int size; -extern uint nsectors; -extern uint startrow; +extern myuint nsectors; +extern myuint startrow; extern double_t resolution, dx, dw, w_supporth; -extern uint **sectorarray; -extern uint *histo_send; +extern myuint **sectorarray; +extern myuint *histo_send; extern int verbose_level; -extern uint size_of_grid; +extern myuint size_of_grid; extern double_t *grid_pointers, *grid, *gridss, *gridss_real, *gridss_img, *gridss_w, *grid_gpu, *gridss_gpu; extern MPI_Comm MYMPI_COMM_WORLD; diff --git a/allvars_rccl.h b/allvars_rccl.h index 6e09fcae29beb32ac26369abbad053b426fb5a16..5e9691effa40db742c0422d5bfa1a09e4ae67e2c 100755 --- a/allvars_rccl.h +++ b/allvars_rccl.h @@ -74,7 +74,7 @@ typedef double float_ty; typedef float float_ty; #endif -typedef unsigned int uint; +typedef unsigned int myuint; typedef unsigned long long ull; @@ -105,6 +105,8 @@ extern struct op char outfile2[NAME_LEN]; char outfile3[NAME_LEN]; char fftfile[NAME_LEN]; + char gridded_writedata1[NAME_LEN]; + char gridded_writedata2[NAME_LEN]; char fftfile_writedata1[NAME_LEN]; char fftfile_writedata2[NAME_LEN]; char fftfile2[NAME_LEN]; @@ -112,21 +114,21 @@ extern struct op char logfile[NAME_LEN]; char extension[NAME_LEN]; char timingfile[NAME_LEN]; - + } out, outparam; extern struct meta { - uint Nmeasures; - uint Nvis; - uint Nweights; - uint freq_per_chan; - uint polarisations; - uint Ntimes; + myuint Nmeasures; + myuint Nvis; + myuint Nweights; + myuint freq_per_chan; + myuint polarisations; + myuint Ntimes; double dt; double thours; - uint baselines; + myuint baselines; double uvmin; double uvmax; double wmin; @@ -162,16 +164,16 @@ extern char datapath[LONGNAME_LEN]; extern int xaxis, yaxis; extern int rank; extern int size; -extern uint nsectors; -extern uint startrow; +extern myuint nsectors; +extern myuint startrow; extern double_ty resolution, dx, dw, w_supporth; -extern uint **sectorarray; -extern uint *histo_send; +extern myuint **sectorarray; +extern myuint *histo_send; extern int verbose_level; -extern uint size_of_grid; +extern myuint size_of_grid; extern double_ty *grid_pointers, *grid, *gridss, *gridss_real, *gridss_img, *gridss_w; extern MPI_Comm MYMPI_COMM_WORLD; diff --git a/cuda_fft.cu b/cuda_fft.cu index 3a28d999365d823faa6efcc8179e3ebc49555f00..c23f8302fb0ef7b90201471fc69e38c8ca3a5c4c 100755 --- a/cuda_fft.cu +++ b/cuda_fft.cu @@ -96,7 +96,7 @@ void cuda_fft( if (mmm != cudaSuccess) {printf("!!! 
cuda_fft.cu cudaMalloc ERROR %d !!!\n", mmm);} int Nth = 32; - unsigned int Nbl = (unsigned int)((yaxis*xaxis)/Nth + 1); + myuint Nbl = (myuint)((yaxis*xaxis)/Nth + 1); // Plan creation diff --git a/data/paramfile.txt b/data/paramfile.txt index 3b6991582618a1fd2d0d955130338c36e3134b91..3e509ce790a23d4c57ccdcd7914350ab201eac70 100755 --- a/data/paramfile.txt +++ b/data/paramfile.txt @@ -1,13 +1,13 @@ -ndatasets 1 + ndatasets 1 Datapath1 ./data/tail01_L720378_SB001_uv_12DFF03B0t_121MHz_12DFF03BFt_143MHz_120ch_flag.binMS/ #Datapath2 /beegfs/lofar/cgheller/L798046_SB244_uv.uncorr_130B27932t_123MHz.pre-cal.binMS/ #Datapath3 /beegfs/lofar/cgheller/L798046_SB244_uv.uncorr_130B27932t_125MHz.pre-cal.binMS/ num_threads 1 w_support 7 reduce_method 0 -grid_size_x 16384 -grid_size_y 16384 -num_w_planes 32 +grid_size_x 4096 +grid_size_y 4096 +num_w_planes 4 ufile ucoord.bin vfile vcoord.bin wfile wcoord.bin @@ -20,6 +20,8 @@ outfile1 coords.txt outfile2 grid_real.bin outfile3 grid_img.bin fftfile fft.txt +gridded_writedata1 gridded_data_real.bin +gridded_writedata2 gridded_data_img.bin fftfile_writedata1 ffted_real.bin fftfile_writedata2 ffted_img.bin fftfile2 fft_real.bin diff --git a/fourier_transform.c b/fourier_transform.c index 088889c30b68f4d06115e5b24e4a6d9ede118252..0b432aa81607d2c4e60a1bc8c31dfefb20c6c68b 100755 --- a/fourier_transform.c +++ b/fourier_transform.c @@ -35,8 +35,8 @@ void fftw_data ( void ) fftwgrid = fftw_alloc_complex(alloc_local); plan = fftw_mpi_plan_dft_2d(param.grid_size_y, param.grid_size_x, fftwgrid, fftwgrid, MPI_COMM_WORLD, FFTW_BACKWARD, FFTW_ESTIMATE); - uint fftwindex = 0; - uint fftwindex2D = 0; + myuint fftwindex = 0; + myuint fftwindex2D = 0; for (int iw=0; iw<param.num_w_planes; iw++) { //printf("FFTing plan %d\n",iw); @@ -147,7 +147,7 @@ void write_fftw_data(){ MPI_Win_lock(MPI_LOCK_SHARED,isector,0,writewin); MPI_Get(gridss_w,size_of_grid,MPI_DOUBLE,isector,0,size_of_grid,MPI_DOUBLE,writewin); MPI_Win_unlock(isector,writewin); - for (uint i=0; i<size_of_grid/2; i++) + for (myuint i=0; i<size_of_grid/2; i++) { gridss_real[i] = gridss_w[2*i]; gridss_img[i] = gridss_w[2*i+1]; @@ -158,8 +158,8 @@ void write_fftw_data(){ for (int iv=0; iv<yaxis; iv++) for (int iu=0; iu<xaxis; iu++) { - uint global_index = (iu + (iv+isector*yaxis)*xaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double); - uint index = iu + iv*xaxis + iw*xaxis*yaxis; + myuint global_index = (iu + (iv+isector*yaxis)*xaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double); + myuint index = iu + iv*xaxis + iw*xaxis*yaxis; fseek(file.pFilereal, global_index, SEEK_SET); fwrite(&gridss_real[index], 1, sizeof(double), file.pFilereal); } @@ -167,8 +167,8 @@ void write_fftw_data(){ for (int iv=0; iv<yaxis; iv++) for (int iu=0; iu<xaxis; iu++) { - uint global_index = (iu + (iv+isector*yaxis)*xaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double); - uint index = iu + iv*xaxis + iw*xaxis*yaxis; + myuint global_index = (iu + (iv+isector*yaxis)*xaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double); + myuint index = iu + iv*xaxis + iw*xaxis*yaxis; fseek(file.pFileimg, global_index, SEEK_SET); fwrite(&gridss_img[index], 1, sizeof(double), file.pFileimg); } @@ -186,7 +186,7 @@ void write_fftw_data(){ for (int iu=0; iu<grid_size_x; iu++) { int isector = 0; - uint index = 2*(iu + iv*grid_size_x + iw*grid_size_x*grid_size_y); + myuint index = 2*(iu + iv*grid_size_x + iw*grid_size_x*grid_size_y); double v_norm = sqrt(gridtot[index]*gridtot[index]+gridtot[index+1]*gridtot[index+1]); fprintf 
(file.pFile, "%d %d %d %f %f %f\n", iu,iv,iw,gridtot[index],gridtot[index+1],v_norm); } @@ -255,8 +255,8 @@ void write_fftw_data(){ if(rank == 0)printf("WRITING IMAGE\n"); #ifdef FITSIO - uint * fpixel = (uint *) malloc(sizeof(uint)*naxis); - uint * lpixel = (uint *) malloc(sizeof(uint)*naxis); + myuint * fpixel = (myuint *) malloc(sizeof(myuint)*naxis); + myuint * lpixel = (myuint *) malloc(sizeof(myuint)*naxis); #endif #ifdef FITSIO diff --git a/gridding.c b/gridding.c index 10cf812837ad873e1f36ab6b11885f9173ae96e8..8e40b48a28ebe6f6336ea6ec2aaa88ccaef60b44 100755 --- a/gridding.c +++ b/gridding.c @@ -4,7 +4,7 @@ -void free_array ( uint *, uint **, int ); +void free_array ( myuint *, myuint **, int ); void initialize_array ( void ); void gridding_data ( void ); @@ -38,7 +38,7 @@ void gridding() cmp_t mygetmax = { 0 }; #pragma omp for - for (uint inorm=0; inorm<metaData.Nmeasures; inorm++) + for (myuint inorm=0; inorm<metaData.Nmeasures; inorm++) { mygetmin.u = MIN(mygetmin.u, data.uu[inorm]); mygetmin.v = MIN(mygetmin.v, data.vv[inorm]); @@ -74,7 +74,7 @@ void gridding() maxg = maxg + offset*maxg; #pragma omp parallel for num_threads(param.num_threads) - for (uint inorm=0; inorm < metaData.Nmeasures; inorm++) + for (myuint inorm=0; inorm < metaData.Nmeasures; inorm++) { data.uu[inorm] = (data.uu[inorm]+maxg) / (2.0*maxg); data.vv[inorm] = (data.vv[inorm]+maxg) / (2.0*maxg); @@ -115,14 +115,14 @@ void gridding() // ..................................................................... // -void free_array( uint *histo_send, uint **sectorarrays, int nsectors ) +void free_array( myuint *histo_send, myuint **sectorarrays, int nsectors ) // // releases memory allocated for gridding and reduce ops // { - for ( uint i = nsectors-1; i > 0; i-- ) + for ( myuint i = nsectors-1; i > 0; i-- ) free( sectorarrays[i] ); free( sectorarrays ); @@ -144,9 +144,9 @@ void initialize_array() { - histo_send = (uint*) calloc(nsectors+1, sizeof(uint)); + histo_send = (myuint*) calloc(nsectors+1, sizeof(myuint)); - for (uint iphi = 0; iphi < metaData.Nmeasures; iphi++) + for (myuint iphi = 0; iphi < metaData.Nmeasures; iphi++) { double vvh = data.vv[iphi]; //less or equal to 0.6 int binphi = (int)(vvh*nsectors); //has values expect 0 and nsectors-1. @@ -164,15 +164,15 @@ void initialize_array() histo_send[binphi-1]++; } - sectorarray = (uint**)malloc ((nsectors+1) * sizeof(uint*)); - uint *counter = (uint*) calloc (nsectors+1, sizeof(uint)); - for(uint sec=0; sec<(nsectors+1); sec++) + sectorarray = (myuint**)malloc ((nsectors+1) * sizeof(myuint*)); + myuint *counter = (myuint*) calloc (nsectors+1, sizeof(myuint)); + for(myuint sec=0; sec<(nsectors+1); sec++) { - sectorarray[sec] = (uint*)malloc(histo_send[sec]*sizeof(uint)); + sectorarray[sec] = (myuint*)malloc(histo_send[sec]*sizeof(myuint)); } - for (uint iphi = 0; iphi < metaData.Nmeasures; iphi++) + for (myuint iphi = 0; iphi < metaData.Nmeasures; iphi++) { double vvh = data.vv[iphi]; int binphi = (int)(vvh*nsectors); @@ -208,15 +208,18 @@ void write_gridded_data() { #ifdef WRITE_DATA - /* -#error "This routine has not yet been harmonized with new stuf !!" 
+ // Write gridded results + MPI_Win slabwin; + MPI_Win_create(gridss, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin); + MPI_Win_fence(0,slabwin); + if (rank == 0) { printf("WRITING GRIDDED DATA\n"); - file.pFilereal = fopen (out.outfile2,"wb"); - file.pFileimg = fopen (out.outfile3,"wb"); + file.pFilereal = fopen (out.gridded_writedata1,"wb"); + file.pFileimg = fopen (out.gridded_writedata2,"wb"); for (int isector=0; isector<nsectors; isector++) { @@ -227,7 +230,7 @@ void write_gridded_data() MPI_Get(gridss,size_of_grid,MPI_DOUBLE,isector,0,size_of_grid,MPI_DOUBLE,slabwin); MPI_Win_unlock(isector,slabwin); #endif - for (uint i=0; i<size_of_grid/2; i++) + for (myuint i=0; i<size_of_grid/2; i++) { gridss_real[i] = gridss[2*i]; gridss_img[i] = gridss[2*i+1]; @@ -238,8 +241,8 @@ void write_gridded_data() for (int iv=0; iv<yaxis; iv++) for (int iu=0; iu<xaxis; iu++) { - uint global_index = (iu + (iv+isector*yaxis)*xaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double); - uint index = iu + iv*xaxis + iw*xaxis*yaxis; + myuint global_index = (iu + (iv+isector*yaxis)*xaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double); + myuint index = iu + iv*xaxis + iw*xaxis*yaxis; fseek(file.pFilereal, global_index, SEEK_SET); fwrite(&gridss_real[index], 1, sizeof(double), file.pFilereal); } @@ -247,8 +250,8 @@ void write_gridded_data() for (int iv=0; iv<yaxis; iv++) for (int iu=0; iu<xaxis; iu++) { - uint global_index = (iu + (iv+isector*yaxis)*xaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double); - uint index = iu + iv*xaxis + iw*xaxis*yaxis; + myuint global_index = (iu + (iv+isector*yaxis)*xaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double); + myuint index = iu + iv*xaxis + iw*xaxis*yaxis; fseek(file.pFileimg, global_index, SEEK_SET); fwrite(&gridss_img[index], 1, sizeof(double), file.pFileimg); //double v_norm = sqrt(gridss[index]*gridss[index]+gridss[index+1]*gridss[index+1]); @@ -260,8 +263,8 @@ void write_gridded_data() { for (int iw=0; iw<param.num_w_planes; iw++) { - uint global_index = (xaxis*isector*yaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double); - uint index = iw*xaxis*yaxis; + myuint global_index = (xaxis*isector*yaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double); + myuint index = iw*xaxis*yaxis; fseek(file.pFilereal, global_index, SEEK_SET); fwrite(&gridss_real[index], xaxis*yaxis, sizeof(double), file.pFilereal); fseek(file.pFileimg, global_index, SEEK_SET); @@ -269,13 +272,12 @@ void write_gridded_data() } } } - fclose(file.pFilereal); fclose(file.pFileimg); } MPI_Win_fence(0,slabwin); - */ + MPI_Win_free(&slabwin); + MPI_Barrier(MPI_COMM_WORLD); #endif //WRITE_DATA - } diff --git a/gridding_cpu.c b/gridding_cpu.c index dab71a93bfa377a2ab4c01ec8460d3224aae5b4b..29cc9408cf6a69930abf77e0bd503aa3ed39632f 100755 --- a/gridding_cpu.c +++ b/gridding_cpu.c @@ -85,14 +85,14 @@ void gridding_data() // find the largest value in histo_send[] // - for (uint isector = 0; isector < nsectors; isector++) + for (myuint isector = 0; isector < nsectors; isector++) { double start = CPU_TIME_wt; - uint Nsec = histo_send[isector]; - uint Nweightss = Nsec*metaData.polarisations; - uint Nvissec = Nweightss*metaData.freq_per_chan; + myuint Nsec = histo_send[isector]; + myuint Nweightss = Nsec*metaData.polarisations; + myull Nvissec = Nweightss*metaData.freq_per_chan; double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t) ); @@ -109,26 +109,26 @@ void gridding_data() 
// select data for this sector - uint icount = 0; - uint ip = 0; - uint inu = 0; + myuint icount = 0; + myuint ip = 0; + myuint inu = 0; #warning "this loop should be threaded" #warning "the counter of this loop should not be int" for( int iphi = histo_send[isector]-1; iphi >=0 ; iphi--) { - uint ilocal = sectorarray[isector][iphi]; + myuint ilocal = sectorarray[isector][iphi]; uus[icount] = data.uu[ilocal]; vvs[icount] = data.vv[ilocal]-isector*shift; wws[icount] = data.ww[ilocal]; - for (uint ipol=0; ipol<metaData.polarisations; ipol++) + for (myuint ipol=0; ipol<metaData.polarisations; ipol++) { weightss[ip] = data.weights[ilocal*metaData.polarisations+ipol]; ip++; } - for (uint ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++) + for (myuint ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++) { visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; visimgs[inu] = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; @@ -150,7 +150,7 @@ void gridding_data() double my_vvmax = -1e20; #pragma omp for - for (uint ipart=0; ipart<Nsec; ipart++) + for (myuint ipart=0; ipart<Nsec; ipart++) { my_uumin = MIN(my_uumin, uus[ipart]); my_uumax = MAX(my_uumax, uus[ipart]); diff --git a/gridding_nccl.cu b/gridding_nccl.cu index 36b1d2cd4322b12c77efc4d59ac90ee144b4440f..8c0fd0077a0c66c1f53a2f46c48d930eac92810e 100755 --- a/gridding_nccl.cu +++ b/gridding_nccl.cu @@ -111,14 +111,14 @@ void gridding_data(){ ncclCommInitRank(&comm, size, id, rank); - for (uint isector = 0; isector < nsectors; isector++) + for (myuint isector = 0; isector < nsectors; isector++) { double start = CPU_TIME_wt; - uint Nsec = histo_send[isector]; - uint Nweightss = Nsec*metaData.polarisations; - uint Nvissec = Nweightss*metaData.freq_per_chan; + myuint Nsec = histo_send[isector]; + myuint Nweightss = Nsec*metaData.polarisations; + myull Nvissec = Nweightss*metaData.freq_per_chan; double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t) ); @@ -135,26 +135,26 @@ void gridding_data(){ // select data for this sector - uint icount = 0; - uint ip = 0; - uint inu = 0; + myuint icount = 0; + myuint ip = 0; + myuint inu = 0; #warning "this loop should be threaded" #warning "the counter of this loop should not be int" for(int iphi = histo_send[isector]-1; iphi>=0; iphi--) { - uint ilocal = sectorarray[isector][iphi]; + myuint ilocal = sectorarray[isector][iphi]; uus[icount] = data.uu[ilocal]; vvs[icount] = data.vv[ilocal]-isector*shift; wws[icount] = data.ww[ilocal]; - for (uint ipol=0; ipol<metaData.polarisations; ipol++) + for (myuint ipol=0; ipol<metaData.polarisations; ipol++) { weightss[ip] = data.weights[ilocal*metaData.polarisations+ipol]; ip++; } - for (uint ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++) + for (myuint ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++) { visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; visimgs[inu] = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; @@ -177,7 +177,7 @@ void gridding_data(){ double my_vvmax = -1e20; #pragma omp for - for (uint ipart=0; ipart<Nsec; ipart++) + for (myuint ipart=0; ipart<Nsec; ipart++) { my_uumin = MIN(my_uumin, uus[ipart]); my_uumax = MAX(my_uumax, uus[ipart]); diff --git a/gridding_rccl.cpp b/gridding_rccl.cpp deleted file mode 100644 index 8d0efd5e1bb500eb3551199f931873bcef2b0ced..0000000000000000000000000000000000000000 
--- a/gridding_rccl.cpp +++ /dev/null @@ -1,273 +0,0 @@ -#include "allvars_rccl.h" -#include "proto.h" -#include <hip/hip_runtime.h> -#include <rccl.h> - -/* - * Implements the gridding of data via GPU - * by using NCCL library - * - */ - - -#if defined( RCCL_REDUCE ) - -/* -#define NCCLCHECK(cmd) do { -ncclResult_t r = cmd; -if (r!= ncclSuccess) { - printf("Failed, NCCL error %s:%d '%s'\n", - __FILE__,__LINE__,ncclGetErrorString(r)); - exit(EXIT_FAILURE); - } -} while(0) -*/ - -static uint64_t getHostHash(const char* string) { - // Based on DJB2a, result = result * 33 ^ char - uint64_t result = 5381; - for (int c = 0; string[c] != '\0'; c++){ - result = ((result << 5) + result) ^ string[c]; - } - return result; -} - - -static void getHostName(char* hostname, int maxlen) { - gethostname(hostname, maxlen); - for (int i=0; i< maxlen; i++) { - if (hostname[i] == '.') { - hostname[i] = '\0'; - return; - } - } -} - - - - -void gridding_data(){ - - double shift = (double)(dx*yaxis); - - timing_wt.kernel = 0.0; - timing_wt.reduce = 0.0; - timing_wt.reduce_mpi = 0.0; - timing_wt.reduce_sh = 0.0; - timing_wt.compose = 0.0; - - // calculate the resolution in radians - resolution = 1.0/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax)); - - // calculate the resolution in arcsec - double resolution_asec = (3600.0*180.0)/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax))/PI; - if ( rank == 0 ) - printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec); - - - - //Initialize nccl - - double * grid_gpu, *gridss_gpu; - int local_rank = 0; - - uint64_t hostHashs[size]; - char hostname[1024]; - getHostName(hostname, 1024); - hostHashs[rank] = getHostHash(hostname); - MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD); - for (int p=0; p<size; p++) { - if (p == rank) break; - if (hostHashs[p] == hostHashs[rank]) local_rank++; - } - - ncclUniqueId id; - ncclComm_t comm; - hipStream_t stream_reduce; - - if (rank == 0) ncclGetUniqueId(&id); - MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); - - hipSetDevice(local_rank); - - hipMalloc(&grid_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double)); - hipMalloc(&gridss_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double)); - hipStreamCreate(&stream_reduce); - - ncclCommInitRank(&comm, size, id, rank); - - for (uint isector = 0; isector < nsectors; isector++) - { - - double start = CPU_TIME_wt; - - uint Nsec = histo_send[isector]; - uint Nweightss = Nsec*metaData.polarisations; - uint Nvissec = Nweightss*metaData.freq_per_chan; - double_ty *memory = (double*) malloc ( (Nsec*3)*sizeof(double_ty) + - (Nvissec*2+Nweightss)*sizeof(float_ty) ); - - if ( memory == NULL ) - shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__); - - double_ty *uus = (double_ty*) memory; - double_ty *vvs = (double_ty*) uus+Nsec; - double_ty *wws = (double_ty*) vvs+Nsec; - float_ty *weightss = (float_ty*)((double_t*)wws+Nsec); - float_ty *visreals = (float_ty*)weightss + Nweightss; - float_ty *visimgs = (float_ty*)visreals + Nvissec; - - - - // select data for this sector - uint icount = 0; - uint ip = 0; - uint inu = 0; - - #warning "this loop should be threaded" - #warning "the counter of this loop should not be int" - for(int iphi = histo_send[isector]-1; iphi>=0; iphi--) - { - - uint ilocal = sectorarray[isector][iphi]; - - uus[icount] = data.uu[ilocal]; - vvs[icount] = data.vv[ilocal]-isector*shift; - wws[icount] = data.ww[ilocal]; - for (uint ipol=0; 
ipol<metaData.polarisations; ipol++) - { - weightss[ip] = data.weights[ilocal*metaData.polarisations+ipol]; - ip++; - } - for (uint ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++) - { - visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; - visimgs[inu] = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; - - inu++; - } - icount++; - } - - double uumin = 1e20; - double vvmin = 1e20; - double uumax = -1e20; - double vvmax = -1e20; - - #pragma omp parallel reduction( min: uumin, vvmin) reduction( max: uumax, vvmax) num_threads(param.num_threads) - { - double my_uumin = 1e20; - double my_vvmin = 1e20; - double my_uumax = -1e20; - double my_vvmax = -1e20; - - #pragma omp for - for (uint ipart=0; ipart<Nsec; ipart++) - { - my_uumin = MIN(my_uumin, uus[ipart]); - my_uumax = MAX(my_uumax, uus[ipart]); - my_vvmin = MIN(my_vvmin, vvs[ipart]); - my_vvmax = MAX(my_vvmax, vvs[ipart]); - } - - uumin = MIN( uumin, my_uumin ); - uumax = MAX( uumax, my_uumax ); - vvmin = MIN( vvmin, my_vvmin ); - vvmax = MAX( vvmax, my_vvmax ); - } - - timing_wt.compose += CPU_TIME_wt - start; - - //printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax); - - // Make convolution on the grid - - #ifdef VERBOSE - printf("Processing sector %ld\n",isector); - #endif - - start = CPU_TIME_wt; - - double *stacking_target_array; - if ( size > 1 ) - stacking_target_array = gridss; - else - stacking_target_array = grid; - - //We have to call different GPUs per MPI task!!! [GL] - wstack(param.num_w_planes, - Nsec, - metaData.freq_per_chan, - metaData.polarisations, - uus, - vvs, - wws, - visreals, - visimgs, - weightss, - dx, - dw, - param.w_support, - xaxis, - yaxis, - stacking_target_array, - param.num_threads, - rank); - //Allocate memory on devices non-blocking for the host - /////////////////////////////////////////////////////// - - - timing_wt.kernel += CPU_TIME_wt - start; - - hipMemcpyAsync(gridss_gpu, gridss, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), hipMemcpyHostToDevice, stream_reduce); - - #ifdef VERBOSE - printf("Processed sector %ld\n",isector); - #endif - - - if( size > 1 ) - { - - // Write grid in the corresponding remote slab - - // int target_rank = (int)isector; it implied that size >= nsectors - int target_rank = (int)(isector % size); - - - hipStreamSynchronize(stream_reduce); - - start = CPU_TIME_wt; - - ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce); - hipStreamSynchronize(stream_reduce); - - timing_wt.reduce += CPU_TIME_wt - start; - - - // Go to next sector - memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) ); - } - - free(memory); - } - - //Copy data back from device to host (to be deleted in next steps) - - hipMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), hipMemcpyDeviceToHost, stream_reduce); - - MPI_Barrier(MPI_COMM_WORLD); - - hipStreamSynchronize(stream_reduce); - hipFree(gridss_gpu); - hipFree(grid_gpu); - - hipStreamDestroy(stream_reduce); - - ncclCommDestroy(comm); - - return; - -} - -#endif diff --git a/gridding_rccl.hip.cpp b/gridding_rccl.hip.cpp index e54fa4d6d0a28f83d4c42716d50036466541eedb..19b93c8cb2f9cee3991c7c3d5aa0fd92483e8e0d 100755 --- a/gridding_rccl.hip.cpp +++ b/gridding_rccl.hip.cpp @@ -101,14 +101,14 @@ void gridding_data(){ ncclCommInitRank(&comm, size, id, rank); - for (uint isector = 0; isector < nsectors; isector++) + for (myuint isector = 0; isector < nsectors; 
isector++) { double start = CPU_TIME_wt; - uint Nsec = histo_send[isector]; - uint Nweightss = Nsec*metaData.polarisations; - uint Nvissec = Nweightss*metaData.freq_per_chan; + myuint Nsec = histo_send[isector]; + myuint Nweightss = Nsec*metaData.polarisations; + myull Nvissec = Nweightss*metaData.freq_per_chan; double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t) ); @@ -125,26 +125,26 @@ void gridding_data(){ // select data for this sector - uint icount = 0; - uint ip = 0; - uint inu = 0; + myuint icount = 0; + myuint ip = 0; + myuint inu = 0; #warning "this loop should be threaded" #warning "the counter of this loop should not be int" for(int iphi = histo_send[isector]-1; iphi>=0; iphi--) { - uint ilocal = sectorarray[isector][iphi]; + myuint ilocal = sectorarray[isector][iphi]; uus[icount] = data.uu[ilocal]; vvs[icount] = data.vv[ilocal]-isector*shift; wws[icount] = data.ww[ilocal]; - for (uint ipol=0; ipol<metaData.polarisations; ipol++) + for (myuint ipol=0; ipol<metaData.polarisations; ipol++) { weightss[ip] = data.weights[ilocal*metaData.polarisations+ipol]; ip++; } - for (uint ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++) + for (myuint ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++) { visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; visimgs[inu] = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; @@ -167,7 +167,7 @@ void gridding_data(){ double my_vvmax = -1e20; #pragma omp for - for (uint ipart=0; ipart<Nsec; ipart++) + for (myuint ipart=0; ipart<Nsec; ipart++) { my_uumin = MIN(my_uumin, uus[ipart]); my_uumax = MAX(my_uumax, uus[ipart]); diff --git a/init.c b/init.c index 76566bbe6eaca15c81a274a5d3e9ca08d63f057f..79a7c5d514f43309d1b60c6ba2114ec32322713b 100755 --- a/init.c +++ b/init.c @@ -83,7 +83,15 @@ void op_filename() { strcpy(buf, num_buf); strcat(buf, outparam.fftfile); strcpy(out.fftfile, buf); -#ifdef WRITE_DATA +#ifdef WRITE_DATA + strcpy(buf, num_buf); + strcat(buf, outparam.gridded_writedata1); + strcpy(out.gridded_writedata1, buf); + + strcpy(buf, num_buf); + strcat(buf, outparam.gridded_writedata2); + strcpy(out.gridded_writedata2, buf); + strcpy(buf, num_buf); strcat(buf, outparam.fftfile_writedata1); strcpy(out.fftfile_writedata1, buf); @@ -213,6 +221,14 @@ void read_parameter_file(char *fname) strcpy(outparam.fftfile, buf2); } #ifdef WRITE_DATA + if(strcmp(buf1, "gridded_writedata1") == 0) + { + strcpy(outparam.gridded_writedata1, buf2); + } + if(strcmp(buf1, "gridded_writedata2") == 0) + { + strcpy(outparam.gridded_writedata2, buf2); + } if(strcmp(buf1, "fftfile_writedata1") == 0) { strcpy(outparam.fftfile_writedata1, buf2); @@ -334,15 +350,15 @@ void metaData_calculation() { } // Set temporary local size of points - uint nm_pe = (uint)(metaData.Nmeasures/size); - uint remaining = metaData.Nmeasures%size; + myuint nm_pe = (myuint)(metaData.Nmeasures/size); + myuint remaining = metaData.Nmeasures%size; startrow = rank*nm_pe; if (rank == size-1)nm_pe = nm_pe+remaining; - uint Nmeasures_tot = metaData.Nmeasures; + //myuint Nmeasures_tot = metaData.Nmeasures; metaData.Nmeasures = nm_pe; - unsigned long Nvis_tot = metaData.Nvis; + //unsigned long Nvis_tot = metaData.Nvis; metaData.Nvis = metaData.Nmeasures*metaData.freq_per_chan*metaData.polarisations; metaData.Nweights = metaData.Nmeasures*metaData.polarisations; diff --git a/main.c b/main.c index 
eece36980e15f57753c2fe9d4eaaba92c7d8634c..63540cd88be41a225d8779803b0c579119bd783d 100755 --- a/main.c +++ b/main.c @@ -107,8 +107,8 @@ int main(int argc, char * argv[]) char testfitsreal[NAME_LEN] = "parallel_real.fits"; char testfitsimag[NAME_LEN] = "parallel_img.fits"; - uint naxis = 2; - uint naxes[2] = { grid_size_x, grid_size_y }; + myuint naxis = 2; + myuint naxes[2] = { grid_size_x, grid_size_y }; #endif diff --git a/w-stacking.cu b/w-stacking.cu index 9aa0d6684489352a5f173330f08be64257ec39a2..d7f1c36690948a0103ef9251e2a24e97ce9475eb 100755 --- a/w-stacking.cu +++ b/w-stacking.cu @@ -103,9 +103,9 @@ void makeKaiserBesselKernel(double * kernel, __global__ void convolve_g( int num_w_planes, - uint num_points, - uint freq_per_chan, - uint polarizations, + myuint num_points, + myuint freq_per_chan, + myuint polarizations, double* uu, double* vv, double* ww, @@ -130,11 +130,11 @@ __global__ void convolve_g( { //printf("DENTRO AL KERNEL\n"); - uint gid = blockIdx.x*blockDim.x + threadIdx.x; + myuint gid = blockIdx.x*blockDim.x + threadIdx.x; if(gid < num_points) { - uint i = gid; - uint visindex = i*freq_per_chan*polarizations; + myuint i = gid; + unsigned long visindex = i*freq_per_chan*polarizations; double norm = std22/PI; int j, k; @@ -149,10 +149,10 @@ __global__ void convolve_g( int grid_v = (int)pos_v; // check the boundaries - uint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; - uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; - uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; - uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1; + myuint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; + myuint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; + myuint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; + myuint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1; // Convolve this point onto the grid. 
@@ -165,7 +165,7 @@ __global__ void convolve_g( for (j = jmin; j <= jmax; j++) { double u_dist = (double)j+0.5 - pos_u; - uint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); + myuint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); int jKer = (int)(increaseprecision * (fabs(u_dist+(double)KernelLen))); int kKer = (int)(increaseprecision * (fabs(v_dist+(double)KernelLen))); @@ -182,11 +182,11 @@ __global__ void convolve_g( // Loops over frequencies and polarizations double add_term_real = 0.0; double add_term_img = 0.0; - uint ifine = visindex; - for (uint ifreq=0; ifreq<freq_per_chan; ifreq++) + unsigned long ifine = visindex; + for (myuint ifreq=0; ifreq<freq_per_chan; ifreq++) { - uint iweight = visindex/freq_per_chan; - for (uint ipol=0; ipol<polarizations; ipol++) + myuint iweight = visindex/freq_per_chan; + for (myuint ipol=0; ipol<polarizations; ipol++) { double vistest = (double)vis_real[ifine]; if (!isnan(vistest)) @@ -209,10 +209,14 @@ __global__ void convolve_g( #pragma omp declare target #endif void wstack( +#ifdef __CUDACC__ long long unsigned num_w_planes, - uint num_points, - uint freq_per_chan, - uint polarizations, +#else + int num_w_planes, +#endif + myuint num_points, + myuint freq_per_chan, + myuint polarizations, double* uu, double* vv, double* ww, @@ -222,8 +226,13 @@ void wstack( double dx, double dw, int w_support, +#ifdef __CUDACC__ long long unsigned grid_size_x, long long unsigned grid_size_y, +#else + int grid_size_x, + int grid_size_y, +#endif double* grid, int num_threads, #ifdef NCCL_REDUCE @@ -234,9 +243,9 @@ void wstack( #endif ) { - uint i; - //uint index; - uint visindex; + myuint i; + //myuint index; + unsigned long visindex; // initialize the convolution kernel // gaussian: @@ -263,9 +272,9 @@ void wstack( #ifdef __CUDACC__ // Define the CUDA set up int Nth = NTHREADS; - uint Nbl = (uint)(num_points/Nth) + 1; + myuint Nbl = (myuint)(num_points/Nth) + 1; if(NWORKERS == 1) {Nbl = 1; Nth = 1;}; - uint Nvis = num_points*freq_per_chan*polarizations; + unsigned long Nvis = num_points*freq_per_chan*polarizations; int ndevices; cudaGetDeviceCount(&ndevices); @@ -432,7 +441,7 @@ void wstack( #if defined(ACCOMP) && (GPU_STACKING) omp_set_default_device(rank % omp_get_num_devices()); - uint Nvis = num_points*freq_per_chan*polarizations; + myuint Nvis = num_points*freq_per_chan*polarizations; #pragma omp target teams distribute parallel for private(visindex) map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom:grid[0:2*num_w_planes*grid_size_x*grid_size_y]) #else #pragma omp parallel for private(visindex) @@ -461,10 +470,10 @@ void wstack( int grid_v = (int)pos_v; // check the boundaries - uint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; - uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; - uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; - uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1; + myuint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; + myuint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; + myuint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; + myuint kmax = (grid_v < grid_size_y - KernelLen) ? 
grid_v + KernelLen : grid_size_y - 1; //printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax); @@ -479,7 +488,7 @@ void wstack( { double u_dist = (double)j+0.5 - pos_u; //double u_dist = (double)j - pos_u; - uint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); + myuint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); int jKer = (int)(increaseprecision * (fabs(u_dist+(double)KernelLen))); int kKer = (int)(increaseprecision * (fabs(v_dist+(double)KernelLen))); @@ -498,12 +507,12 @@ void wstack( // Loops over frequencies and polarizations double add_term_real = 0.0; double add_term_img = 0.0; - uint ifine = visindex; + unsigned long ifine = visindex; // DAV: the following two loops are performend by each thread separately: no problems of race conditions - for (uint ifreq=0; ifreq<freq_per_chan; ifreq++) + for (myuint ifreq=0; ifreq<freq_per_chan; ifreq++) { - uint iweight = visindex/freq_per_chan; - for (uint ipol=0; ipol<polarizations; ipol++) + myuint iweight = visindex/freq_per_chan; + for (myuint ipol=0; ipol<polarizations; ipol++) { if (!isnan(vis_real[ifine])) { diff --git a/w-stacking.hip.cpp b/w-stacking.hip.cpp index 5c376f3c7d8a6566d84593bbeafe7dd0162edfe9..bc5b7301b7f039b1bf6495b08e535757a6d5204a 100755 --- a/w-stacking.hip.cpp +++ b/w-stacking.hip.cpp @@ -99,9 +99,9 @@ void makeKaiserBesselKernel(double * kernel, __global__ void convolve_g( int num_w_planes, - uint num_points, - uint freq_per_chan, - uint polarizations, + myuint num_points, + myuint freq_per_chan, + myuint polarizations, double* uu, double* vv, double* ww, @@ -126,11 +126,11 @@ __global__ void convolve_g( { //printf("DENTRO AL KERNEL\n"); - uint gid = blockIdx.x*blockDim.x + threadIdx.x; + myuint gid = blockIdx.x*blockDim.x + threadIdx.x; if(gid < num_points) { - uint i = gid; - uint visindex = i*freq_per_chan*polarizations; + myuint i = gid; + myuint visindex = i*freq_per_chan*polarizations; double norm = std22/PI; int j, k; @@ -145,10 +145,10 @@ __global__ void convolve_g( int grid_v = (int)pos_v; // check the boundaries - uint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; - uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; - uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; - uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1; + myuint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; + myuint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; + myuint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; + myuint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1; // Convolve this point onto the grid. 
@@ -161,7 +161,7 @@ __global__ void convolve_g( for (j = jmin; j <= jmax; j++) { double u_dist = (double)j+0.5 - pos_u; - uint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); + myuint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); int jKer = (int)(increaseprecision * (fabs(u_dist+(double)KernelLen))); int kKer = (int)(increaseprecision * (fabs(v_dist+(double)KernelLen))); @@ -178,11 +178,11 @@ __global__ void convolve_g( // Loops over frequencies and polarizations double add_term_real = 0.0; double add_term_img = 0.0; - uint ifine = visindex; - for (uint ifreq=0; ifreq<freq_per_chan; ifreq++) + myuint ifine = visindex; + for (myuint ifreq=0; ifreq<freq_per_chan; ifreq++) { - uint iweight = visindex/freq_per_chan; - for (uint ipol=0; ipol<polarizations; ipol++) + myuint iweight = visindex/freq_per_chan; + for (myuint ipol=0; ipol<polarizations; ipol++) { double vistest = (double)vis_real[ifine]; if (!isnan(vistest)) @@ -206,9 +206,9 @@ __global__ void convolve_g( #endif void wstack( int num_w_planes, - uint num_points, - uint freq_per_chan, - uint polarizations, + myuint num_points, + myuint freq_per_chan, + myuint polarizations, double* uu, double* vv, double* ww, @@ -231,9 +231,9 @@ void wstack( ) { - uint i; - //uint index; - uint visindex; + myuint i; + //myuint index; + myuint visindex; // initialize the convolution kernel // gaussian: @@ -260,9 +260,9 @@ void wstack( #ifdef __HIPCC__ // Define the HIP set up int Nth = NTHREADS; - uint Nbl = (uint)(num_points/Nth) + 1; + myuint Nbl = (myuint)(num_points/Nth) + 1; if(NWORKERS == 1) {Nbl = 1; Nth = 1;}; - uint Nvis = num_points*freq_per_chan*polarizations; + myuint Nvis = num_points*freq_per_chan*polarizations; int ndevices; int num = hipGetDeviceCount(&ndevices); @@ -404,7 +404,7 @@ void wstack( #if defined(ACCOMP) && (GPU_STACKING) omp_set_default_device(rank % omp_get_num_devices()); - uint Nvis = num_points*freq_per_chan*polarizations; + myuint Nvis = num_points*freq_per_chan*polarizations; #pragma omp target teams distribute parallel for private(visindex) map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom:grid[0:2*num_w_planes*grid_size_x*grid_size_y]) #else #pragma omp parallel for private(visindex) @@ -433,10 +433,10 @@ void wstack( int grid_v = (int)pos_v; // check the boundaries - uint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; - uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; - uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; - uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1; + myuint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; + myuint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; + myuint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; + myuint kmax = (grid_v < grid_size_y - KernelLen) ? 
grid_v + KernelLen : grid_size_y - 1; //printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax); @@ -451,7 +451,7 @@ void wstack( { //double u_dist = (double)j+0.5 - pos_u; double u_dist = (double)j - pos_u; - uint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); + myuint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); int jKer = (int)(increaseprecision * (fabs(u_dist+(double)KernelLen))); int kKer = (int)(increaseprecision * (fabs(v_dist+(double)KernelLen))); @@ -470,12 +470,12 @@ void wstack( // Loops over frequencies and polarizations double add_term_real = 0.0; double add_term_img = 0.0; - uint ifine = visindex; + myuint ifine = visindex; // DAV: the following two loops are performend by each thread separately: no problems of race conditions - for (uint ifreq=0; ifreq<freq_per_chan; ifreq++) + for (myuint ifreq=0; ifreq<freq_per_chan; ifreq++) { - uint iweight = visindex/freq_per_chan; - for (uint ipol=0; ipol<polarizations; ipol++) + myuint iweight = visindex/freq_per_chan; + for (myuint ipol=0; ipol<polarizations; ipol++) { if (!isnan(vis_real[ifine])) {
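
Note on the write_gridded_data() changes above: the patch replaces the commented-out block with a working path that creates an MPI window over each rank's gridss slab, fences it, and lets rank 0 pull every remote slab with a shared lock plus MPI_Get before writing to the new gridded_writedata1/2 files. The following stand-alone C/MPI program is only a minimal sketch of that one-sided pattern, under the assumption of a dummy 8-element slab per rank; the buffer name, slab length, and printed output are placeholders and it is not the project's actual routine.

/* Minimal sketch of the one-sided MPI pattern used by the patched
 * write_gridded_data(): expose a local buffer in a window, fence,
 * then have rank 0 fetch each remote slab with a shared lock + MPI_Get.
 * Names and sizes are illustrative placeholders. */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* Each rank exposes its local slab (here just 8 doubles) in a window. */
    const int slab_len = 8;                      /* placeholder size */
    double *slab = (double *) malloc(slab_len * sizeof(double));
    for (int i = 0; i < slab_len; i++)
        slab[i] = rank * 100.0 + i;              /* dummy "gridded" values */

    MPI_Win win;
    MPI_Win_create(slab, slab_len * sizeof(double), sizeof(double),
                   MPI_INFO_NULL, MPI_COMM_WORLD, &win);
    MPI_Win_fence(0, win);                       /* every slab is now exposed */

    if (rank == 0)
    {
        /* Rank 0 pulls each remote slab; a real writer would fseek/fwrite here. */
        double *recv = (double *) malloc(slab_len * sizeof(double));
        for (int target = 0; target < size; target++)
        {
            MPI_Win_lock(MPI_LOCK_SHARED, target, 0, win);
            MPI_Get(recv, slab_len, MPI_DOUBLE,
                    target, 0, slab_len, MPI_DOUBLE, win);
            MPI_Win_unlock(target, win);
            printf("slab from rank %d starts with %f\n", target, recv[0]);
        }
        free(recv);
    }

    MPI_Win_fence(0, win);                       /* keep slabs alive until rank 0 is done */
    MPI_Win_free(&win);
    free(slab);
    MPI_Finalize();
    return 0;
}

The final fence before MPI_Win_free mirrors the patch's closing MPI_Win_fence/MPI_Barrier: it prevents any rank from freeing its slab while rank 0 may still be reading it.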