diff --git a/allvars.c b/allvars.c index 58a1c6b958b4ecf9d2a54bdd0e6b808208dceb37..50de943827fbba9fe9f5ef73eb05ac6440db6159 100644 --- a/allvars.c +++ b/allvars.c @@ -7,7 +7,9 @@ struct ip in; struct op out, outparam; struct meta metaData; -struct time timing; +timing_t wt_timing = {0}; +timing_t pr_timing = {0}; + struct parameter param; struct fileData data; @@ -16,6 +18,7 @@ char datapath[900]; int xaxis, yaxis; int global_rank; int size; +int verbose_level = 0; long nsectors; long startrow; double resolution, dx, dw, w_supporth; diff --git a/allvars.h b/allvars.h index 7792f1d2ed260c28f35b9ff3f84abe569e6a5eb6..685f00325645393223e31c65c47a1f54556a3b2e 100644 --- a/allvars.h +++ b/allvars.h @@ -92,13 +92,24 @@ extern struct meta } metaData; -extern struct time -{ - double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, phase_time; - double setup_time1, process_time1, mpi_time1, fftw_time1, tot_time1, kernel_time1, reduce_time1, compose_time1, phase_time1; - double writetime, writetime1; - -} timing; +typedef struct { + double setup; // time spent in initialization, init() + double init; // time spent in initializing arrays + double process; // time spent in gridding the data + double mpi; // time spent in MPI communications + double fftw; // time spent in the FFT + double kernel; // time spent in the gridding kernel (wstack) + double mmove; // time spent in memory movements to/from shared windows + double reduce; // total time spent reducing the grid + double reduce_mpi; // reduce time spent in inter-host MPI + double reduce_sh; // reduce time spent in intra-host shared memory + double compose; // time spent composing the sector arrays + double phase; // time spent in the phase correction + double write; // time spent writing data to disk + double total; } timing_t; + +extern timing_t wt_timing; // wall-clock timings +extern timing_t pr_timing; // process CPU timing extern struct parameter { @@ -127,12 +138,11 @@ extern char datapath[900]; extern int xaxis, yaxis; extern int global_rank; extern int size; +extern int verbose_level; extern long nsectors; extern long startrow; extern double resolution, dx, dw, w_supporth; -extern clock_t start, end, start0, startk, endk; -extern struct timespec begin, finish, begin0, begink, finishk; extern long * histo_send, size_of_grid; extern double * grid, *gridss, *gridss_real, *gridss_img, *gridss_w; @@ -142,3 +152,60 @@ extern double * grid, *gridss, *gridss_real, *gridss_img, *gridss_w; #endif extern long **sectorarray; + + +#if defined(DEBUG) +#define dprintf(LEVEL, T, t, ...) if( (verbose_level >= (LEVEL)) && \ + ( ((t) ==-1 ) || ((T)==(t)) ) ) { \ + printf(__VA_ARGS__); fflush(stdout); } + +#else +#define dprintf(...) +#endif + + +#define CPU_TIME_wt ({ struct timespec myts; (clock_gettime( CLOCK_REALTIME, &myts ), (double)myts.tv_sec + (double)myts.tv_nsec * 1e-9);}) +#define CPU_TIME_pr ({ struct timespec myts; (clock_gettime( CLOCK_PROCESS_CPUTIME_ID, &myts ), (double)myts.tv_sec + (double)myts.tv_nsec * 1e-9);}) +#define CPU_TIME_th ({ struct timespec myts; (clock_gettime( CLOCK_THREAD_CPUTIME_ID, &myts ), (double)myts.tv_sec + (double)myts.tv_nsec * 1e-9);}) + + +#if defined(_OPENMP) +#define TAKE_TIME_START( T ) { \ + wt_timing.T = CPU_TIME_wt; \ + pr_timing.T = CPU_TIME_pr; } + +#define TAKE_TIME_STOP( T ) { \ + pr_timing.T = CPU_TIME_pr - pr_timing.T; \ + wt_timing.T = CPU_TIME_wt - wt_timing.T; } + +#define TAKE_TIME( Twt, Tpr ) { Twt = CPU_TIME_wt; Tpr = CPU_TIME_pr; } +#define ADD_TIME( T, Twt, Tpr ) { \ + pr_timing.T += CPU_TIME_pr - Tpr; \ + wt_timing.T += CPU_TIME_wt - Twt; \ + Tpr = CPU_TIME_pr; Twt = CPU_TIME_wt; } + +#else + +#define TAKE_TIME_START( T ) wt_timing.T = CPU_TIME_wt + +#define TAKE_TIME_STOP( T ) wt_timing.T = CPU_TIME_wt - wt_timing.T + +#define TAKE_TIME( Twt, ...
) Twt = CPU_TIME_wt; +#define ADD_TIME( T, Twt, ... ) { wt_timing.T += CPU_TIME_wt - Twt; Twt = CPU_TIME_wt;} + +#endif + +#define TAKE_TIMEwt_START( T) wt_timing.T = CPU_TIME_wt +#define TAKE_TIMEwt_STOP( T) wt_timing.T = CPU_TIME_wt - wt_timing.T +#define TAKE_TIMEwt( Twt ) Twt = CPU_TIME_wt; +#define ADD_TIMEwt( T, Twt ) { wt_timing.T += CPU_TIME_wt - Twt; Twt = CPU_TIME_wt; } + + +#if defined(__GNUC__) && !defined(__ICC) && !defined(__INTEL_COMPILER) +#define PRAGMA_IVDEP _Pragma("GCC ivdep") +#else +#define PRAGMA_IVDEP _Pragma("ivdep") +#endif + +#define STRINGIFY(a) #a +#define UNROLL(N) _Pragma(STRINGIFY(unroll(N))) diff --git a/data/paramfile.txt b/data/paramfile.txt index e8a2156f3bdbf9f7ca792a22014fd2c64bebd1b5..2146b3bdb6c5c03a4b6ab59acb568302a96ea55a 100644 --- a/data/paramfile.txt +++ b/data/paramfile.txt @@ -1,7 +1,7 @@ ndatasets 1 -Datapath1 /beegfs/lofar/cgheller/L798046_SB244_uv.uncorr_130B27932t_121MHz.pre-cal.binMS/ -Datapath2 /beegfs/lofar/cgheller/L798046_SB244_uv.uncorr_130B27932t_123MHz.pre-cal.binMS/ -Datapath3 /beegfs/lofar/cgheller/L798046_SB244_uv.uncorr_130B27932t_125MHz.pre-cal.binMS/ +Datapath1 ../input/ +Datapath2 ../input/ +Datapath3 ../input/ num_threads 2 w_support 7 grid_size_x 2048 @@ -23,4 +23,5 @@ fftfile2 fft_real.bin fftfile3 fft_img.bin logfile run.log extension .txt -timingfile timings.dat +timingfile timings.dat +verbose_level 1 diff --git a/fourier_transform.c b/fourier_transform.c index 81d7fd8de9189f5267791a9f5cefcd8931016b7b..fda11fe591406919707b1dda220d8b6e3c0a764c 100644 --- a/fourier_transform.c +++ b/fourier_transform.c @@ -2,213 +2,219 @@ #include "allvars.h" #include "proto.h" -void fftw_data(){ - - #ifdef USE_FFTW - // FFT transform the data (using distributed FFTW) - if(global_rank == 0)printf("PERFORMING FFT\n"); - clock_gettime(CLOCK_MONOTONIC, &begin); - start = clock(); - fftw_plan plan; - fftw_complex *fftwgrid; - ptrdiff_t alloc_local, local_n0, local_0_start; - double norm = 1.0/(double)(param.grid_size_x*param.grid_size_y); - - // map the 1D array of complex visibilities to a 2D array required by FFTW (complex[*][2]) - // x is the direction of contiguous data and maps to the second parameter - // y is the parallelized direction and corresponds to the first parameter (--> n0) - // and perform the FFT per w plane - alloc_local = fftw_mpi_local_size_2d(param.grid_size_y, param.grid_size_x, MPI_COMM_WORLD,&local_n0, &local_0_start); - fftwgrid = fftw_alloc_complex(alloc_local); - plan = fftw_mpi_plan_dft_2d(param.grid_size_y, param.grid_size_x, fftwgrid, fftwgrid, MPI_COMM_WORLD, FFTW_BACKWARD, FFTW_ESTIMATE); - - long fftwindex = 0; - long fftwindex2D = 0; - for (int iw=0; iw<param.num_w_planes; iw++) - { - //printf("FFTing plan %d\n",iw); - //select the w-plane to transform - for (int iv=0; iv<yaxis; iv++) - { - for (int iu=0; iu<xaxis; iu++) - { - fftwindex2D = iu + iv*xaxis; - fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis); - fftwgrid[fftwindex2D][0] = grid[fftwindex]; - fftwgrid[fftwindex2D][1] = grid[fftwindex+1]; - } - } - - // do the transform for each w-plane - fftw_execute(plan); - - // save the transformed w-plane - for (int iv=0; iv<yaxis; iv++) - { - for (int iu=0; iu<xaxis; iu++) - { - fftwindex2D = iu + iv*xaxis; - fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis); - gridss[fftwindex] = norm*fftwgrid[fftwindex2D][0]; - gridss[fftwindex+1] = norm*fftwgrid[fftwindex2D][1]; - } - } - - } - - fftw_destroy_plan(plan); - fftw_free(fftwgrid); - - #ifdef USE_MPI - MPI_Win_fence(0,slabwin); - MPI_Barrier(MPI_COMM_WORLD); - #endif - 
- end = clock(); - clock_gettime(CLOCK_MONOTONIC, &finish); - timing.fftw_time = ((double) (end - start)) / CLOCKS_PER_SEC; - timing.fftw_time1 = (finish.tv_sec - begin.tv_sec); - timing.fftw_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; - clock_gettime(CLOCK_MONOTONIC, &begin); - - #endif +void fftw_data() +{ + + #ifdef USE_FFTW + // FFT transform the data (using distributed FFTW) + if(global_rank == 0)printf("PERFORMING FFT\n"); + + TAKE_TIME_START(fftw); + + fftw_plan plan; + fftw_complex *fftwgrid; + ptrdiff_t alloc_local, local_n0, local_0_start; + double norm = 1.0/(double)(param.grid_size_x*param.grid_size_y); + + // map the 1D array of complex visibilities to a 2D array required by FFTW (complex[*][2]) + // x is the direction of contiguous data and maps to the second parameter + // y is the parallelized direction and corresponds to the first parameter (--> n0) + // and perform the FFT per w plane + alloc_local = fftw_mpi_local_size_2d(param.grid_size_y, param.grid_size_x, MPI_COMM_WORLD,&local_n0, &local_0_start); + fftwgrid = fftw_alloc_complex(alloc_local); + plan = fftw_mpi_plan_dft_2d(param.grid_size_y, param.grid_size_x, fftwgrid, fftwgrid, MPI_COMM_WORLD, FFTW_BACKWARD, FFTW_ESTIMATE); + + long fftwindex = 0; + long fftwindex2D = 0; + for (int iw=0; iw<param.num_w_planes; iw++) + { + //printf("FFTing plan %d\n",iw); + //select the w-plane to transform + for (int iv=0; iv<yaxis; iv++) + { + for (int iu=0; iu<xaxis; iu++) + { + fftwindex2D = iu + iv*xaxis; + fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis); + fftwgrid[fftwindex2D][0] = grid[fftwindex]; + fftwgrid[fftwindex2D][1] = grid[fftwindex+1]; + } + } + + // do the transform for each w-plane + fftw_execute(plan); + + // save the transformed w-plane + for (int iv=0; iv<yaxis; iv++) + { + for (int iu=0; iu<xaxis; iu++) + { + fftwindex2D = iu + iv*xaxis; + fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis); + gridss[fftwindex] = norm*fftwgrid[fftwindex2D][0]; + gridss[fftwindex+1] = norm*fftwgrid[fftwindex2D][1]; + } + } + + } + + fftw_destroy_plan(plan); + fftw_free(fftwgrid); + + #ifdef USE_MPI + MPI_Win_fence(0,slabwin); + MPI_Barrier(MPI_COMM_WORLD); + #endif + + TAKE_TIME_STOP(fftw); + + #endif } void write_fftw_data(){ - #ifdef USE_FFTW - #ifdef WRITE_DATA - // Write results - #ifdef USE_MPI - MPI_Win writewin; - MPI_Win_create(gridss, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &writewin); - MPI_Win_fence(0,writewin); - #endif - if (global_rank == 0) - { - printf("WRITING FFT TRANSFORMED DATA\n"); - file.pFilereal = fopen (out.fftfile2,"wb"); - file.pFileimg = fopen (out.fftfile3,"wb"); - #ifdef USE_MPI - for (int isector=0; isector<nsectors; isector++) - { - MPI_Win_lock(MPI_LOCK_SHARED,isector,0,writewin); - MPI_Get(gridss_w,size_of_grid,MPI_DOUBLE,isector,0,size_of_grid,MPI_DOUBLE,writewin); - MPI_Win_unlock(isector,writewin); - for (long i=0; i<size_of_grid/2; i++) - { - gridss_real[i] = gridss_w[2*i]; - gridss_img[i] = gridss_w[2*i+1]; - } - if (param.num_w_planes > 1) - { - for (int iw=0; iw<param.num_w_planes; iw++) + // Write results + + #ifdef USE_FFTW + + double twt, tpr; + + #ifdef WRITE_DATA + + TAKE_TIME(twt, tpr); + + #ifdef USE_MPI + MPI_Win writewin; + MPI_Win_create(gridss, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &writewin); + MPI_Win_fence(0,writewin); + #endif + if (global_rank == 0) + { + printf("WRITING FFT TRANSFORMED DATA\n"); + file.pFilereal = fopen (out.fftfile2,"wb"); + file.pFileimg = fopen (out.fftfile3,"wb"); + #ifdef 
USE_MPI + for (int isector=0; isector<nsectors; isector++) + { + MPI_Win_lock(MPI_LOCK_SHARED,isector,0,writewin); + MPI_Get(gridss_w,size_of_grid,MPI_DOUBLE,isector,0,size_of_grid,MPI_DOUBLE,writewin); + MPI_Win_unlock(isector,writewin); + for (long i=0; i<size_of_grid/2; i++) + { + gridss_real[i] = gridss_w[2*i]; + gridss_img[i] = gridss_w[2*i+1]; + } + if (param.num_w_planes > 1) + { + for (int iw=0; iw<param.num_w_planes; iw++) for (int iv=0; iv<yaxis; iv++) - for (int iu=0; iu<xaxis; iu++) - { - long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double); - long index = iu + iv*xaxis + iw*xaxis*yaxis; - fseek(file.pFilereal, global_index, SEEK_SET); - fwrite(&gridss_real[index], 1, sizeof(double), file.pFilereal); - } - for (int iw=0; iw<param.num_w_planes; iw++) + for (int iu=0; iu<xaxis; iu++) + { + long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double); + long index = iu + iv*xaxis + iw*xaxis*yaxis; + fseek(file.pFilereal, global_index, SEEK_SET); + fwrite(&gridss_real[index], 1, sizeof(double), file.pFilereal); + } + for (int iw=0; iw<param.num_w_planes; iw++) for (int iv=0; iv<yaxis; iv++) - for (int iu=0; iu<xaxis; iu++) - { - long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double); - long index = iu + iv*xaxis + iw*xaxis*yaxis; - fseek(file.pFileimg, global_index, SEEK_SET); - fwrite(&gridss_img[index], 1, sizeof(double), file.pFileimg); - } - } - else - { - fwrite(gridss_real, size_of_grid/2, sizeof(double), file.pFilereal); - fwrite(gridss_img, size_of_grid/2, sizeof(double), file.pFileimg); - } - - } - #else - /* - for (int iw=0; iw<param.num_w_planes; iw++) - for (int iv=0; iv<grid_size_y; iv++) - for (int iu=0; iu<grid_size_x; iu++) - { - int isector = 0; - long index = 2*(iu + iv*grid_size_x + iw*grid_size_x*grid_size_y); - double v_norm = sqrt(gridtot[index]*gridtot[index]+gridtot[index+1]*gridtot[index+1]); - fprintf (file.pFile, "%d %d %d %f %f %f\n", iu,iv,iw,gridtot[index],gridtot[index+1],v_norm); - } - */ - #endif - - fclose(file.pFilereal); - fclose(file.pFileimg); - } - #ifdef USE_MPI - MPI_Win_fence(0,writewin); - MPI_Win_free(&writewin); - MPI_Barrier(MPI_COMM_WORLD); - #endif - #endif //WRITE_DATA - - - // Phase correction - - clock_gettime(CLOCK_MONOTONIC, &begin); - start = clock(); - if(global_rank == 0)printf("PHASE CORRECTION\n"); - double* image_real = (double*) calloc(xaxis*yaxis,sizeof(double)); - double* image_imag = (double*) calloc(xaxis*yaxis,sizeof(double)); - - phase_correction(gridss,image_real,image_imag,xaxis,yaxis,param.num_w_planes,param.grid_size_x,param.grid_size_y,resolution,metaData.wmin,metaData.wmax,param.num_threads); - - end = clock(); - clock_gettime(CLOCK_MONOTONIC, &finish); - timing.phase_time = ((double) (end - start)) / CLOCKS_PER_SEC; - timing.phase_time1 = (finish.tv_sec - begin.tv_sec); - timing.phase_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; - - #ifdef WRITE_IMAGE - - if(global_rank == 0) - { - file.pFilereal = fopen (out.fftfile2,"wb"); - file.pFileimg = fopen (out.fftfile3,"wb"); - fclose(file.pFilereal); - fclose(file.pFileimg); - } - #ifdef USE_MPI - MPI_Barrier(MPI_COMM_WORLD); - #endif - if(global_rank == 0)printf("WRITING IMAGE\n"); - for (int isector=0; isector<size; isector++) - { - #ifdef USE_MPI - MPI_Barrier(MPI_COMM_WORLD); - #endif - if(isector == global_rank) - { - printf("%d writing\n",isector); - file.pFilereal = fopen (out.fftfile2,"ab"); - 
file.pFileimg = fopen (out.fftfile3,"ab"); - - long global_index = isector*(xaxis*yaxis)*sizeof(double); - - fseek(file.pFilereal, global_index, SEEK_SET); - fwrite(image_real, xaxis*yaxis, sizeof(double), file.pFilereal); - fseek(file.pFileimg, global_index, SEEK_SET); - fwrite(image_imag, xaxis*yaxis, sizeof(double), file.pFileimg); - - fclose(file.pFilereal); - fclose(file.pFileimg); - } - } - #ifdef USE_MPI - MPI_Barrier(MPI_COMM_WORLD); - #endif - - #endif //WRITE_IMAGE - - #endif //FFTW + for (int iu=0; iu<xaxis; iu++) + { + long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double); + long index = iu + iv*xaxis + iw*xaxis*yaxis; + fseek(file.pFileimg, global_index, SEEK_SET); + fwrite(&gridss_img[index], 1, sizeof(double), file.pFileimg); + } + } + else + { + fwrite(gridss_real, size_of_grid/2, sizeof(double), file.pFilereal); + fwrite(gridss_img, size_of_grid/2, sizeof(double), file.pFileimg); + } + + } + #else + /* + for (int iw=0; iw<param.num_w_planes; iw++) + for (int iv=0; iv<grid_size_y; iv++) + for (int iu=0; iu<grid_size_x; iu++) + { + int isector = 0; + long index = 2*(iu + iv*grid_size_x + iw*grid_size_x*grid_size_y); + double v_norm = sqrt(gridtot[index]*gridtot[index]+gridtot[index+1]*gridtot[index+1]); + fprintf (file.pFile, "%d %d %d %f %f %f\n", iu,iv,iw,gridtot[index],gridtot[index+1],v_norm); + } + */ + #endif + + fclose(file.pFilereal); + fclose(file.pFileimg); + } + #ifdef USE_MPI + MPI_Win_fence(0,writewin); + MPI_Win_free(&writewin); + MPI_Barrier(MPI_COMM_WORLD); + #endif + + ADD_TIME(write, twt, tpr); + #endif //WRITE_DATA + + + // Phase correction + + + + TAKE_TIME_START(phase) + if(global_rank == 0)printf("PHASE CORRECTION\n"); + double* image_real = (double*) calloc(xaxis*yaxis,sizeof(double)); + double* image_imag = (double*) calloc(xaxis*yaxis,sizeof(double)); + + phase_correction(gridss,image_real,image_imag,xaxis,yaxis,param.num_w_planes,param.grid_size_x,param.grid_size_y,resolution,metaData.wmin,metaData.wmax,param.num_threads); + + TAKE_TIME_STOP(phase); + + #ifdef WRITE_IMAGE + + TAKE_TIME(twt, tpr); + + if(global_rank == 0) + { + file.pFilereal = fopen (out.fftfile2,"wb"); + file.pFileimg = fopen (out.fftfile3,"wb"); + fclose(file.pFilereal); + fclose(file.pFileimg); + } + #ifdef USE_MPI + MPI_Barrier(MPI_COMM_WORLD); + #endif + if(global_rank == 0)printf("WRITING IMAGE\n"); + for (int isector=0; isector<size; isector++) + { + #ifdef USE_MPI + MPI_Barrier(MPI_COMM_WORLD); + #endif + if(isector == global_rank) + { + printf("%d writing\n",isector); + file.pFilereal = fopen (out.fftfile2,"ab"); + file.pFileimg = fopen (out.fftfile3,"ab"); + + long global_index = isector*(xaxis*yaxis)*sizeof(double); + + fseek(file.pFilereal, global_index, SEEK_SET); + fwrite(image_real, xaxis*yaxis, sizeof(double), file.pFilereal); + fseek(file.pFileimg, global_index, SEEK_SET); + fwrite(image_imag, xaxis*yaxis, sizeof(double), file.pFileimg); + + fclose(file.pFilereal); + fclose(file.pFileimg); + } + } + #ifdef USE_MPI + MPI_Barrier(MPI_COMM_WORLD); + #endif + + ADD_TIME(write, twt, tpr); + #endif //WRITE_IMAGE + + #endif //FFTW } diff --git a/gridding.c b/gridding.c index 758317749388280f2f09acd60aee4924b3998c09..afe7092e6bcf733289898826e9ba6f6bbf72ff1a 100644 --- a/gridding.c +++ b/gridding.c @@ -4,209 +4,209 @@ #include "proto.h" -void gridding(){ - - if(global_rank == 0)printf("GRIDDING DATA\n"); - - // Create histograms and linked lists - - clock_gettime(CLOCK_MONOTONIC, &begin); - start = clock(); +void 
gridding() +{ + + if(global_rank == 0)printf("GRIDDING DATA\n"); + + // Create histograms and linked lists - // Initialize linked list - initialize_array(); + TAKE_TIME_START(init); + + // Initialize linked list + initialize_array(); - //Sector and Gridding data - gridding_data(); + TAKE_TIME_STOP(init); + TAKE_TIME_START(process); - #ifdef USE_MPI - MPI_Barrier(MPI_COMM_WORLD); - #endif + // Sector and Gridding data + gridding_data(); - end = clock(); - clock_gettime(CLOCK_MONOTONIC, &finish); - timing.process_time = ((double) (end - start)) / CLOCKS_PER_SEC; - timing.process_time1 = (finish.tv_sec - begin.tv_sec); - timing.process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; - clock_gettime(CLOCK_MONOTONIC, &begin); + TAKE_TIME_STOP(process); + + #ifdef USE_MPI + MPI_Barrier(MPI_COMM_WORLD); + #endif + return; } -void initialize_array(){ +void initialize_array() +{ - histo_send = (long*) calloc(nsectors+1,sizeof(long)); - int * boundary = (int*) calloc(metaData.Nmeasures,sizeof(int)); - double vvh; + histo_send = (long*) calloc(nsectors+1,sizeof(long)); + int * boundary = (int*) calloc(metaData.Nmeasures,sizeof(int)); + double vvh; - for (long iphi = 0; iphi < metaData.Nmeasures; iphi++) - { - boundary[iphi] = -1; - vvh = data.vv[iphi]; //less or equal to 0.6 - int binphi = (int)(vvh*nsectors); //has values expect 0 and nsectors-1. So we use updist and downdist condition - // check if the point influence also neighboring slabs - double updist = (double)((binphi+1)*yaxis)*dx - vvh; - double downdist = vvh - (double)(binphi*yaxis)*dx; - // - histo_send[binphi]++; - if(updist < w_supporth && updist >= 0.0) - {histo_send[binphi+1]++; boundary[iphi] = binphi+1;}; - if(downdist < w_supporth && binphi > 0 && downdist >= 0.0) - {histo_send[binphi-1]++; boundary[iphi] = binphi-1;}; - } + for (long iphi = 0; iphi < metaData.Nmeasures; iphi++) + { + boundary[iphi] = -1; + vvh = data.vv[iphi]; //less or equal to 0.6 + int binphi = (int)(vvh*nsectors); //has values expect 0 and nsectors-1. 
So we use updist and downdist condition + // check if the point influence also neighboring slabs + double updist = (double)((binphi+1)*yaxis)*dx - vvh; + double downdist = vvh - (double)(binphi*yaxis)*dx; + // + histo_send[binphi]++; + if(updist < w_supporth && updist >= 0.0) + {histo_send[binphi+1]++; boundary[iphi] = binphi+1;}; + if(downdist < w_supporth && binphi > 0 && downdist >= 0.0) + {histo_send[binphi-1]++; boundary[iphi] = binphi-1;}; + } - sectorarray = (long**)malloc ((nsectors+1) * sizeof(long*)); - for(int sec=0; sec<(nsectors+1); sec++) - { - sectorarray[sec] = (long*)malloc(histo_send[sec]*sizeof(long)); - } + sectorarray = (long**)malloc ((nsectors+1) * sizeof(long*)); + for(int sec=0; sec<(nsectors+1); sec++) + { + sectorarray[sec] = (long*)malloc(histo_send[sec]*sizeof(long)); + } - long *counter = (long*) calloc(nsectors+1,sizeof(long)); - for (long iphi = 0; iphi < metaData.Nmeasures; iphi++) - { - vvh = data.vv[iphi]; - int binphi = (int)(vvh*nsectors); - double updist = (double)((binphi+1)*yaxis)*dx - vvh; - double downdist = vvh - (double)(binphi*yaxis)*dx; - sectorarray[binphi][counter[binphi]] = iphi; - counter[binphi]++; - if(updist < w_supporth && updist >= 0.0) - { sectorarray[binphi+1][counter[binphi+1]] = iphi; counter[binphi+1]++;}; - if(downdist < w_supporth && binphi > 0 && downdist >= 0.0) - { sectorarray[binphi-1][counter[binphi-1]] = iphi; counter[binphi-1]++;}; - } + long *counter = (long*) calloc(nsectors+1,sizeof(long)); + for (long iphi = 0; iphi < metaData.Nmeasures; iphi++) + { + vvh = data.vv[iphi]; + int binphi = (int)(vvh*nsectors); + double updist = (double)((binphi+1)*yaxis)*dx - vvh; + double downdist = vvh - (double)(binphi*yaxis)*dx; + sectorarray[binphi][counter[binphi]] = iphi; + counter[binphi]++; + if(updist < w_supporth && updist >= 0.0) + { sectorarray[binphi+1][counter[binphi+1]] = iphi; counter[binphi+1]++;}; + if(downdist < w_supporth && binphi > 0 && downdist >= 0.0) + { sectorarray[binphi-1][counter[binphi-1]] = iphi; counter[binphi-1]++;}; + } - #ifdef VERBOSE - for (int iii=0; iii<nsectors+1; iii++)printf("HISTO %d %d %ld\n", global_rank, iii, histo_send[iii]); - #endif -} + #ifdef VERBOSE + for (int iii=0; iii<nsectors+1; iii++)printf("HISTO %d %d %ld\n", global_rank, iii, histo_send[iii]); + #endif -void gridding_data(){ - - double shift = (double)(dx*yaxis); - - // Open the MPI Memory Window for the slab - #ifdef USE_MPI - MPI_Win_create(grid, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin); - MPI_Win_fence(0,slabwin); - #endif - - #ifndef USE_MPI - file.pFile1 = fopen (out.outfile1,"w"); - #endif + free( boundary); + return; +} - timing.kernel_time = 0.0; - timing.kernel_time1 = 0.0; - timing.reduce_time = 0.0; - timing.reduce_time1 = 0.0; - timing.compose_time = 0.0; - timing.compose_time1 = 0.0; +void gridding_data() +{ - // calculate the resolution in radians - resolution = 1.0/MAX(abs(metaData.uvmin),abs(metaData.uvmax)); - - // calculate the resolution in arcsec - double resolution_asec = (3600.0*180.0)/MAX(abs(metaData.uvmin),abs(metaData.uvmax))/PI; + double shift = (double)(dx*yaxis); + + #ifdef USE_MPI + MPI_Win_create(grid, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin); + MPI_Win_fence(0,slabwin); + #endif + + #ifndef USE_MPI + file.pFile1 = fopen (out.outfile1,"w"); + #endif + + // calculate the resolution in radians + resolution = 1.0/MAX(abs(metaData.uvmin),abs(metaData.uvmax)); + + // calculate the resolution in arcsec + double 
resolution_asec = (3600.0*180.0)/MAX(abs(metaData.uvmin),abs(metaData.uvmax))/PI; + if( global_rank == 0 ) printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec); - for (long isector = 0; isector < nsectors; isector++) + for (long isector = 0; isector < nsectors; isector++) { - clock_gettime(CLOCK_MONOTONIC, &begink); - startk = clock(); - // define local destination sector - //isector = (isector_count+rank)%size; // this line must be wrong! [LT] - - // allocate sector arrays - long Nsec = histo_send[isector]; - double *uus = (double*) malloc(Nsec*sizeof(double)); - double *vvs = (double*) malloc(Nsec*sizeof(double)); - double *wws = (double*) malloc(Nsec*sizeof(double)); - long Nweightss = Nsec*metaData.polarisations; - long Nvissec = Nweightss*metaData.freq_per_chan; - float *weightss = (float*) malloc(Nweightss*sizeof(float)); - float *visreals = (float*) malloc(Nvissec*sizeof(float)); - float *visimgs = (float*) malloc(Nvissec*sizeof(float)); + double twt, tpr; + + TAKE_TIME(twt, tpr); + + // define local destination sector + //isector = (isector_count+rank)%size; // this line must be wrong! [LT] + + // allocate sector arrays + long Nsec = histo_send[isector]; + double *uus = (double*) malloc(Nsec*sizeof(double)); + double *vvs = (double*) malloc(Nsec*sizeof(double)); + double *wws = (double*) malloc(Nsec*sizeof(double)); + long Nweightss = Nsec*metaData.polarisations; + long Nvissec = Nweightss*metaData.freq_per_chan; + float *weightss = (float*) malloc(Nweightss*sizeof(float)); + float *visreals = (float*) malloc(Nvissec*sizeof(float)); + float *visimgs = (float*) malloc(Nvissec*sizeof(float)); - // select data for this sector - long icount = 0; - long ip = 0; - long inu = 0; + // select data for this sector + long icount = 0; + long ip = 0; + long inu = 0; - for(long iphi = histo_send[isector]-1; iphi>=0; iphi--) + #warning shall we omp-ize this ? 
+ for(long iphi = histo_send[isector]-1; iphi>=0; iphi--) { - long ilocal = sectorarray[isector][iphi]; - //double vvh = data.vv[ilocal]; - //int binphi = (int)(vvh*nsectors); - //if (binphi == isector || boundary[ilocal] == isector) { - uus[icount] = data.uu[ilocal]; - vvs[icount] = data.vv[ilocal]-isector*shift; - wws[icount] = data.ww[ilocal]; - for (long ipol=0; ipol<metaData.polarisations; ipol++) - { - weightss[ip] = data.weights[ilocal*metaData.polarisations+ipol]; - ip++; - } - for (long ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++) - { - visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; - visimgs[inu] = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; - //if(visimgs[inu]>1e10 || visimgs[inu]<-1e10)printf("%f %f %ld %ld %d %ld %ld\n",visreals[inu],visimgs[inu],inu,Nvissec,rank,ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq,metaData.Nvis); - inu++; - } - icount++; - } - - clock_gettime(CLOCK_MONOTONIC, &finishk); - endk = clock(); - timing.compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC; - timing.compose_time1 += (finishk.tv_sec - begink.tv_sec); - timing.compose_time1 += (finishk.tv_sec - begink.tv_sec); - timing.compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0; - - #ifndef USE_MPI - double vvmin = 1e20; - double uumax = -1e20; - double vvmax = -1e20; - - for (long ipart=0; ipart<Nsec; ipart++) - { - uumin = MIN(uumin,uus[ipart]); - uumax = MAX(uumax,uus[ipart]); - vvmin = MIN(vvmin,vvs[ipart]); - vvmax = MAX(vvmax,vvs[ipart]); + long ilocal = sectorarray[isector][iphi]; + //double vvh = data.vv[ilocal]; + //int binphi = (int)(vvh*nsectors); + //if (binphi == isector || boundary[ilocal] == isector) { + uus[icount] = data.uu[ilocal]; + vvs[icount] = data.vv[ilocal]-isector*shift; + wws[icount] = data.ww[ilocal]; + UNROLL(4) + PRAGMA_IVDEP + for (long ipol=0; ipol<metaData.polarisations; ipol++, ip++) + { + weightss[ip] = data.weights[ilocal*metaData.polarisations+ipol]; + } + + PRAGMA_IVDEP + UNROLL(4) + for (long ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++, inu++) + { + visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; + visimgs[inu] = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; + //if(visimgs[inu]>1e10 || visimgs[inu]<-1e10)printf("%f %f %ld %ld %d %ld %ld\n",visreals[inu],visimgs[inu],inu,Nvissec,rank,ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq,metaData.Nvis); + } + icount++; + } + + ADD_TIME(compose, twt, tpr); + + #ifndef USE_MPI + double vvmin = 1e20; + double uumax = -1e20; + double vvmax = -1e20; + + #warning shall we omp-ize this ? 
+ for (long ipart=0; ipart<Nsec; ipart++) + { + uumin = MIN(uumin,uus[ipart]); + uumax = MAX(uumax,uus[ipart]); + vvmin = MIN(vvmin,vvs[ipart]); + vvmax = MAX(vvmax,vvs[ipart]); - if(ipart%10 == 0)fprintf (file.pFile, "%ld %f %f %f\n",isector,uus[ipart],vvs[ipart]+isector*shift,wws[ipart]); - } + if(ipart%10 == 0)fprintf (file.pFile, "%ld %f %f %f\n",isector,uus[ipart],vvs[ipart]+isector*shift,wws[ipart]); + } - printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax); - #endif - - // Make convolution on the grid + printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax); + #endif - #ifdef VERBOSE - printf("Processing sector %ld\n",isector); - #endif - clock_gettime(CLOCK_MONOTONIC, &begink); - startk = clock(); - - wstack(param.num_w_planes, - Nsec, - metaData.freq_per_chan, - metaData.polarisations, - uus, - vvs, - wws, - visreals, - visimgs, - weightss, - dx, - dw, - param.w_support, - xaxis, - yaxis, - gridss, - param.num_threads); + // Make convolution on the grid + #ifdef VERBOSE + printf("Processing sector %ld\n",isector); + #endif + TAKE_TIME(twt, tpr); + + wstack(param.num_w_planes, + Nsec, + metaData.freq_per_chan, + metaData.polarisations, + uus, + vvs, + wws, + visreals, + visimgs, + weightss, + dx, + dw, + param.w_support, + xaxis, + yaxis, + gridss, + param.num_threads); + + ADD_TIME(kernel, twt, tpr); + /* int z =0 ; * #pragma omp target map(to:test_i_gpu) map(from:z) * { @@ -215,134 +215,144 @@ void gridding_data(){ * z = x + test_i_gpu; * }*/ - clock_gettime(CLOCK_MONOTONIC, &finishk); - endk = clock(); - timing.kernel_time += ((double) (endk - startk)) / CLOCKS_PER_SEC; - timing.kernel_time1 += (finishk.tv_sec - begink.tv_sec); - timing.kernel_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0; - #ifdef VERBOSE - printf("Processed sector %ld\n",isector); - #endif - clock_gettime(CLOCK_MONOTONIC, &begink); - startk = clock(); + #ifdef VERBOSE + printf("Processed sector %ld\n",isector); + #endif + /* ---------------- + * REDUCE + * ---------------- */ - //for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++)printf("--> %f\n",gridss[iii]); - - #ifndef USE_MPI - long stride = isector*2*xaxis*yaxis*num_w_planes; - for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++) - gridtot[stride+iii] = gridss[iii]; - #endif + double twt_r, tpr_r; + TAKE_TIME(twt_r, tpr_r); - // Write grid in the corresponding remote slab - #ifdef USE_MPI - // int target_rank = (int)isector; it implied that size >= nsectors - int target_rank = (int)(isector % size); - - #ifdef ONE_SIDE + // .................. + #ifndef USE_MPI // REDUCE WITH NO MPI + + #pragma omp parallel + { + long stride = isector * size_of_grid; + #pragma omp for + for (long iii=0; iii< size_of_grid; iii++) + gridtot[stride+iii] = gridss[iii]; + } - // MPI_Win_lock(MPI_LOCK_SHARED,target_rank,0,slabwin); - // MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin); - // MPI_Win_unlock(target_rank,slabwin); - - // for every task, gridss coincides with the - // that can be avoided if shared window coincides with gridss - memcpy(Me.win.ptr+isector*size_of_grid, gridss, size_of_grid*sizeof(double)); + // .................. 
+ // REDUCE WITH MPI + #else + // Write grid in the corresponding remote slab + // int target_rank = (int)isector; it implied that size >= nsectors + int target_rank = (int)(isector % size); - reduce( isector, target_rank ); // here the reduce is performed within every host + #ifdef ONE_SIDE - if ( Me.Nhosts > 1 ) - { - // here the reduce is performed among hosts - MPI_Barrier(MPI_COMM_WORLD); + // for every task, gridss coincides with the + // that can be avoided if shared window coincides with gridss + + TAKE_TIME(twt, tpr); + memcpy(Me.win.ptr+isector*size_of_grid, gridss, size_of_grid*sizeof(double)); + ADD_TIME(mmove, twt, tpr); + + dprintf(1, global_rank, 0, "reducing sector %ld..\n", isector); + + TAKE_TIME( twt, tpr); + reduce( isector, target_rank ); // here the reduce is performed within every host + ADD_TIME(reduce_sh, twt, tpr); + + if ( Me.Nhosts > 1 ) + { + // here the reduce is performed among hosts + MPI_Barrier(MPI_COMM_WORLD); - int Im_in_the_new_communicator = MPI_UNDEFINED; - if(global_rank == target_rank) - Im_in_the_new_communicator = 1; - else - if( Me.Rank[HOSTS] == 0 ) - { - if( Me.Ranks_to_host[ target_rank ] != Me.myhost ) - Im_in_the_new_communicator = 1; - } + int Im_in_the_new_communicator = MPI_UNDEFINED; + if(global_rank == target_rank) + Im_in_the_new_communicator = 1; + else + if( Me.Rank[HOSTS] == 0 ) + { + if( Me.Ranks_to_host[ target_rank ] != Me.myhost ) + Im_in_the_new_communicator = 1; + } - MPI_Comm Sector_Comm; - MPI_Comm_split( COMM[WORLD], Im_in_the_new_communicator, global_rank, &Sector_Comm); + MPI_Comm Sector_Comm; + MPI_Comm_split( COMM[WORLD], Im_in_the_new_communicator, global_rank, &Sector_Comm); - if( Sector_Comm != MPI_COMM_NULL ) - { - int sector_size; - int sector_rank = 0; - int sector_target; + if( Sector_Comm != MPI_COMM_NULL ) + { + double _twt_; + int sector_size; + int sector_rank = 0; + int sector_target; - MPI_Comm_size( Sector_Comm, §or_size); - MPI_Comm_rank( Sector_Comm, §or_rank); - if ( global_rank == target_rank) - { - MPI_Send( §or_rank, 1, MPI_INT, 0, 0, Sector_Comm); - memcpy(grid, Me.swins[Me.Rank[myHOST]].ptr+isector*size_of_grid*sizeof(double), size_of_grid * sizeof(double)); - } - - if( sector_rank == 0 ) - { - MPI_Status status; - MPI_Recv( §or_target, 1, MPI_INT, MPI_ANY_SOURCE, 0, Sector_Comm, &status); - } - - MPI_Bcast( §or_target, 1, MPI_INT, 0, Sector_Comm ); - - MPI_Reduce(grid, grid, size_of_grid, MPI_DOUBLE,MPI_SUM, sector_target, Sector_Comm); + MPI_Comm_size( Sector_Comm, §or_size); + MPI_Comm_rank( Sector_Comm, §or_rank); + if ( global_rank == target_rank) + { + MPI_Send( §or_rank, 1, MPI_INT, 0, 0, Sector_Comm); + TAKE_TIMEwt( _twt_ ); + memcpy(gridss, Me.swins[Me.Rank[myHOST]].ptr+isector*size_of_grid*sizeof(double), + size_of_grid * sizeof(double)); + ADD_TIMEwt( mmove, _twt_); + } - MPI_Comm_free( &Sector_Comm ); - } - } + if( sector_rank == 0 ) + { + MPI_Status status; + MPI_Recv( §or_target, 1, MPI_INT, MPI_ANY_SOURCE, 0, Sector_Comm, &status); + } - - #else // relates to #ifdef ONE_SIDE + TAKE_TIMEwt(_twt_); + MPI_Bcast( §or_target, 1, MPI_INT, 0, Sector_Comm ); + + MPI_Reduce(gridss, grid, size_of_grid, MPI_DOUBLE,MPI_SUM, sector_target, Sector_Comm); + + MPI_Comm_free( &Sector_Comm ); + ADD_TIMEwt(mpi, _twt_); + } + } + ADD_TIME(reduce_mpi, twt, tpr); - MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD); - #endif // closes #ifdef ONE_SIDE - #endif // closes USE_MPI - - clock_gettime(CLOCK_MONOTONIC, &finishk); - endk = clock(); - timing.reduce_time += ((double) 
(endk - startk)) / CLOCKS_PER_SEC; - timing.reduce_time1 += (finishk.tv_sec - begink.tv_sec); - timing.reduce_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0; - // Go to next sector - for (long inull=0; inull<2*param.num_w_planes*xaxis*yaxis; inull++)gridss[inull] = 0.0; - - // Deallocate all sector arrays - free(uus); - free(vvs); - free(wws); - free(weightss); - free(visreals); - free(visimgs); + #else // relates to #ifdef ONE_SIDE + + { + double _twt_; + TAKE_TIMEwt(_twt_); + MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD); + ADD_TIMEwt(mpi, _twt_); + } + + #endif // closes #ifdef ONE_SIDE + #endif // closes USE_MPI + + ADD_TIME(reduce, twt_r, tpr_r); + + + // wipe before getting to the next sector + memset((void*)gridss, 0, size_of_grid * sizeof(double)); + + // Deallocate all sector arrays + free(uus); + free(vvs); + free(wws); + free(weightss); + free(visreals); + free(visimgs); // End of loop over sector } - // Finalize MPI communication - #ifdef USE_MPI - MPI_Win_fence(0,slabwin); - #endif - #ifndef USE_MPI - fclose(file.pFile1); - #endif + free( histo_send ); - #ifdef USE_MPI - MPI_Barrier(MPI_COMM_WORLD); - #endif + #ifndef USE_MPI + fclose(file.pFile1); + #endif - end = clock(); - clock_gettime(CLOCK_MONOTONIC, &finish); - timing.process_time = ((double) (end - start)) / CLOCKS_PER_SEC; - timing.process_time1 = (finish.tv_sec - begin.tv_sec); - timing.process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; - clock_gettime(CLOCK_MONOTONIC, &begin); + #ifdef USE_MPI + MPI_Win_fence(0,slabwin); + MPI_Barrier(MPI_COMM_WORLD); + #endif + } void write_grided_data() @@ -355,76 +365,77 @@ void write_grided_data() printf("WRITING GRIDDED DATA\n"); file.pFilereal = fopen (out.outfile2,"wb"); file.pFileimg = fopen (out.outfile3,"wb"); - #ifdef USE_MPI - for (int isector=0; isector<nsectors; isector++) - { - MPI_Win_lock(MPI_LOCK_SHARED,isector,0,slabwin); - MPI_Get(gridss,size_of_grid,MPI_DOUBLE,isector,0,size_of_grid,MPI_DOUBLE,slabwin); - MPI_Win_unlock(isector,slabwin); - for (long i=0; i<size_of_grid/2; i++) + + #ifdef USE_MPI + for (int isector=0; isector<nsectors; isector++) + { + MPI_Win_lock(MPI_LOCK_SHARED,isector,0,slabwin); + MPI_Get(gridss,size_of_grid,MPI_DOUBLE,isector,0,size_of_grid,MPI_DOUBLE,slabwin); + MPI_Win_unlock(isector,slabwin); + for (long i=0; i<size_of_grid/2; i++) { - gridss_real[i] = gridss[2*i]; - gridss_img[i] = gridss[2*i+1]; + gridss_real[i] = gridss[2*i]; + gridss_img[i] = gridss[2*i+1]; } - if (param.num_w_planes > 1) + if (param.num_w_planes > 1) { - for (int iw=0; iw<param.num_w_planes; iw++) - for (int iv=0; iv<yaxis; iv++) - for (int iu=0; iu<xaxis; iu++) - { - long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double); - long index = iu + iv*xaxis + iw*xaxis*yaxis; - fseek(file.pFilereal, global_index, SEEK_SET); - fwrite(&gridss_real[index], 1, sizeof(double), file.pFilereal); - } - for (int iw=0; iw<param.num_w_planes; iw++) - for (int iv=0; iv<yaxis; iv++) - for (int iu=0; iu<xaxis; iu++) - { - long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double); - long index = iu + iv*xaxis + iw*xaxis*yaxis; - fseek(file.pFileimg, global_index, SEEK_SET); - fwrite(&gridss_img[index], 1, sizeof(double), file.pFileimg); - //double v_norm = sqrt(gridss[index]*gridss[index]+gridss[index+1]*gridss[index+1]); - //fprintf (file.pFile, "%d %d %d %f %f %f\n", 
iu,isector*yaxis+iv,iw,gridss[index],gridss[index+1],v_norm); - } - + for (int iw=0; iw<param.num_w_planes; iw++) + for (int iv=0; iv<yaxis; iv++) + for (int iu=0; iu<xaxis; iu++) + { + long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double); + long index = iu + iv*xaxis + iw*xaxis*yaxis; + fseek(file.pFilereal, global_index, SEEK_SET); + fwrite(&gridss_real[index], 1, sizeof(double), file.pFilereal); + } + + for (int iw=0; iw<param.num_w_planes; iw++) + for (int iv=0; iv<yaxis; iv++) + for (int iu=0; iu<xaxis; iu++) + { + long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double); + long index = iu + iv*xaxis + iw*xaxis*yaxis; + fseek(file.pFileimg, global_index, SEEK_SET); + fwrite(&gridss_img[index], 1, sizeof(double), file.pFileimg); + //double v_norm = sqrt(gridss[index]*gridss[index]+gridss[index+1]*gridss[index+1]); + //fprintf (file.pFile, "%d %d %d %f %f %f\n", iu,isector*yaxis+iv,iw,gridss[index],gridss[index+1],v_norm); + } + } - else + else { - for (int iw=0; iw<param.num_w_planes; iw++) - { - long global_index = (xaxis*isector*yaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double); - long index = iw*xaxis*yaxis; - fseek(file.pFilereal, global_index, SEEK_SET); - fwrite(&gridss_real[index], xaxis*yaxis, sizeof(double), file.pFilereal); - fseek(file.pFileimg, global_index, SEEK_SET); - fwrite(&gridss_img[index], xaxis*yaxis, sizeof(double), file.pFileimg); - } + for (int iw=0; iw<param.num_w_planes; iw++) + { + long global_index = (xaxis*isector*yaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double); + long index = iw*xaxis*yaxis; + fseek(file.pFilereal, global_index, SEEK_SET); + fwrite(&gridss_real[index], xaxis*yaxis, sizeof(double), file.pFilereal); + fseek(file.pFileimg, global_index, SEEK_SET); + fwrite(&gridss_img[index], xaxis*yaxis, sizeof(double), file.pFileimg); + } } } #else - for (int iw=0; iw<param.num_w_planes; iw++) - for (int iv=0; iv<param.grid_size_y; iv++) - for (int iu=0; iu<param.grid_size_x; iu++) - { - long index = 2*(iu + iv*param.grid_size_x + iw*param.grid_size_x*param.grid_size_y); - fwrite(&gridtot[index], 1, sizeof(double), file.pFilereal); - fwrite(&gridtot[index+1], 1, sizeof(double), file.pFileimg); - //double v_norm = sqrt(gridtot[index]*gridtot[index]+gridtot[index+1]*gridtot[index+1]); - //fprintf (file.pFile, "%d %d %d %f %f %f\n", iu,iv,iw,gridtot[index],gridtot[index+1],v_norm); - } - #endif + for (int iw=0; iw<param.num_w_planes; iw++) + for (int iv=0; iv<param.grid_size_y; iv++) + for (int iu=0; iu<param.grid_size_x; iu++) + { + long index = 2*(iu + iv*param.grid_size_x + iw*param.grid_size_x*param.grid_size_y); + fwrite(&gridtot[index], 1, sizeof(double), file.pFilereal); + fwrite(&gridtot[index+1], 1, sizeof(double), file.pFileimg); + //double v_norm = sqrt(gridtot[index]*gridtot[index]+gridtot[index+1]*gridtot[index+1]); + //fprintf (file.pFile, "%d %d %d %f %f %f\n", iu,iv,iw,gridtot[index],gridtot[index+1],v_norm); + } + #endif fclose(file.pFilereal); fclose(file.pFileimg); } - - #ifdef USE_MPI - MPI_Win_fence(0,slabwin); - #endif - - #endif //WRITE_DATA - + + #ifdef USE_MPI + MPI_Win_fence(0,slabwin); + #endif + + #endif //WRITE_DATA } @@ -434,7 +445,8 @@ void reduce( int sector, int target_rank ) { int local_rank = Me.Rank[myHOST]; - + int target_rank_on_myhost = -1; + if( Me.Ranks_to_host[ target_rank ] == Me.myhost ) // exchange rank 0 with target rank // in this way the following log2 alogorithm, @@ -442,16 
+454,32 @@ void reduce( int sector, int target_rank ) // every target rank { - int r = 0; - while( Me.Ranks_to_myhost[r] != target_rank ) - r++; + target_rank_on_myhost = 0; + while( Me.Ranks_to_myhost[target_rank_on_myhost] != target_rank ) + target_rank_on_myhost++; + + dprintf(2, Me.Rank[myHOST], 0, + "[SEC %d] swapping Host master with target rank %d (%d)\n", + sector, target_rank, target_rank_on_myhost); - if( r > 0 ) - { + + if( target_rank_on_myhost > 0 ) + // the target is not the task that already has rank 0 + // on my host + { + if( local_rank == 0 ) - local_rank = r; - if( local_rank == r ) + local_rank = target_rank_on_myhost; + else if( local_rank == target_rank_on_myhost ) local_rank = 0; + + win_t temp = Me.swins[target_rank_on_myhost]; + Me.swins[target_rank_on_myhost] = Me.swins[0]; + Me.swins[0] = temp; + + temp = Me.scwins[target_rank_on_myhost]; + Me.scwins[target_rank_on_myhost] = Me.scwins[0]; + Me.scwins[0] = temp; } } @@ -466,10 +494,14 @@ void reduce( int sector, int target_rank ) for(int l = 0; l < max_level; l++) { int threshold = 1 << (1+l); - + if( local_rank % threshold == 0) - { + { int source = local_rank + (1<<l); + dprintf(2, 0, 0, + "[SEC %d] task %d (%d) getting data from task %d at level %d\n", + sector, local_rank, Me.Rank[myHOST], source, l ); + while( *(int*)(Me.scwins[source].ptr) < l ) // sleep 5 usec if the source target is not ready NSLEEP( 5000 ); @@ -481,7 +513,24 @@ void reduce( int sector, int target_rank ) *(int*)(Me.win_ctrl.ptr) = l; } else - *(int*)(Me.win_ctrl.ptr) = l; + { + dprintf(2, 0, 0, + "[SEC %d] task %d (%d) signaling that level %d is done\n", + sector, local_rank, Me.Rank[myHOST], l ); + + *(int*)(Me.win_ctrl.ptr) = l; + } + } + + if ( target_rank_on_myhost > 0 ) + { + win_t temp = Me.swins[target_rank_on_myhost]; + Me.swins[target_rank_on_myhost] = Me.swins[0]; + Me.swins[0] = temp; + + temp = Me.scwins[target_rank_on_myhost]; + Me.scwins[target_rank_on_myhost] = Me.scwins[0]; + Me.scwins[0] = temp; } return; diff --git a/init.c b/init.c index 8312a3b439f98e8d8c6faed57ded03c89e2e9c99..10dc736ff16b2ff4ef71edbb880f492f930b1877 100644 --- a/init.c +++ b/init.c @@ -7,8 +7,7 @@ void init(int index) { - clock_gettime(CLOCK_MONOTONIC, &begin0); - start0 = clock(); + TAKE_TIME_START(total); // DAV: the corresponding KernelLen is calculated within the wstack function. 
It can be anyway hardcoded for optimization dx = 1.0/(double)param.grid_size_x; @@ -37,8 +36,7 @@ void init(int index) printf("\nTask %d sees %d topology levels\n", global_rank, Me.MAXl); #endif - clock_gettime(CLOCK_MONOTONIC, &begin); - start = clock(); + TAKE_TIME_START(setup); // INPUT FILES (only the first ndatasets entries are used) strcpy(datapath,param.datapath_multi[index]); @@ -46,31 +44,25 @@ void init(int index) //Changing the output file names op_filename(); - + // Read metadata fileName(datapath, in.metafile); readMetaData(filename); - + // Local Calculation metaData_calculation(); - + // Allocate Data Buffer allocate_memory(); - + // Reading Data readData(); - - #ifdef USE_MPI - MPI_Barrier(MPI_COMM_WORLD); - #endif - - clock_gettime(CLOCK_MONOTONIC, &finish); - end = clock(); - timing.setup_time = ((double) (end - start)) / CLOCKS_PER_SEC; - timing.setup_time1 = (finish.tv_sec - begin.tv_sec); - timing.setup_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; - + #ifdef USE_MPI + MPI_Barrier(MPI_COMM_WORLD); + #endif + + TAKE_TIME_STOP(setup); } void op_filename() { @@ -233,6 +225,11 @@ void read_parameter_file(char *fname) { strcpy(outparam.timingfile, buf2); } + if(strcmp(buf1, "verbose_level") == 0) + { + verbose_level = atoi(buf2); + } + if(param.ndatasets > 1) { @@ -257,11 +254,14 @@ void read_parameter_file(char *fname) /* Communicating the relevent parameters to the other process */ - #ifdef USE_MPI - MPI_Bcast(&in, sizeof(struct ip), MPI_BYTE, 0, MPI_COMM_WORLD); - MPI_Bcast(&outparam, sizeof(struct op), MPI_BYTE, 0, MPI_COMM_WORLD); - MPI_Bcast(&param, sizeof(struct parameter), MPI_BYTE, 0, MPI_COMM_WORLD); - #endif + #ifdef USE_MPI + double twt; + TAKE_TIMEwt(twt); + MPI_Bcast(&in, sizeof(struct ip), MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Bcast(&outparam, sizeof(struct op), MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Bcast(&param, sizeof(struct parameter), MPI_BYTE, 0, MPI_COMM_WORLD); + ADD_TIMEwt(mpi, twt); + #endif } @@ -312,7 +312,8 @@ void readMetaData(char fileLocal[1000]) { void metaData_calculation() { int nsub = 1000; - printf("Subtracting last %d measurements\n",nsub); + if( global_rank == 0 ) + printf("Subtracting last %d measurements\n",nsub); metaData.Nmeasures = metaData.Nmeasures-nsub; metaData.Nvis = metaData.Nmeasures*metaData.freq_per_chan*metaData.polarisations; diff --git a/numa.c b/numa.c index fe5867b9cbfd203f30a431a707be23193fd00e69..e880caf8b0362437a0ba10820d988daf5deb26fa 100644 --- a/numa.c +++ b/numa.c @@ -62,9 +62,11 @@ int numa_init( int Rank, int Size, MPI_Comm *MYWORLD, map_t *Me ) // at my shared-memory level // for( int t = 0; t < Me->Ntasks[SHMEMl]; t++ ) - if( t != Me->Rank[SHMEMl] ) - MPI_Win_shared_query( Me->win_ctrl.win, t, &(Me->scwins[t].size), - &(Me->scwins[t].disp), &(Me->scwins[t].ptr) ); + { + //if( t != Me->Rank[SHMEMl] ) + MPI_Win_shared_query( Me->win_ctrl.win, t, &(Me->scwins[t].size), + &(Me->scwins[t].disp), &(Me->scwins[t].ptr) ); + } if( Me->Rank[SHMEMl] != 0 ) MPI_Win_shared_query( win_ctrl_hostmaster, 0, &(win_ctrl_hostmaster_size), diff --git a/result.c b/result.c index 15d310062a2db517902525bf3ac47cb351fe5bc5..7f20ffbd88fbb6d0a5cf2d2acfe01262f3a6f607 100644 --- a/result.c +++ b/result.c @@ -2,51 +2,64 @@ #include "allvars.h" #include "proto.h" -void write_result() { - - end = clock(); - clock_gettime(CLOCK_MONOTONIC, &finish); - timing.tot_time = ((double) (end - start0)) / CLOCKS_PER_SEC; - timing.tot_time1 = (finish.tv_sec - begin0.tv_sec); - timing.tot_time1 += (finish.tv_nsec - begin0.tv_nsec) / 1000000000.0; - - if 
(global_rank == 0) - { - printf("Setup time: %f sec\n",timing.setup_time); - printf("Process time: %f sec\n",timing.process_time); - printf("Kernel time = %f, Array Composition time %f, Reduce time: %f sec\n",timing.kernel_time,timing.compose_time,timing.reduce_time); - #ifdef USE_FFTW - printf("FFTW time: %f sec\n",timing.fftw_time); - printf("Phase time: %f sec\n",timing.phase_time); - #endif - printf("TOT time: %f sec\n",timing.tot_time); - if(param.num_threads > 1) - { - printf("PSetup time: %f sec\n",timing.setup_time1); - printf("PProcess time: %f sec\n",timing.process_time1); - printf("PKernel time = %f, PArray Composition time %f, PReduce time: %f sec\n",timing.kernel_time1,timing.compose_time1,timing.reduce_time1); - #ifdef USE_FFTW - printf("PFFTW time: %f sec\n",timing.fftw_time1); - printf("PPhase time: %f sec\n",timing.phase_time1); - #endif - printf("PTOT time: %f sec\n",timing.tot_time1); - } - } +void write_result() +{ - if (global_rank == 0) - { - file.pFile = fopen (out.timingfile,"w"); - if (param.num_threads == 1) - { - fprintf(file.pFile, "%f %f %f %f %f %f %f\n",timing.setup_time,timing.kernel_time,timing.compose_time,timing.reduce_time,timing.fftw_time,timing.phase_time,timing.tot_time); - } else { - fprintf(file.pFile, "%f %f %f %f %f %f %f\n",timing.setup_time1,timing.kernel_time1,timing.compose_time1,timing.reduce_time1,timing.fftw_time1,timing.phase_time1,timing.tot_time1); - } - fclose(file.pFile); - } - - #ifdef USE_MPI - MPI_Win_fence(0,slabwin); - MPI_Win_free(&slabwin); - #endif + TAKE_TIME_STOP( total ); + + if (global_rank == 0) + { + printf("%14s time : %f sec\n", "Setup", wt_timing.setup); + printf("%14s time : %f sec\n", "Process", wt_timing.process); + printf("%14s time : %f sec\n", "Reduce", wt_timing.reduce); + #if defined(USE_MPI) + #if defined(ONE_SIDE) + printf("%14s time : %f sec\n", "Reduce sh", wt_timing.reduce_sh); + printf("%14s time : %f sec\n", "Mmove", wt_timing.mmove); + printf("%14s time : %f sec\n", "ReduceMPI", wt_timing.reduce_mpi); + #endif + printf("%14s time : %f sec\n", "MPI", wt_timing.mpi); + #endif + printf("%10s Kernel time = %f, Array Composition time %f, Reduce time: %f sec\n", "", + wt_timing.kernel,wt_timing.compose,wt_timing.reduce); + #ifdef USE_FFTW + printf("%14s time : %f sec\n", "FFTW", wt_timing.fftw); + printf("%14s time : %f sec\n", "Phase",wt_timing.phase); + #endif + printf("%14s time : %f sec\n\n", "TOTAL", wt_timing.total); + + if(param.num_threads > 1) + { + printf("%14s time : %f sec\n", "PSetup", pr_timing.setup); + printf("%14s time : %f sec\n", "PProcess", pr_timing.process); + printf("%10s PKernel time = %f, PArray Composition time %f, PReduce time: %f sec\n", "", + pr_timing.kernel,pr_timing.compose,pr_timing.reduce); + #ifdef USE_FFTW + printf("%14s time : %f sec\n", "PFFTW", pr_timing.fftw); + printf("%14s time : %f sec\n", "PPhase", pr_timing.phase); + #endif + printf("%14s time : %f sec\n", "PTOTAL", pr_timing.total); + } + } + + if (global_rank == 0) + { + file.pFile = fopen (out.timingfile,"w"); + if (param.num_threads == 1) + { + fprintf(file.pFile, "%f %f %f %f %f %f %f\n", + wt_timing.setup, wt_timing.kernel, wt_timing.compose, + wt_timing.reduce,wt_timing.fftw,wt_timing.phase, wt_timing.total); + } else { + fprintf(file.pFile, "%f %f %f %f %f %f %f\n", + pr_timing.setup, pr_timing.kernel, pr_timing.compose, + pr_timing.reduce,pr_timing.fftw,pr_timing.phase, pr_timing.total); + } + fclose(file.pFile); + } + + #ifdef USE_MPI + MPI_Win_fence(0,slabwin); + MPI_Win_free(&slabwin); + #endif }
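
Note for reviewers: the self-contained sketch below shows, outside of the imaging code, the double bookkeeping that this patch introduces in allvars.h — one timing_t accumulator filled from CLOCK_REALTIME (wall-clock) and one from CLOCK_PROCESS_CPUTIME_ID (process CPU). It is an illustration of the pattern, not code from the repository: timing_t is trimmed to three fields, busy_work() is a made-up stand-in for wstack(), the file name and build command are hypothetical, and the statement-expression macros are a GCC/Clang extension exactly as in the patch.

/* timing_sketch.c -- minimal, single-process illustration of the
 * TAKE_TIME_START/STOP and TAKE_TIME/ADD_TIME pattern added by this patch.
 * Hypothetical build: cc -O2 timing_sketch.c -o timing_sketch (POSIX clock_gettime assumed). */
#include <stdio.h>
#include <time.h>

typedef struct { double process, kernel, total; } timing_t;

static timing_t wt_timing = {0};   /* wall-clock accumulators  */
static timing_t pr_timing = {0};   /* process-CPU accumulators */

#define CPU_TIME_wt ({ struct timespec ts; clock_gettime(CLOCK_REALTIME, &ts); \
                       (double)ts.tv_sec + (double)ts.tv_nsec * 1e-9; })
#define CPU_TIME_pr ({ struct timespec ts; clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); \
                       (double)ts.tv_sec + (double)ts.tv_nsec * 1e-9; })

#define TAKE_TIME_START(T) { wt_timing.T = CPU_TIME_wt; pr_timing.T = CPU_TIME_pr; }
#define TAKE_TIME_STOP(T)  { pr_timing.T = CPU_TIME_pr - pr_timing.T; \
                             wt_timing.T = CPU_TIME_wt - wt_timing.T; }
#define TAKE_TIME(Twt, Tpr)   { (Twt) = CPU_TIME_wt; (Tpr) = CPU_TIME_pr; }
#define ADD_TIME(T, Twt, Tpr) { pr_timing.T += CPU_TIME_pr - (Tpr); \
                                wt_timing.T += CPU_TIME_wt - (Twt); \
                                (Tpr) = CPU_TIME_pr; (Twt) = CPU_TIME_wt; }

/* made-up workload standing in for a gridding kernel call */
static double busy_work(long n)
{
    double s = 0.0;
    for (long i = 1; i <= n; i++) s += 1.0 / (double)i;
    return s;
}

int main(void)
{
    double sink = 0.0, twt, tpr;

    TAKE_TIME_START(total);
    TAKE_TIME_START(process);

    for (int sector = 0; sector < 4; sector++)   /* stands in for the sector loop */
    {
        TAKE_TIME(twt, tpr);
        sink += busy_work(20000000);
        ADD_TIME(kernel, twt, tpr);              /* accumulate per-sector kernel time */
    }

    TAKE_TIME_STOP(process);
    TAKE_TIME_STOP(total);

    printf("checksum %g\n", sink);
    printf("process: wall %.3f s, cpu %.3f s\n", wt_timing.process, pr_timing.process);
    printf("kernel : wall %.3f s, cpu %.3f s\n", wt_timing.kernel,  pr_timing.kernel);
    printf("total  : wall %.3f s, cpu %.3f s\n", wt_timing.total,   pr_timing.total);
    return 0;
}

When threads do the work, the two columns diverge (accumulated CPU time exceeds wall time), which is why write_result() prints the pr_timing block only when param.num_threads > 1.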
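The reworked reduce() in gridding.c keeps the log2 pairing of the previous version: at level l, every task whose intra-host rank is a multiple of 2^(l+1) accumulates the slab of the task 2^l above it, after the target rank has been swapped into position 0. The toy program below runs that pairing on plain arrays in one process so the indexing is easy to check; the shared-memory windows, the busy-wait on the control window and the rank/window swap of the real code are deliberately left out, and NTASKS/NELEM are arbitrary illustration values.

/* reduce_sketch.c -- toy version of the intra-host log2 reduction pattern. */
#include <stdio.h>

#define NTASKS 8    /* "tasks" on the host (power of two here) */
#define NELEM  4    /* elements per slab                       */

int main(void)
{
    double buf[NTASKS][NELEM];

    /* each "task" owns a slab; task r holds the value r+1 everywhere */
    for (int r = 0; r < NTASKS; r++)
        for (int i = 0; i < NELEM; i++)
            buf[r][i] = (double)(r + 1);

    int max_level = 0;                        /* ceil(log2(NTASKS)) levels */
    while ((1 << max_level) < NTASKS)
        max_level++;

    for (int l = 0; l < max_level; l++)
        for (int rank = 0; rank < NTASKS; rank++)
            if (rank % (1 << (l + 1)) == 0)   /* this task keeps accumulating */
            {
                int source = rank + (1 << l); /* partner one step above       */
                if (source < NTASKS)
                    for (int i = 0; i < NELEM; i++)
                        buf[rank][i] += buf[source][i];
            }

    /* after the last level, slot 0 holds the sum over all tasks */
    printf("reduced value %g, expected %g\n",
           buf[0][0], (double)NTASKS * (NTASKS + 1) / 2.0);
    return 0;
}

In the patched reduce() the same condition local_rank % (1 << (1+l)) == 0 selects the accumulating task and source = local_rank + (1<<l) selects its partner; the additions in this patch are the dprintf trace lines and the swap (and swap-back) of the swins/scwins window descriptors when the target rank is not already local rank 0.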