diff --git a/w-stacking-fftw-cfitsio.c b/w-stacking-fftw-cfitsio.c
new file mode 100644
index 0000000000000000000000000000000000000000..ebba5b31a422598da4d7b913365fe43c2702c017
--- /dev/null
+++ b/w-stacking-fftw-cfitsio.c
@@ -0,0 +1,1089 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifdef FITSIO
+#include "/m100/home/userexternal/ederubei/cfitsio-3.49/fitsio.h"
+#endif
+#ifdef USE_MPI
+#include <mpi.h>
+#ifdef USE_FFTW
+#include <fftw3-mpi.h>
+#endif
+#endif
+#include <omp.h>
+#include <math.h>
+#include <time.h>
+#include <unistd.h>
+#ifdef ACCOMP
+#include "w-stacking_omp.h"
+#else
+#include "w-stacking.h"
+#endif
+#define PI 3.14159265359
+#define NUM_OF_SECTORS -1
+#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
+#define MAX(X, Y) (((X) > (Y)) ? (X) : (Y))
+#define NOVERBOSE
+#define NFILES 100
+#define FILENAMELENGTH 30
+
+// Linked-list set-up
+struct sectorlist {
+   long index;
+   struct sectorlist * next;
+};
+
+// Prepend a measurement index to a sector's linked list
+void Push(struct sectorlist** headRef, long data) {
+   struct sectorlist* newNode = malloc(sizeof(struct sectorlist));
+   newNode->index = data;
+   newNode->next = *headRef;
+   *headRef = newNode;
+}
+
+// Main code
+int main(int argc, char * argv[])
+{
+   // Main MPI parameters
+   int rank;
+   int size;
+
+   // Define main filenames
+   FILE * pFile;
+   FILE * pFile1;
+   FILE * pFilereal;
+   FILE * pFileimg;
+   // Global filename to be composed
+   char filename[1000];
+
+   // MS paths
+   char datapath[900];
+   char datapath_multi[NFILES][900];
+   char ufile[FILENAMELENGTH] = "ucoord.bin";
+   char vfile[FILENAMELENGTH] = "vcoord.bin";
+   char wfile[FILENAMELENGTH] = "wcoord.bin";
+   char weightsfile[FILENAMELENGTH] = "weights.bin";
+   char visrealfile[FILENAMELENGTH] = "visibilities_real.bin";
+   char visimgfile[FILENAMELENGTH] = "visibilities_img.bin";
+   char metafile[FILENAMELENGTH] = "meta.txt";
+
+   // Mesh related files
+   char outfile[FILENAMELENGTH] = "grid.txt";
+   char outfile1[FILENAMELENGTH] = "coords.txt";
+   char outfile2[FILENAMELENGTH] = "grid_real.bin";
+   char outfile3[FILENAMELENGTH] = "grid_img.bin";
+   char fftfile[FILENAMELENGTH] = "fft.txt";
+   char fftfile2[FILENAMELENGTH] = "fft_real.bin";
+   char fftfile3[FILENAMELENGTH] = "fft_img.bin";
+   char logfile[FILENAMELENGTH] = "run.log";
+   char extension[FILENAMELENGTH] = ".txt";
+   char srank[4];
+   char timingfile[FILENAMELENGTH] = "timings.dat";
+
+   // Visibility related variables
+   double * uu;
+   double * vv;
+   double * ww;
+   float * weights;
+   float * visreal;
+   float * visimg;
+
+   long Nmeasures,Nmeasures0;
+   long Nvis,Nvis0;
+   long Nweights,Nweights0;
+   long freq_per_chan,freq_per_chan0;
+   long polarisations,polarisations0;
+   long Ntimes,Ntimes0;
+   double dt,dt0;
+   double thours,thours0;
+   long baselines,baselines0;
+   double uvmin,uvmin0;
+   double uvmax,uvmax0;
+   double wmin,wmin0;
+   double wmax,wmax0;
+   double resolution;
+
+   // Mesh related parameters: global size
+   int grid_size_x = 2048;
+   int grid_size_y = 2048;
+   // Split mesh size (auto-calculated)
+   int local_grid_size_x;
+   int local_grid_size_y;
+   int xaxis;
+   int yaxis;
+
+   // Number of planes in the w direction
+   int num_w_planes = 8;
+
+   // Size of the convolutional kernel support
+   int w_support = 7;
+
+   // Number of OpenMP threads (input parameter)
+   int num_threads;
+
+   // Resolution
+   double dx = 0.5/(double)grid_size_x;
+   double dw = 0.5/(double)num_w_planes;
+   // Half support size
+   double w_supporth = (double)((w_support-1)/2)*dx;
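+
+// Note on the gridding geometry (an informal sketch based only on the
+// definitions above): one uv cell is dx = 0.5/grid_size_x wide and one
+// w-plane is dw = 0.5/num_w_planes deep. With w_support = 7 the kernel
+// reaches (w_support-1)/2 = 3 cells on either side of a gridded point,
+// i.e. w_supporth = 3*dx in uv units; for grid_size_x = 2048 that is
+// dx ~ 2.44e-4 and w_supporth ~ 7.3e-4.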
+
+   // Initialize FITS image parameters
+   fitsfile *fptreal;
+   fitsfile *fptrimg;
+   int status;
+   long nelements;
+// long fpixel, lpixel;
+   char testfitsreal[FILENAMELENGTH] = "parallel_np2_real.fits";
+   char testfitsimag[FILENAMELENGTH] = "parallel_np2_img.fits";
+
+   long naxis = 2;
+   long naxes[2] = { grid_size_x, grid_size_y };
+
+   nelements = naxes[0] * naxes[1];
+
+   // Internal profiling parameters
+   clock_t start, end, start0, startk, endk;
+   double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, phase_time;
+   double setup_time1, process_time1, mpi_time1, fftw_time1, tot_time1, kernel_time1, reduce_time1, compose_time1, phase_time1;
+   double writetime, writetime1;
+   struct timespec begin, finish, begin0, begink, finishk;
+   double elapsed;
+
+   // Number of sectors in which the mesh is divided along the v direction.
+   // If nsectors < 0, nsectors = number of MPI ranks
+   long nsectors = NUM_OF_SECTORS;
+
+   /* Get the number of threads, exit if not given */
+   if (argc == 1) {
+      fprintf(stderr, "Usage: %s number_of_OMP_Threads\n", argv[0]);
+      exit(1);
+   }
+   // Set the number of OpenMP threads
+   num_threads = atoi(argv[1]);
+
+   // A thread count <= 0 is not acceptable
+   if (num_threads <= 0) {
+      fprintf(stderr, "Wrong parameter: %s\n\n", argv[1]);
+      fprintf(stderr, "Usage: %s number_of_OMP_Threads\n", argv[0]);
+      exit(1);
+   }
+
+   // Internal profiling
+   clock_gettime(CLOCK_MONOTONIC, &begin0);
+   start0 = clock();
+
+#ifdef USE_MPI
+   // Initialize the MPI environment
+   MPI_Init(&argc,&argv);
+   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+   MPI_Comm_size(MPI_COMM_WORLD, &size);
+   if(rank == 0)printf("Running with %d MPI tasks\n",size);
+#ifdef USE_FFTW
+   // Initialize the FFTW environment
+   fftw_mpi_init();
+#endif
+#else
+   // If MPI is not used the two parameters are set for consistency
+   rank = 0;
+   size = 1;
+#endif
+
+   if(rank == 0)printf("Running with %d threads\n",num_threads);
exit\n"); + exit(255); + } + printf("Number of available GPUs %d\n", omp_get_num_devices()); + #ifdef NVIDIA + prtAccelInfo(); + #endif + } +#endif + + // set the local size of the image + local_grid_size_x = grid_size_x; + if (nsectors < 0) nsectors = size; + local_grid_size_y = grid_size_y/nsectors; + //nsectors = size; + + // LOCAL grid size + xaxis = local_grid_size_x; + yaxis = local_grid_size_y; + clock_gettime(CLOCK_MONOTONIC, &begin); + start = clock(); + + // INPUT FILES (only the first ndatasets entries are used) + int ndatasets = 1; + strcpy(datapath_multi[0],"/m100_scratch/userexternal/ederubei/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/"); + //strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/newgauss4_t201806301100_SBL180.binMS/"); + //strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/"); + //strcpy(datapath_multi[1],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_134MHz.pre-cal.binMS/"); + + strcpy(datapath,datapath_multi[0]); + // Read metadata + strcpy(filename,datapath); + strcat(filename,metafile); + pFile = fopen (filename,"r"); + fscanf(pFile,"%ld",&Nmeasures); + fscanf(pFile,"%ld",&Nvis); + fscanf(pFile,"%ld",&freq_per_chan); + fscanf(pFile,"%ld",&polarisations); + fscanf(pFile,"%ld",&Ntimes); + fscanf(pFile,"%lf",&dt); + fscanf(pFile,"%lf",&thours); + fscanf(pFile,"%ld",&baselines); + fscanf(pFile,"%lf",&uvmin); + fscanf(pFile,"%lf",&uvmax); + fscanf(pFile,"%lf",&wmin); + fscanf(pFile,"%lf",&wmax); + fclose(pFile); + + + // WATCH THIS!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + int nsub = 1000; + //int nsub = 10; + printf("Subtracting last %d measurements\n",nsub); + Nmeasures = Nmeasures-nsub; + Nvis = Nmeasures*freq_per_chan*polarisations; + + // calculate the coordinates of the center + double uvshift = uvmin/(uvmax-uvmin); + //printf("UVSHIFT %f %f %f %f %f\n",uvmin, uvmax, wmin, wmax, uvshift); + + if (rank == 0) + { + printf("N. measurements %ld\n",Nmeasures); + printf("N. visibilities %ld\n",Nvis); + } + + // Set temporary local size of points + long nm_pe = (long)(Nmeasures/size); + long remaining = Nmeasures%size; + + long startrow = rank*nm_pe; + if (rank == size-1)nm_pe = nm_pe+remaining; + + long Nmeasures_tot = Nmeasures; + Nmeasures = nm_pe; + long Nvis_tot = Nvis; + Nvis = Nmeasures*freq_per_chan*polarisations; + Nweights = Nmeasures*polarisations; + +#ifdef VERBOSE + printf("N. measurements on %d %ld\n",rank,Nmeasures); + printf("N. visibilities on %d %ld\n",rank,Nvis); +#endif + + + // DAV: all these arrays can be allocatate statically for the sake of optimization. 
+   // Allocate arrays
+   uu = (double*) calloc(Nmeasures,sizeof(double));
+   vv = (double*) calloc(Nmeasures,sizeof(double));
+   ww = (double*) calloc(Nmeasures,sizeof(double));
+   weights = (float*) calloc(Nweights,sizeof(float));
+   visreal = (float*) calloc(Nvis,sizeof(float));
+   visimg = (float*) calloc(Nvis,sizeof(float));
+
+   if(rank == 0)printf("READING DATA\n");
+   // Read data
+   strcpy(filename,datapath);
+   strcat(filename,ufile);
+   //printf("Reading %s\n",filename);
+
+   pFile = fopen (filename,"rb");
+   fseek (pFile,startrow*sizeof(double),SEEK_SET);
+   fread(uu,Nmeasures*sizeof(double),1,pFile);
+   fclose(pFile);
+
+   strcpy(filename,datapath);
+   strcat(filename,vfile);
+   //printf("Reading %s\n",filename);
+
+   pFile = fopen (filename,"rb");
+   fseek (pFile,startrow*sizeof(double),SEEK_SET);
+   fread(vv,Nmeasures*sizeof(double),1,pFile);
+   fclose(pFile);
+
+   strcpy(filename,datapath);
+   strcat(filename,wfile);
+   //printf("Reading %s\n",filename);
+
+   pFile = fopen (filename,"rb");
+   fseek (pFile,startrow*sizeof(double),SEEK_SET);
+   fread(ww,Nmeasures*sizeof(double),1,pFile);
+   fclose(pFile);
+
+#ifdef USE_MPI
+   MPI_Barrier(MPI_COMM_WORLD);
+#endif
+
+   clock_gettime(CLOCK_MONOTONIC, &finish);
+   end = clock();
+   setup_time = ((double) (end - start)) / CLOCKS_PER_SEC;
+   setup_time1 = (finish.tv_sec - begin.tv_sec);
+   setup_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
+
+   if(rank == 0)printf("GRIDDING DATA\n");
+
+   // Create histograms and linked lists
+   clock_gettime(CLOCK_MONOTONIC, &begin);
+   start = clock();
+
+   // Initialize the linked lists (one per sector, plus one for the overflow slab)
+   struct sectorlist ** sectorhead;
+   sectorhead = (struct sectorlist **) malloc((nsectors+1) * sizeof(struct sectorlist *));
+   for (int isec=0; isec<=nsectors; isec++)
+   {
+      sectorhead[isec] = malloc(sizeof(struct sectorlist));
+      sectorhead[isec]->index = -1;
+      sectorhead[isec]->next = NULL;
+   }
+
+   long * histo_send = (long*) calloc(nsectors+1,sizeof(long));
+   int * boundary = (int*) calloc(Nmeasures,sizeof(int));
+   double uuh,vvh;
+   for (long iphi = 0; iphi < Nmeasures; iphi++)
+   {
+      boundary[iphi] = -1;
+      uuh = uu[iphi];
+      vvh = vv[iphi];
+      int binphi = (int)(vvh*nsectors);
+      // Check whether the point also influences the neighbouring slabs
+      double updist = (double)((binphi+1)*yaxis)*dx - vvh;
+      double downdist = vvh - (double)(binphi*yaxis)*dx;
+
+      histo_send[binphi]++;
+      Push(&sectorhead[binphi],iphi);
+      if(updist < w_supporth && updist >= 0.0) {histo_send[binphi+1]++; boundary[iphi] = binphi+1; Push(&sectorhead[binphi+1],iphi);};
+      if(downdist < w_supporth && binphi > 0 && downdist >= 0.0) {histo_send[binphi-1]++; boundary[iphi] = binphi-1; Push(&sectorhead[binphi-1],iphi);};
+   }
+
+#ifdef PIPPO
+   struct sectorlist * current;
+   long iiii = 0;
+   for (int j=0; j<nsectors; j++)
+   {
+      current = sectorhead[j];
+      iiii = 0;
+      while (current->index != -1)
+      {
+         printf("%d %d %ld %ld %ld\n",rank,j,iiii,histo_send[j],current->index);
+         current = current->next;
+         iiii++;
+      }
+   }
+#endif
+
+#ifdef VERBOSE
+   for (int iii=0; iii<nsectors+1; iii++)printf("HISTO %d %d %ld\n",rank, iii, histo_send[iii]);
+#endif
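+
+   // Sector bookkeeping sketch: each measurement is pushed onto the linked
+   // list of the v-slab (sector) it falls in; points closer than w_supporth
+   // to a slab edge are also pushed onto the neighbouring slab's list, so
+   // the convolution kernel can later be applied without any halo exchange.
+   // histo_send[isec] therefore counts list entries, duplicates included.
+   // Note that boundary[] is currently consumed only by the commented-out
+   // filter in the sector-selection loop below.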
+
+   // Create the sector grid
+   double * gridss;
+   double * gridss_w;
+   double * gridss_real;
+   double * gridss_img;
+   double * grid;
+   long size_of_grid;
+   size_of_grid = 2*num_w_planes*xaxis*yaxis;
+   gridss = (double*) calloc(size_of_grid,sizeof(double));
+   gridss_w = (double*) calloc(size_of_grid,sizeof(double));
+   gridss_real = (double*) calloc(size_of_grid/2,sizeof(double));
+   gridss_img = (double*) calloc(size_of_grid/2,sizeof(double));
+   // Create the destination slab
+   grid = (double*) calloc(size_of_grid,sizeof(double));
+   // Create a temporary global grid
+#ifndef USE_MPI
+   double * gridtot = (double*) calloc(2*grid_size_x*grid_size_y*num_w_planes,sizeof(double));
+#endif
+   double shift = (double)(dx*yaxis);
+   // Open the MPI memory window for the slab
+#ifdef USE_MPI
+   MPI_Win slabwin;
+   MPI_Win_create(grid, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin);
+   MPI_Win_fence(0,slabwin);
+#endif
+#ifndef USE_MPI
+   pFile1 = fopen (outfile1,"w");
+#endif
+
+   // Timers for the loop over files
+   kernel_time = 0.0;
+   kernel_time1 = 0.0;
+   reduce_time = 0.0;
+   reduce_time1 = 0.0;
+   compose_time = 0.0;
+   compose_time1 = 0.0;
+
+   // MAIN LOOP OVER FILES
+   for (int ifiles=0; ifiles<ndatasets; ifiles++)
+   {
+      strcpy(filename,datapath_multi[ifiles]);
+      printf("Processing %s, %d of %d\n",filename,ifiles+1,ndatasets);
+
+      // Read metadata
+      strcpy(filename,datapath);
+      strcat(filename,metafile);
+      pFile = fopen (filename,"r");
+      fscanf(pFile,"%ld",&Nmeasures0);
+      fscanf(pFile,"%ld",&Nvis0);
+      fscanf(pFile,"%ld",&freq_per_chan0);
+      fscanf(pFile,"%ld",&polarisations0);
+      fscanf(pFile,"%ld",&Ntimes0);
+      fscanf(pFile,"%lf",&dt0);
+      fscanf(pFile,"%lf",&thours0);
+      fscanf(pFile,"%ld",&baselines0);
+      fscanf(pFile,"%lf",&uvmin);
+      fscanf(pFile,"%lf",&uvmax);
+      fscanf(pFile,"%lf",&wmin0);
+      fscanf(pFile,"%lf",&wmax0);
+      fclose(pFile);
+
+      // Calculate the resolution in radians (fabs, not abs: the extrema are doubles)
+      resolution = 1.0/MAX(fabs(uvmin),fabs(uvmax));
+      // Calculate the resolution in arcsec
+      double resolution_asec = (3600.0*180.0)/MAX(fabs(uvmin),fabs(uvmax))/PI;
+      printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec);
+
+      strcpy(filename,datapath);
+      strcat(filename,weightsfile);
+      pFile = fopen (filename,"rb");
+      fseek (pFile,startrow*polarisations*sizeof(float),SEEK_SET);
+      fread(weights,(Nweights)*sizeof(float),1,pFile);
+      fclose(pFile);
+
+      strcpy(filename,datapath);
+      strcat(filename,visrealfile);
+
+      pFile = fopen (filename,"rb");
+      fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET);
+      fread(visreal,Nvis*sizeof(float),1,pFile);
+      fclose(pFile);
+
+      strcpy(filename,datapath);
+      strcat(filename,visimgfile);
+#ifdef VERBOSE
+      printf("Reading %s\n",filename);
+#endif
+      pFile = fopen (filename,"rb");
+      fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET);
+      fread(visimg,Nvis*sizeof(float),1,pFile);
+      fclose(pFile);
+
+#ifdef USE_MPI
+      MPI_Barrier(MPI_COMM_WORLD);
+#endif
+      // Declare temporary arrays for the masking
+      double * uus;
+      double * vvs;
+      double * wws;
+      float * visreals;
+      float * visimgs;
+      float * weightss;
+      long isector;
+
+      for (long isector_count=0; isector_count<nsectors; isector_count++)
+      {
+         clock_gettime(CLOCK_MONOTONIC, &begink);
+         startk = clock();
+         // Define the local destination sector
+         //isector = (isector_count+rank)%size;
+         isector = isector_count;
+         // Allocate sector arrays
+         long Nsec = histo_send[isector];
+         uus = (double*) malloc(Nsec*sizeof(double));
+         vvs = (double*) malloc(Nsec*sizeof(double));
+         wws = (double*) malloc(Nsec*sizeof(double));
+         long Nweightss = Nsec*polarisations;
+         long Nvissec = Nweightss*freq_per_chan;
+         weightss = (float*) malloc(Nweightss*sizeof(float));
+         visreals = (float*) malloc(Nvissec*sizeof(float));
+         visimgs = (float*) malloc(Nvissec*sizeof(float));
+
+         // Select the data for this sector
+         long icount = 0;
+         long ip = 0;
+         long inu = 0;
+         struct sectorlist * current;
+         current = sectorhead[isector];
+
+         while (current->index != -1)
+         {
+            long ilocal = current->index;
+            //double vvh = vv[ilocal];
+            //int binphi = (int)(vvh*nsectors);
+            //if (binphi == isector || boundary[ilocal] == isector) {
+            uus[icount] = uu[ilocal];
+            vvs[icount] = vv[ilocal]-isector*shift;
+            wws[icount] = ww[ilocal];
+            for (long ipol=0; ipol<polarisations; ipol++)
+            {
+               weightss[ip] = weights[ilocal*polarisations+ipol];
+               ip++;
+            }
+            for (long ifreq=0; ifreq<polarisations*freq_per_chan; ifreq++)
+            {
+               visreals[inu] = visreal[ilocal*polarisations*freq_per_chan+ifreq];
+               visimgs[inu] = visimg[ilocal*polarisations*freq_per_chan+ifreq];
+               //if(visimgs[inu]>1e10 || visimgs[inu]<-1e10)printf("%f %f %ld %ld %d %ld %ld\n",visreals[inu],visimgs[inu],inu,Nvissec,rank,ilocal*polarisations*freq_per_chan+ifreq,Nvis);
+               inu++;
+            }
+            icount++;
+            //}
+            current = current->next;
+         }
+
+         clock_gettime(CLOCK_MONOTONIC, &finishk);
+         endk = clock();
+         compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
+         compose_time1 += (finishk.tv_sec - begink.tv_sec);
+         compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
+
+#ifndef USE_MPI
+         double uumin = 1e20;
+         double vvmin = 1e20;
+         double uumax = -1e20;
+         double vvmax = -1e20;
+
+         for (long ipart=0; ipart<Nsec; ipart++)
+         {
+            uumin = MIN(uumin,uus[ipart]);
+            uumax = MAX(uumax,uus[ipart]);
+            vvmin = MIN(vvmin,vvs[ipart]);
+            vvmax = MAX(vvmax,vvs[ipart]);
+
+            // Sample the coordinates into the coords file (pFile1: pFile is closed at this point)
+            if(ipart%10 == 0)fprintf (pFile1, "%ld %f %f %f\n",isector,uus[ipart],vvs[ipart]+isector*shift,wws[ipart]);
+         }
+
+         printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax);
+#endif
+
+         // Convolve this sector's data onto the grid
+#ifdef VERBOSE
+         printf("Processing sector %ld\n",isector);
+#endif
+         clock_gettime(CLOCK_MONOTONIC, &begink);
+         startk = clock();
+
+         wstack(num_w_planes,
+                Nsec,
+                freq_per_chan,
+                polarisations,
+                uus,
+                vvs,
+                wws,
+                visreals,
+                visimgs,
+                weightss,
+                dx,
+                dw,
+                w_support,
+                xaxis,
+                yaxis,
+                gridss,
+                num_threads);
+
+/*       int z = 0;
+#pragma omp target map(to:test_i_gpu) map(from:z)
+         {
+            int x; // only accessible from the accelerator
+            x = 2;
+            z = x + test_i_gpu;
+         } */
+
+         clock_gettime(CLOCK_MONOTONIC, &finishk);
+         endk = clock();
+         kernel_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
+         kernel_time1 += (finishk.tv_sec - begink.tv_sec);
+         kernel_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
+#ifdef VERBOSE
+         printf("Processed sector %ld\n",isector);
+#endif
+         clock_gettime(CLOCK_MONOTONIC, &begink);
+         startk = clock();
+
+         //for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++)printf("--> %f\n",gridss[iii]);
+
+#ifndef USE_MPI
+         long stride = isector*2*xaxis*yaxis*num_w_planes;
+         for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++)gridtot[stride+iii] = gridss[iii];
+#endif
+
+         // Write the grid into the corresponding remote slab
+#ifdef USE_MPI
+         int target_rank = (int)isector;
+         //int target_rank = (int)(size-isector-1);
+#ifdef ONE_SIDE
+         printf("One Side communication active\n");
+         MPI_Win_lock(MPI_LOCK_SHARED,target_rank,0,slabwin);
+         MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin);
+         MPI_Win_unlock(target_rank,slabwin);
+         //MPI_Put(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,slabwin);
+#else
+         MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD);
+#endif //ONE_SIDE
+#endif //USE_MPI
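+
+         // Reduction note: both paths above accumulate this rank's sector
+         // grid into the slab owned by target_rank == isector. With ONE_SIDE
+         // defined, a passive-target MPI_Accumulate(MPI_SUM) sums gridss
+         // directly into the exposed window; otherwise a collective
+         // MPI_Reduce(MPI_SUM) rooted at the sector owner is used. The two
+         // are interchangeable here; the one-sided path is kept mainly for
+         // experimentation.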
+
+         clock_gettime(CLOCK_MONOTONIC, &finishk);
+         endk = clock();
+         reduce_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
+         reduce_time1 += (finishk.tv_sec - begink.tv_sec);
+         reduce_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
+         // Reset the sector grid before moving to the next sector
+         for (long inull=0; inull<2*num_w_planes*xaxis*yaxis; inull++)gridss[inull] = 0.0;
+
+         // Deallocate all the sector arrays
+         free(uus);
+         free(vvs);
+         free(wws);
+         free(weightss);
+         free(visreals);
+         free(visimgs);
+      // End of the loop over sectors
+      }
+   // End of the loop over input files
+   }
+
+   // Finalize MPI communication
+#ifdef USE_MPI
+   MPI_Win_fence(0,slabwin);
+#endif
+
+   // Swap left and right parts: APPARENTLY NOT NECESSARY
+   /*
+   for (long kswap=0; kswap<num_w_planes; kswap++)
+    for (long jswap=0; jswap<yaxis; jswap++)
+     for (long iswap=0; iswap<xaxis; iswap++)
+     {
+        long index_origin = 2*(iswap + jswap*xaxis + kswap*yaxis*xaxis);
+        gridss[index_origin] = grid[index_origin];
+        gridss[index_origin+1] = grid[index_origin+1];
+     }
+   for (long kswap=0; kswap<num_w_planes; kswap++)
+    for (long jswap=0; jswap<yaxis; jswap++)
+     for (long iswap=0; iswap<xaxis/2; iswap++)
+     {
+        long index_origin = 2*(iswap + jswap*xaxis + kswap*yaxis*xaxis);
+        long index_destination = 2*(iswap+xaxis/2 + jswap*xaxis + kswap*yaxis*xaxis);
+        grid[index_destination] = gridss[index_origin];
+        grid[index_destination+1] = gridss[index_origin+1];
+     }
+   for (long kswap=0; kswap<num_w_planes; kswap++)
+    for (long jswap=0; jswap<yaxis; jswap++)
+     for (long iswap=xaxis/2; iswap<xaxis; iswap++)
+     {
+        long index_origin = 2*(iswap + jswap*xaxis + kswap*yaxis*xaxis);
+        long index_destination = 2*(iswap-xaxis/2 + jswap*xaxis + kswap*yaxis*xaxis);
+        grid[index_destination] = gridss[index_origin];
+        grid[index_destination+1] = gridss[index_origin+1];
+     }
+   */
+
+#ifndef USE_MPI
+   fclose(pFile1);
+#endif
+#ifdef USE_MPI
+   MPI_Barrier(MPI_COMM_WORLD);
+#endif
+
+   end = clock();
+   clock_gettime(CLOCK_MONOTONIC, &finish);
+   process_time = ((double) (end - start)) / CLOCKS_PER_SEC;
+   process_time1 = (finish.tv_sec - begin.tv_sec);
+   process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
+   clock_gettime(CLOCK_MONOTONIC, &begin);
+
+#ifdef WRITE_DATA
+   // Write the gridded results
+   if (rank == 0)
+   {
+      printf("WRITING GRIDDED DATA\n");
+      pFilereal = fopen (outfile2,"wb");
+      pFileimg = fopen (outfile3,"wb");
+#ifdef USE_MPI
+      for (int isector=0; isector<nsectors; isector++)
+      {
+         MPI_Win_lock(MPI_LOCK_SHARED,isector,0,slabwin);
+         MPI_Get(gridss,size_of_grid,MPI_DOUBLE,isector,0,size_of_grid,MPI_DOUBLE,slabwin);
+         MPI_Win_unlock(isector,slabwin);
+         for (long i=0; i<size_of_grid/2; i++)
+         {
+            gridss_real[i] = gridss[2*i];
+            gridss_img[i] = gridss[2*i+1];
+         }
+         if (num_w_planes > 1)
+         {
+            for (int iw=0; iw<num_w_planes; iw++)
+             for (int iv=0; iv<yaxis; iv++)
+              for (int iu=0; iu<xaxis; iu++)
+              {
+                 long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*grid_size_x*grid_size_y)*sizeof(double);
+                 long index = iu + iv*xaxis + iw*xaxis*yaxis;
+                 fseek(pFilereal, global_index, SEEK_SET);
+                 fwrite(&gridss_real[index], sizeof(double), 1, pFilereal);
+              }
+            for (int iw=0; iw<num_w_planes; iw++)
+             for (int iv=0; iv<yaxis; iv++)
+              for (int iu=0; iu<xaxis; iu++)
+              {
+                 long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*grid_size_x*grid_size_y)*sizeof(double);
+                 long index = iu + iv*xaxis + iw*xaxis*yaxis;
+                 fseek(pFileimg, global_index, SEEK_SET);
+                 fwrite(&gridss_img[index], sizeof(double), 1, pFileimg);
+                 //double v_norm = sqrt(gridss[index]*gridss[index]+gridss[index+1]*gridss[index+1]);
+                 //fprintf (pFile, "%d %d %d %f %f %f\n", iu,isector*yaxis+iv,iw,gridss[index],gridss[index+1],v_norm);
+              }
+         } else {
+            for (int iw=0; iw<num_w_planes; iw++)
+            {
+               long global_index = (xaxis*isector*yaxis + iw*grid_size_x*grid_size_y)*sizeof(double);
+               long index = iw*xaxis*yaxis;
+               fseek(pFilereal, global_index, SEEK_SET);
+               fwrite(&gridss_real[index], sizeof(double), xaxis*yaxis, pFilereal);
+               fseek(pFileimg, global_index, SEEK_SET);
+               fwrite(&gridss_img[index], sizeof(double), xaxis*yaxis, pFileimg);
+            }
+         }
+      }
+#else
+      for (int iw=0; iw<num_w_planes; iw++)
+       for (int iv=0; iv<grid_size_y; iv++)
+        for (int iu=0; iu<grid_size_x; iu++)
+        {
+           long index = 2*(iu + iv*grid_size_x + iw*grid_size_x*grid_size_y);
+           fwrite(&gridtot[index], sizeof(double), 1, pFilereal);
+           fwrite(&gridtot[index+1], sizeof(double), 1, pFileimg);
+           //double v_norm = sqrt(gridtot[index]*gridtot[index]+gridtot[index+1]*gridtot[index+1]);
+           //fprintf (pFile, "%d %d %d %f %f %f\n", iu,iv,iw,gridtot[index],gridtot[index+1],v_norm);
+        }
+#endif
+      fclose(pFilereal);
+      fclose(pFileimg);
+   }
+
+#ifdef USE_MPI
+   MPI_Win_fence(0,slabwin);
+#endif
+#endif //WRITE_DATA
+
+#ifdef USE_FFTW
+   // FFT-transform the data (using distributed FFTW)
+   if(rank == 0)printf("PERFORMING FFT\n");
+   clock_gettime(CLOCK_MONOTONIC, &begin);
+   start = clock();
+   fftw_plan plan;
+   fftw_complex *fftwgrid;
+   ptrdiff_t alloc_local, local_n0, local_0_start;
+   double norm = 1.0/(double)(grid_size_x*grid_size_y);
+
+   // Map the 1D array of complex visibilities to the 2D array required by FFTW (complex[*][2]).
+   // x is the direction of contiguous data and maps to the second parameter;
+   // y is the parallelized direction and corresponds to the first parameter (--> n0).
+   // The FFT is then performed plane by plane along w.
+   alloc_local = fftw_mpi_local_size_2d(grid_size_y, grid_size_x, MPI_COMM_WORLD, &local_n0, &local_0_start);
+   fftwgrid = fftw_alloc_complex(alloc_local);
+   plan = fftw_mpi_plan_dft_2d(grid_size_y, grid_size_x, fftwgrid, fftwgrid, MPI_COMM_WORLD, FFTW_BACKWARD, FFTW_ESTIMATE);
+
+   long fftwindex = 0;
+   long fftwindex2D = 0;
+   for (int iw=0; iw<num_w_planes; iw++)
+   {
+      //printf("FFTing plane %d\n",iw);
+      // Select the w-plane to transform
+      for (int iv=0; iv<yaxis; iv++)
+      {
+         for (int iu=0; iu<xaxis; iu++)
+         {
+            fftwindex2D = iu + iv*xaxis;
+            fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis);
+            fftwgrid[fftwindex2D][0] = grid[fftwindex];
+            fftwgrid[fftwindex2D][1] = grid[fftwindex+1];
+         }
+      }
+
+      // Do the transform for each w-plane
+      fftw_execute(plan);
+
+      // Save the transformed w-plane
+      for (int iv=0; iv<yaxis; iv++)
+      {
+         for (int iu=0; iu<xaxis; iu++)
+         {
+            fftwindex2D = iu + iv*xaxis;
+            fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis);
+            gridss[fftwindex] = norm*fftwgrid[fftwindex2D][0];
+            gridss[fftwindex+1] = norm*fftwgrid[fftwindex2D][1];
+         }
+      }
+   }
+
+   fftw_destroy_plan(plan);
+
+#ifdef USE_MPI
+   MPI_Win_fence(0,slabwin);
+   MPI_Barrier(MPI_COMM_WORLD);
+#endif
+
+   end = clock();
+   clock_gettime(CLOCK_MONOTONIC, &finish);
+   fftw_time = ((double) (end - start)) / CLOCKS_PER_SEC;
+   fftw_time1 = (finish.tv_sec - begin.tv_sec);
+   fftw_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
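+
+   // FFT note (a sketch of the intent, not a change in behaviour): each of
+   // the num_w_planes slabs is copied into fftwgrid, inverse-transformed in
+   // place with the same distributed plan, and scaled by
+   // norm = 1/(grid_size_x*grid_size_y), since FFTW transforms are
+   // unnormalised. This assumes FFTW's slab decomposition matches ours,
+   // i.e. local_n0 == yaxis and local_0_start == rank*yaxis, which holds
+   // when grid_size_y divides evenly among the ranks.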
(fftfile3,"wb"); + #ifdef USE_MPI + for (int isector=0; isector<nsectors; isector++) + { + MPI_Win_lock(MPI_LOCK_SHARED,isector,0,writewin); + MPI_Get(gridss_w,size_of_grid,MPI_DOUBLE,isector,0,size_of_grid,MPI_DOUBLE,writewin); + MPI_Win_unlock(isector,writewin); + for (long i=0; i<size_of_grid/2; i++) + { + gridss_real[i] = gridss_w[2*i]; + gridss_img[i] = gridss_w[2*i+1]; + } + if (num_w_planes > 1) + { + for (int iw=0; iw<num_w_planes; iw++) + for (int iv=0; iv<yaxis; iv++) + for (int iu=0; iu<xaxis; iu++) + { + long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*grid_size_x*grid_size_y)*sizeof(double); + long index = iu + iv*xaxis + iw*xaxis*yaxis; + fseek(pFilereal, global_index, SEEK_SET); + fwrite(&gridss_real[index], 1, sizeof(double), pFilereal); + } + for (int iw=0; iw<num_w_planes; iw++) + for (int iv=0; iv<yaxis; iv++) + for (int iu=0; iu<xaxis; iu++) + { + long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*grid_size_x*grid_size_y)*sizeof(double); + long index = iu + iv*xaxis + iw*xaxis*yaxis; + fseek(pFileimg, global_index, SEEK_SET); + fwrite(&gridss_img[index], 1, sizeof(double), pFileimg); + } + } else { + fwrite(gridss_real, size_of_grid/2, sizeof(double), pFilereal); + fwrite(gridss_img, size_of_grid/2, sizeof(double), pFileimg); + } + + + } + #else + /* + for (int iw=0; iw<num_w_planes; iw++) + for (int iv=0; iv<grid_size_y; iv++) + for (int iu=0; iu<grid_size_x; iu++) + { + int isector = 0; + long index = 2*(iu + iv*grid_size_x + iw*grid_size_x*grid_size_y); + double v_norm = sqrt(gridtot[index]*gridtot[index]+gridtot[index+1]*gridtot[index+1]); + fprintf (pFile, "%d %d %d %f %f %f\n", iu,iv,iw,gridtot[index],gridtot[index+1],v_norm); + } + */ + #endif + fclose(pFilereal); + fclose(pFileimg); + } + + #ifdef USE_MPI + //MPI_Win_fence(0,writewin); + MPI_Win_fence(0,writewin); + MPI_Win_free(&writewin); + MPI_Barrier(MPI_COMM_WORLD); + #endif +#endif //WRITE_DATA + + fftw_free(fftwgrid); + + // Phase correction + clock_gettime(CLOCK_MONOTONIC, &begin); + start = clock(); + if(rank == 0)printf("PHASE CORRECTION\n"); + double* image_real = (double*) calloc(xaxis*yaxis,sizeof(double)); + double* image_imag = (double*) calloc(xaxis*yaxis,sizeof(double)); + + phase_correction(gridss,image_real,image_imag,xaxis,yaxis,num_w_planes,grid_size_x,grid_size_y,resolution,wmin,wmax,num_threads); + + end = clock(); + clock_gettime(CLOCK_MONOTONIC, &finish); + phase_time = ((double) (end - start)) / CLOCKS_PER_SEC; + phase_time1 = (finish.tv_sec - begin.tv_sec); + phase_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; +#ifdef WRITE_IMAGE + + if(rank == 0) + { + #ifdef FITSIO + printf("REMOVING RESIDUAL FITS FILE\n"); + remove(testfitsreal); + remove(testfitsimag); + + + printf("FITS CREATING\n"); + status = 0; + + fits_create_file(&fptrimg, testfitsimag, &status); + fits_create_img(fptrimg, DOUBLE_IMG, naxis, naxes, &status); +// fits_write_img(fptrimg, TDOUBLE, fpixel, nelements, image_imag, &status); + fits_close_file(fptrimg, &status); + + status = 0; + + fits_create_file(&fptreal, testfitsreal, &status); + fits_create_img(fptreal, DOUBLE_IMG, naxis, naxes, &status); +// fits_write_img(fptreal, TDOUBLE, fpixel, nelements, image_real, &status); + fits_close_file(fptreal, &status); + #endif + + pFilereal = fopen (fftfile2, "wb"); + pFileimg = fopen (fftfile3, "wb"); + fclose(pFilereal); + fclose(pFileimg); + } + #ifdef USE_MPI + MPI_Barrier(MPI_COMM_WORLD); + #endif + if(rank == 0)printf("WRITING IMAGE\n"); + for (int isector=0; isector<size; isector++) + { + #ifdef 
+
+#ifdef WRITE_IMAGE
+   if(rank == 0)
+   {
+#ifdef FITSIO
+      printf("REMOVING RESIDUAL FITS FILES\n");
+      remove(testfitsreal);
+      remove(testfitsimag);
+
+      printf("CREATING FITS FILES\n");
+      status = 0;
+
+      fits_create_file(&fptrimg, testfitsimag, &status);
+      fits_create_img(fptrimg, DOUBLE_IMG, naxis, naxes, &status);
+//    fits_write_img(fptrimg, TDOUBLE, fpixel, nelements, image_imag, &status);
+      fits_close_file(fptrimg, &status);
+
+      status = 0;
+
+      fits_create_file(&fptreal, testfitsreal, &status);
+      fits_create_img(fptreal, DOUBLE_IMG, naxis, naxes, &status);
+//    fits_write_img(fptreal, TDOUBLE, fpixel, nelements, image_real, &status);
+      fits_close_file(fptreal, &status);
+#endif
+
+      pFilereal = fopen (fftfile2, "wb");
+      pFileimg = fopen (fftfile3, "wb");
+      fclose(pFilereal);
+      fclose(pFileimg);
+   }
+#ifdef USE_MPI
+   MPI_Barrier(MPI_COMM_WORLD);
+#endif
+   if(rank == 0)printf("WRITING IMAGE\n");
+   // Each rank writes its own slab of the image, one rank at a time
+   for (int isector=0; isector<size; isector++)
+   {
+#ifdef USE_MPI
+      MPI_Barrier(MPI_COMM_WORLD);
+#endif
+      if(isector == rank)
+      //if(rank == 0)
+      {
+#ifdef FITSIO
+         printf("%d writing\n",isector);
+         long * fpixel = (long *) malloc(sizeof(long)*naxis);
+         long * lpixel = (long *) malloc(sizeof(long)*naxis);
+
+         fpixel[0] = 1;
+         fpixel[1] = isector*yaxis+1;
+         lpixel[0] = xaxis;
+         lpixel[1] = (isector+1)*yaxis;
+         //printf("fpixel %ld, %ld\n", fpixel[0], fpixel[1]);
+         //printf("lpixel %ld, %ld\n", lpixel[0], lpixel[1]);
+         status = 0;
+         fits_open_image(&fptreal, testfitsreal, READWRITE, &status);
+         fits_write_subset(fptreal, TDOUBLE, fpixel, lpixel, image_real, &status);
+         fits_close_file(fptreal, &status);
+
+         status = 0;
+         fits_open_image(&fptrimg, testfitsimag, READWRITE, &status);
+         fits_write_subset(fptrimg, TDOUBLE, fpixel, lpixel, image_imag, &status);
+         fits_close_file(fptrimg, &status);
+
+         free(fpixel);
+         free(lpixel);
+#endif
+
+         pFilereal = fopen (fftfile2,"ab");
+         pFileimg = fopen (fftfile3,"ab");
+
+         long global_index = isector*(xaxis*yaxis)*sizeof(double);
+
+         fseek(pFilereal, global_index, SEEK_SET);
+         fwrite(image_real, sizeof(double), xaxis*yaxis, pFilereal);
+         fseek(pFileimg, global_index, SEEK_SET);
+         fwrite(image_imag, sizeof(double), xaxis*yaxis, pFileimg);
+
+         fclose(pFilereal);
+         fclose(pFileimg);
+      }
+   }
+#ifdef USE_MPI
+   MPI_Barrier(MPI_COMM_WORLD);
+#endif
+
+#endif //WRITE_IMAGE
+
+#endif //USE_FFTW
+
+   end = clock();
+   clock_gettime(CLOCK_MONOTONIC, &finish);
+   tot_time = ((double) (end - start0)) / CLOCKS_PER_SEC;
+   tot_time1 = (finish.tv_sec - begin0.tv_sec);
+   tot_time1 += (finish.tv_nsec - begin0.tv_nsec) / 1000000000.0;
+
+   if (rank == 0)
+   {
+      printf("Setup time: %f sec\n",setup_time);
+      printf("Process time: %f sec\n",process_time);
+      printf("Kernel time = %f, Array Composition time %f, Reduce time: %f sec\n",kernel_time,compose_time,reduce_time);
+#ifdef USE_FFTW
+      printf("FFTW time: %f sec\n",fftw_time);
+      printf("Phase time: %f sec\n",phase_time);
+#endif
+      printf("TOT time: %f sec\n",tot_time);
+      if(num_threads > 1)
+      {
+         printf("PSetup time: %f sec\n",setup_time1);
+         printf("PProcess time: %f sec\n",process_time1);
+         printf("PKernel time = %f, PArray Composition time %f, PReduce time: %f sec\n",kernel_time1,compose_time1,reduce_time1);
+#ifdef USE_FFTW
+         printf("PFFTW time: %f sec\n",fftw_time1);
+         printf("PPhase time: %f sec\n",phase_time1);
+#endif
+         printf("PTOT time: %f sec\n",tot_time1);
+      }
+   }
+
+   if (rank == 0)
+   {
+      pFile = fopen (timingfile,"w");
+      if (num_threads == 1)
+      {
+         fprintf(pFile, "%f %f %f %f %f %f %f\n",setup_time,kernel_time,compose_time,reduce_time,fftw_time,phase_time,tot_time);
+      } else {
+         fprintf(pFile, "%f %f %f %f %f %f %f\n",setup_time1,kernel_time1,compose_time1,reduce_time1,fftw_time1,phase_time1,tot_time1);
+      }
+      fclose(pFile);
+   }
+
+   // Close the MPI environment
+#ifdef USE_MPI
+   MPI_Win_fence(0,slabwin);
+   MPI_Win_free(&slabwin);
+   MPI_Finalize();
+#endif
+
+   return 0;
+}