Skip to content
Snippets Groups Projects
Select Git revision
  • 64ea935991cbeb6172f3be5535f2d4689b2a6e1f
  • master default protected
  • offload_trapping
  • script_devel
  • parallel_trapping
  • unify_iterations
  • containers-m10
  • magma_refinement
  • release9
  • enable_svd
  • parallel_angles_gmu
  • containers-m8
  • parallel_angles
  • profile_omp_leonardo
  • test_nvidia_profiler
  • containers
  • shaditest
  • test1
  • main
  • 3-error-in-run-the-program
  • experiment
  • NP_TMcode-M10a.03
  • NP_TMcode-M10a.02
  • NP_TMcode-M10a.01
  • NP_TMcode-M10a.00
  • NP_TMcode-M9.01
  • NP_TMcode-M9.00
  • NP_TMcode-M8.03
  • NP_TMcode-M8.02
  • NP_TMcode-M8.01
  • NP_TMcode-M8.00
  • NP_TMcode-M7.00
  • v0.0
33 results

README.md

Blame
  • To learn more about this project, read the wiki.
    w-stacking-fftw.c 30.79 KiB
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #ifdef USE_MPI
    #include <mpi.h>
    #ifdef USE_FFTW
    #include <fftw3-mpi.h>
    #endif
    #endif
    #include <math.h>
    #include <time.h>
    #include "w-stacking.h"
    #define PI 3.14159265359
    #define NUM_OF_SECTORS -1
    #define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
    #define MAX(X, Y) (((X) > (Y)) ? (X) : (Y))
    #define NOVERBOSE
    #define NFILES 10
    
    // Linked List set-up
    struct sectorlist {
         long index;
         struct sectorlist * next;
    };
    
    void Push(struct sectorlist** headRef, long data) {
         struct sectorlist* newNode = malloc(sizeof(struct sectorlist));
         newNode->index = data;
         newNode->next = *headRef;
         *headRef = newNode; 
    }
    
    // Main Code
    int main(int argc, char * argv[]) 
    {
    	int rank;
    	int size;
    
    	FILE * pFile; 
    	FILE * pFile1;
    	FILE * pFilereal;
    	FILE * pFileimg;
    	char filename[1000];
    	//char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/old/data/gauss2_t201806301100_SBL180.binMS/";
    	//char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/newgauss2noconj_t201806301100_SBL180.binMS/";
    	//char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/gauss1_t201806301100_SBL180.binMS/";
    	//char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/newgauss4_t201806301100_SBL180.binMS/";
    	//char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/gauss4_t201806301100_SBL180.binMS/";
    	//char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/hba-8hrs_t201806301100_SBH255i-test.binMS/";
    	//
    	//char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/hba-8hrs_gauss4new.binMS/";
    	//char datapath[900] = "/m100_scratch/userexternal/cgheller/Lofar/Observations/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/";
    	char datapath[900]; 
    	char datapath_multi[NFILES][900];
    
    	char ufile[30] = "ucoord.bin";
            char vfile[30] = "vcoord.bin";
            char wfile[30] = "wcoord.bin";
            char weightsfile[30] = "weights.bin";
            char visrealfile[30] = "visibilities_real.bin";
            char visimgfile[30] = "visibilities_img.bin";
            char metafile[30] = "meta.txt";
    	char outfile[30] = "grid.txt";
    	char outfile1[30] = "coords.txt";
    	char outfile2[30] = "grid_real.bin";
    	char outfile3[30] = "grid_img.bin";
    	char fftfile[30] = "fft.txt";
    	char fftfile2[30] = "fft_real.bin";
    	char fftfile3[30] = "fft_img.bin";
    	char logfile[30] = "run.log";
    	char extension[30] = ".txt";
    	char srank[4];
    
    	double * uu;
    	double * vv;
    	double * ww;
    	float * weights;
    	float * visreal;
    	float * visimg;
    
    	long Nmeasures;
    	long Nvis;
    	long Nweights;
    	long freq_per_chan;
    	long polarisations;
            long Ntimes;
    	double dt;
    	double thours;
    	long baselines;
    	double uvmin;
    	double uvmax;
    	double wmin;
    	double wmax;
    
            // MESH SIZE
    	int grid_size_x = 2048;
    	int grid_size_y = 2048;
    	int local_grid_size_x;// = 100;
    	int local_grid_size_y;// = 100;
    	int xaxis;
    	int yaxis;
    	int num_w_planes = 1;
    	// DAV: the corresponding KernelLen is calculated within the wstack function. It can be anyway hardcoded for optimization
    	int w_support = 7;
    	int num_threads;// = 4;
    	double dx = 1.0/(double)grid_size_x;
    	double dw = 1.0/(double)num_w_planes;
    	double w_supporth = (double)((w_support-1)/2)*dx;
    
    	clock_t start, end, start0, startk, endk;
    	double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, phase_time;
    	double setup_time1, process_time1, mpi_time1, fftw_time1, tot_time1, kernel_time1, reduce_time1, compose_time1, phase_time1;
    	double writetime, writetime1;
    
    	struct timespec begin, finish, begin0, begink, finishk;
    	double elapsed;
    	long nsectors;
    
    	clock_gettime(CLOCK_MONOTONIC, &begin0);
    	start0 = clock();
            // Set the number of OpenMP threads
            num_threads = atoi(argv[1]);
    
    	// Intialize MPI environment
    #ifdef USE_MPI
    	MPI_Init(&argc,&argv);
    	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    	MPI_Comm_size(MPI_COMM_WORLD, &size);
    	if(rank == 0)printf("Running with %d MPI tasks\n",size);
      #ifdef USE_FFTW
    	fftw_mpi_init();
      #endif
    #else
    	rank = 0;
    	size = 1;
    #endif
            if(rank == 0)printf("Running with %d threads\n",num_threads);
    
    	// set the local size of the image
    	local_grid_size_x = grid_size_x;
    	nsectors = NUM_OF_SECTORS;
    	if (nsectors < 0) nsectors = size;
    	local_grid_size_y = grid_size_y/nsectors;
    	//nsectors = size;
    	
    	// LOCAL grid size
    	xaxis = local_grid_size_x;
    	yaxis = local_grid_size_y;
    
    	clock_gettime(CLOCK_MONOTONIC, &begin);
    	start = clock();
    
    	// INPUT FILES (only the first ndatasets entries are used)
    	int ndatasets = 1;
            strcpy(datapath_multi[0],"data/newgauss2noconj_t201806301100_SBL180.binMS/");
            //strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/newgauss4_t201806301100_SBL180.binMS/");
            //strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/");
            //strcpy(datapath_multi[1],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_134MHz.pre-cal.binMS/");
    
    	strcpy(datapath,datapath_multi[0]);
    	// Read metadata
    	strcpy(filename,datapath);
    	strcat(filename,metafile);
    	pFile = fopen (filename,"r");
            fscanf(pFile,"%ld",&Nmeasures);
            fscanf(pFile,"%ld",&Nvis);
            fscanf(pFile,"%ld",&freq_per_chan);
            fscanf(pFile,"%ld",&polarisations);
            fscanf(pFile,"%ld",&Ntimes);
            fscanf(pFile,"%lf",&dt);
            fscanf(pFile,"%lf",&thours);
            fscanf(pFile,"%ld",&baselines);
            fscanf(pFile,"%lf",&uvmin);
            fscanf(pFile,"%lf",&uvmax);
            fscanf(pFile,"%lf",&wmin);
            fscanf(pFile,"%lf",&wmax);
    	fclose(pFile);
    
    	// WATCH THIS!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    	int nsub = 1000;
    	//int nsub = 10;
    	printf("Subtracting last %d measurements\n",nsub);
            Nmeasures = Nmeasures-nsub;
    	Nvis = Nmeasures*freq_per_chan*polarisations;
    
            // calculate the coordinates of the center
    	double uvshift = uvmin/(uvmax-uvmin);
    	//printf("UVSHIFT %f %f %f %f %f\n",uvmin, uvmax, wmin, wmax, uvshift);
    
    	if (rank == 0)
    	{
    	   printf("N. measurements %ld\n",Nmeasures);
    	   printf("N. visibilities %ld\n",Nvis);
    	}
    
            // Set temporary local size of points
    	long nm_pe = (long)(Nmeasures/size);
    	long remaining = Nmeasures%size;
            
            long startrow = rank*nm_pe;
            if (rank == size-1)nm_pe = nm_pe+remaining;
    
    	long Nmeasures_tot = Nmeasures;
    	Nmeasures = nm_pe;
    	long Nvis_tot = Nvis;
    	Nvis = Nmeasures*freq_per_chan*polarisations;
    	Nweights = Nmeasures*polarisations;
    
            #ifdef VERBOSE
    	printf("N. measurements on %d %ld\n",rank,Nmeasures);
    	printf("N. visibilities on %d %ld\n",rank,Nvis);
            #endif
    
    	// DAV: all these arrays can be allocatate statically for the sake of optimization. However be careful that if MPI is used
    	// all the sizes are rescaled by the number of MPI tasks
    	// Allocate arrays
    	uu = (double*) calloc(Nmeasures,sizeof(double));
    	vv = (double*) calloc(Nmeasures,sizeof(double));
    	ww = (double*) calloc(Nmeasures,sizeof(double));
    	weights = (float*) calloc(Nweights,sizeof(float));
    	visreal = (float*) calloc(Nvis,sizeof(float));
    	visimg = (float*) calloc(Nvis,sizeof(float));
    
            if(rank == 0)printf("READING DATA\n");
    	// Read data
    	strcpy(filename,datapath);
    	strcat(filename,ufile);
    	//printf("Reading %s\n",filename);
    
    	pFile = fopen (filename,"rb");
    	fseek (pFile,startrow*sizeof(double),SEEK_SET);
    	fread(uu,Nmeasures*sizeof(double),1,pFile);
    	fclose(pFile);
    
    	strcpy(filename,datapath);
    	strcat(filename,vfile);
    	//printf("Reading %s\n",filename);
    
    	pFile = fopen (filename,"rb");
    	fseek (pFile,startrow*sizeof(double),SEEK_SET);
    	fread(vv,Nmeasures*sizeof(double),1,pFile);
    	fclose(pFile);
    
    	strcpy(filename,datapath);
    	strcat(filename,wfile);
    	//printf("Reading %s\n",filename);
    
    	pFile = fopen (filename,"rb");
    	fseek (pFile,startrow*sizeof(double),SEEK_SET);
    	fread(ww,Nmeasures*sizeof(double),1,pFile);
    	fclose(pFile);
    
            #ifdef USE_MPI
    	MPI_Barrier(MPI_COMM_WORLD);
            #endif
    
    	clock_gettime(CLOCK_MONOTONIC, &finish);
    	end = clock();
    	setup_time = ((double) (end - start)) / CLOCKS_PER_SEC;
    	setup_time1 = (finish.tv_sec - begin.tv_sec);
    	setup_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
    
            if(rank == 0)printf("GRIDDING DATA\n");
    
    	// Create histograms and linked lists
            clock_gettime(CLOCK_MONOTONIC, &begin);
            start = clock();
     
            //CLAAA
    	// Initialize linked list
    	struct sectorlist ** sectorhead;
    	sectorhead = (struct sectorlist **) malloc((nsectors+1) * sizeof(struct sectorlist));
    	for (int isec=0; isec<=nsectors; isec++)
    	{
    		sectorhead[isec] = malloc(sizeof(struct sectorlist));
    		sectorhead[isec]->index = -1;
    		sectorhead[isec]->next = NULL;
    	}
    
    
    	long * histo_send = (long*) calloc(nsectors+1,sizeof(long));
    	int * boundary = (int*) calloc(Nmeasures,sizeof(int));
    	double uuh,vvh;
    	for (long iphi = 0; iphi < Nmeasures; iphi++)
    	{
               boundary[iphi] = -1;
    	   uuh = uu[iphi];
    	   vvh = vv[iphi];
    	   int binphi = (int)(vvh*nsectors);
    	   // check if the point influence also neighboring slabs
    	   double updist = (double)((binphi+1)*yaxis)*dx - vvh;
    	   double downdist = vvh - (double)(binphi*yaxis)*dx;
    	   //
    	   histo_send[binphi]++;
               Push(&sectorhead[binphi],iphi);
               if(updist < w_supporth && updist >= 0.0) {histo_send[binphi+1]++; boundary[iphi] = binphi+1; Push(&sectorhead[binphi+1],iphi);};
    	   if(downdist < w_supporth && binphi > 0 && downdist >= 0.0) {histo_send[binphi-1]++; boundary[iphi] = binphi-1; Push(&sectorhead[binphi-1],iphi);};
    	}
    
    #ifdef PIPPO
    	struct sectorlist * current;
    	long iiii = 0;
    	for (int j=0; j<nsectors; j++)
    	{
    		current = sectorhead[j];
    		iiii = 0;
    		while (current->index != -1)
    		{
    			printf("%d %d %ld %ld %ld\n",rank,j,iiii,histo_send[j],current->index);
    			current = current->next;
    			iiii++;
    		}
    	}
    #endif
    
            #ifdef VERBOSE
            for (int iii=0; iii<nsectors+1; iii++)printf("HISTO %d %d %ld\n",rank, iii, histo_send[iii]);
            #endif
    
           // Create sector grid
            double * gridss;
            double * gridss_w;
            double * gridss_real;
            double * gridss_img;
    	double * grid;
            long size_of_grid;
            size_of_grid = 2*num_w_planes*xaxis*yaxis;
            gridss = (double*) calloc(size_of_grid,sizeof(double));
            gridss_w = (double*) calloc(size_of_grid,sizeof(double));
            gridss_real = (double*) calloc(size_of_grid/2,sizeof(double));
            gridss_img = (double*) calloc(size_of_grid/2,sizeof(double));
            // Create destination slab
            grid = (double*) calloc(size_of_grid,sizeof(double));
            // Create temporary global grid
            #ifndef USE_MPI
            double * gridtot = (double*) calloc(2*grid_size_x*grid_size_y*num_w_planes,sizeof(double));
            #endif
    
            double shift = (double)(dx*yaxis);
    
            // Open the MPI Memory Window for the slab
            #ifdef USE_MPI
            MPI_Win slabwin;
            MPI_Win_create(grid, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin);
            MPI_Win_fence(0,slabwin);
            #endif
    
            #ifndef USE_MPI
            pFile1 = fopen (outfile1,"w");
            #endif
    
    	// loop over files
    	//
    	kernel_time = 0.0;
    	kernel_time1 = 0.0;
    	reduce_time = 0.0;
    	reduce_time1 = 0.0;
    	compose_time = 0.0;
    	compose_time1 = 0.0;
    	for (int ifiles=0; ifiles<ndatasets; ifiles++)
    	{
    
    	strcpy(filename,datapath_multi[ifiles]);
            printf("Processing %s, %d of %d\n",filename,ifiles+1,ndatasets);
            strcat(filename,weightsfile);
    
            pFile = fopen (filename,"rb");
            fseek (pFile,startrow*polarisations*sizeof(float),SEEK_SET);
            fread(weights,(Nweights)*sizeof(float),1,pFile);
            fclose(pFile);
    
            strcpy(filename,datapath);
            strcat(filename,visrealfile);
            //printf("Reading %s\n",filename);
    
            pFile = fopen (filename,"rb");
            fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET);
            fread(visreal,Nvis*sizeof(float),1,pFile);
            fclose(pFile);
    
            strcpy(filename,datapath);
            strcat(filename,visimgfile);
            //printf("Reading %s\n",filename);
    
            pFile = fopen (filename,"rb");
            fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET);
            fread(visimg,Nvis*sizeof(float),1,pFile);
            fclose(pFile);
    
            #ifdef USE_MPI
            MPI_Barrier(MPI_COMM_WORLD);
            #endif
    
            // Declare temporary arrays for the masking
            double * uus;
            double * vvs;
            double * wws;
            float * visreals;
            float * visimgs;
            float * weightss;
    
    	long isector;
            for (long isector_count=0; isector_count<nsectors; isector_count++)
            {
              clock_gettime(CLOCK_MONOTONIC, &begink);
              startk = clock();
              // define local destination sector
              //isector = (isector_count+rank)%size;
              isector = isector_count;
    	  // allocate sector arrays     
              long Nsec = histo_send[isector];
    	  uus = (double*) malloc(Nsec*sizeof(double));
    	  vvs = (double*) malloc(Nsec*sizeof(double));
    	  wws = (double*) malloc(Nsec*sizeof(double));
    	  long Nweightss = Nsec*polarisations;
    	  long Nvissec = Nweightss*freq_per_chan;
    	  weightss = (float*) malloc(Nweightss*sizeof(float));
    	  visreals = (float*) malloc(Nvissec*sizeof(float));
    	  visimgs = (float*) malloc(Nvissec*sizeof(float));
    
    	  // select data for this sector
              long icount = 0;
    	  long ip = 0;
    	  long inu = 0;
    //CLAAAA
    	  struct sectorlist * current;
    	  current = sectorhead[isector];
    	  while (current->index != -1)
              {
                 long ilocal = current->index;
    	     //double vvh = vv[ilocal];
                 //int binphi = (int)(vvh*nsectors);
    	     //if (binphi == isector || boundary[ilocal] == isector)
    	     //{
    	       uus[icount] = uu[ilocal];
    	       vvs[icount] = vv[ilocal]-isector*shift;
    	       wws[icount] = ww[ilocal];
                   for (long ipol=0; ipol<polarisations; ipol++)
    	       {
    		  weightss[ip] = weights[ilocal*polarisations+ipol];
                      ip++;
    	       }
                   for (long ifreq=0; ifreq<polarisations*freq_per_chan; ifreq++)
    	       {
    	          visreals[inu] = visreal[ilocal*polarisations*freq_per_chan+ifreq];
    	          visimgs[inu] = visimg[ilocal*polarisations*freq_per_chan+ifreq];
    		  //if(visimgs[inu]>1e10 || visimgs[inu]<-1e10)printf("%f %f %ld %ld %d %ld %ld\n",visreals[inu],visimgs[inu],inu,Nvissec,rank,ilocal*polarisations*freq_per_chan+ifreq,Nvis);
    	          inu++;
    	       }
    	       icount++;
    	     //}
    	     current = current->next;
              }
    
    	  clock_gettime(CLOCK_MONOTONIC, &finishk);
    	  endk = clock();
    	  compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
    	  compose_time1 += (finishk.tv_sec - begink.tv_sec);
    	  compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
              #ifndef USE_MPI
    	  double uumin = 1e20;
    	  double vvmin = 1e20;
    	  double uumax = -1e20;
    	  double vvmax = -1e20;
    
              for (long ipart=0; ipart<Nsec; ipart++)
              {
    	       uumin = MIN(uumin,uus[ipart]);
    	       uumax = MAX(uumax,uus[ipart]);
    	       vvmin = MIN(vvmin,vvs[ipart]);
    	       vvmax = MAX(vvmax,vvs[ipart]);
    
    		  
                   if(ipart%10 == 0)fprintf (pFile, "%ld %f %f %f\n",isector,uus[ipart],vvs[ipart]+isector*shift,wws[ipart]);
              }
    
    	  printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax);
              #endif
    
              // Make convolution on the grid
              #ifdef VERBOSE
    	  printf("Processing sector %ld\n",isector);
    	  #endif
              clock_gettime(CLOCK_MONOTONIC, &begink);
              startk = clock();
              wstack(num_w_planes,
                   Nsec,
                   freq_per_chan,
                   polarisations,
                   uus,
                   vvs,
                   wws,
                   visreals,
                   visimgs,
                   weightss,
                   dx,
                   dw,
                   w_support,
    	       xaxis,
    	       yaxis,
                   gridss,
                   num_threads);
    	  clock_gettime(CLOCK_MONOTONIC, &finishk);
    	  endk = clock();
    	  kernel_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
    	  kernel_time1 += (finishk.tv_sec - begink.tv_sec);
    	  kernel_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
              #ifdef VERBOSE
    	  printf("Processed sector %ld\n",isector);
              #endif
              clock_gettime(CLOCK_MONOTONIC, &begink);
              startk = clock();
    
              //for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++)printf("--> %f\n",gridss[iii]);
    
              #ifndef USE_MPI
    	  long stride = isector*2*xaxis*yaxis*num_w_planes;
              for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++)gridtot[stride+iii] = gridss[iii];
    	  #endif
    
    	  // Write grid in the corresponding remote slab
              #ifdef USE_MPI
    	  int target_rank = (int)isector;
    	  //int target_rank = (int)(size-isector-1);
    	  #ifdef ONE_SIDE
    	  printf("One Side communication active\n");
    	  MPI_Win_lock(MPI_LOCK_SHARED,target_rank,0,slabwin);
    	  MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin); 
    	  MPI_Win_unlock(target_rank,slabwin);
    	  //MPI_Put(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,slabwin); 	   
              #else
              MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD);
              #endif //ONE_SIDE
              #endif //USE_MPI
    
    	  clock_gettime(CLOCK_MONOTONIC, &finishk);
    	  endk = clock();
    	  reduce_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
    	  reduce_time1 += (finishk.tv_sec - begink.tv_sec);
    	  reduce_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
              // Go to next sector
    	  for (long inull=0; inull<2*num_w_planes*xaxis*yaxis; inull++)gridss[inull] = 0.0;
    
    	  // Deallocate all sector arrays
    	  free(uus);
    	  free(vvs);
    	  free(wws);
    	  free(weightss);
    	  free(visreals);
    	  free(visimgs);
            // End of loop over sectors
            }
    	// End of loop over input files
    	}
    
    	// Finalize MPI communication
            #ifdef USE_MPI
    	MPI_Win_fence(0,slabwin);
    	#endif
    
            // Swap left and right parts APPARENTLY NOT NECESSARY
    	/*
    	for (long kswap=0; kswap<num_w_planes; kswap++)
    	for (long jswap=0; jswap<yaxis; jswap++)
    	for (long iswap=0; iswap<xaxis; iswap++)
    	{
               long index_origin = 2*(iswap + jswap*xaxis + kswap*yaxis*xaxis);
    	   gridss[index_origin] = grid[index_origin];
    	   gridss[index_origin+1] = grid[index_origin+1];
    	}
    	for (long kswap=0; kswap<num_w_planes; kswap++)
    	for (long jswap=0; jswap<yaxis; jswap++)
    	for (long iswap=0; iswap<xaxis/2; iswap++)
    	{
               long index_origin = 2*(iswap + jswap*xaxis + kswap*yaxis*xaxis);
    	   long index_destination = 2*(iswap+xaxis/2 + jswap*xaxis + kswap*yaxis*xaxis);
    	   grid[index_destination] = gridss[index_origin];
    	   grid[index_destination+1] = gridss[index_origin+1];
    	}
    
    	for (long kswap=0; kswap<num_w_planes; kswap++)
    	for (long jswap=0; jswap<yaxis; jswap++)
    	for (long iswap=xaxis/2; iswap<xaxis; iswap++)
    	{
               long index_origin = 2*(iswap + jswap*xaxis + kswap*yaxis*xaxis);
    	   long index_destination = 2*(iswap-xaxis/2 + jswap*xaxis + kswap*yaxis*xaxis);
    	   grid[index_destination] = gridss[index_origin];
    	   grid[index_destination+1] = gridss[index_origin+1];
    	}
    	*/
    
            #ifndef USE_MPI
            fclose(pFile1);
            #endif
            #ifdef USE_MPI
    	MPI_Barrier(MPI_COMM_WORLD);
    	#endif
    
            end = clock();
            clock_gettime(CLOCK_MONOTONIC, &finish);
            process_time = ((double) (end - start)) / CLOCKS_PER_SEC;
            process_time1 = (finish.tv_sec - begin.tv_sec);
            process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
            clock_gettime(CLOCK_MONOTONIC, &begin);
             
    
    #ifdef WRITE_DATA
    	// Write results
    	if (rank == 0)
    	{
              printf("WRITING GRIDDED DATA\n");
              pFilereal = fopen (outfile2,"wb");
              pFileimg = fopen (outfile3,"wb");
    	  #ifdef USE_MPI
    	  for (int isector=0; isector<nsectors; isector++)
              {
    	      MPI_Win_lock(MPI_LOCK_SHARED,isector,0,slabwin);
    	      MPI_Get(gridss,size_of_grid,MPI_DOUBLE,isector,0,size_of_grid,MPI_DOUBLE,slabwin);
    	      MPI_Win_unlock(isector,slabwin);
    	      for (long i=0; i<size_of_grid/2; i++)
    	      {
    		      gridss_real[i] = gridss[2*i];
    		      gridss_img[i] = gridss[2*i+1];
    	      }
    	      if (num_w_planes > 1)
    	      {
                    for (int iw=0; iw<num_w_planes; iw++)
                    for (int iv=0; iv<yaxis; iv++)
                    for (int iu=0; iu<xaxis; iu++)
                    {
    			  long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*grid_size_x*grid_size_y)*sizeof(double);
                              long index = iu + iv*xaxis + iw*xaxis*yaxis;
    			  fseek(pFilereal, global_index, SEEK_SET);
    			  fwrite(&gridss_real[index], 1, sizeof(double), pFilereal);
                    }
                    for (int iw=0; iw<num_w_planes; iw++)
                    for (int iv=0; iv<yaxis; iv++)
                    for (int iu=0; iu<xaxis; iu++)
                    {
                              long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*grid_size_x*grid_size_y)*sizeof(double);
                              long index = iu + iv*xaxis + iw*xaxis*yaxis;
                              fseek(pFileimg, global_index, SEEK_SET);
                              fwrite(&gridss_img[index], 1, sizeof(double), pFileimg);
                              //double v_norm = sqrt(gridss[index]*gridss[index]+gridss[index+1]*gridss[index+1]);
                              //fprintf (pFile, "%d %d %d %f %f %f\n", iu,isector*yaxis+iv,iw,gridss[index],gridss[index+1],v_norm);
                    }
    
    	      } else {
    		for (int iw=0; iw<num_w_planes; iw++)
    		{
    			  long global_index = (xaxis*isector*yaxis + iw*grid_size_x*grid_size_y)*sizeof(double);
                              long index = iw*xaxis*yaxis;
    			  fseek(pFilereal, global_index, SEEK_SET);
                              fwrite(&gridss_real[index], xaxis*yaxis, sizeof(double), pFilereal);
                              fseek(pFileimg, global_index, SEEK_SET);
                              fwrite(&gridss_img[index], xaxis*yaxis, sizeof(double), pFileimg);
    		}
    	      }
              }
    	  #else
              for (int iw=0; iw<num_w_planes; iw++)
                for (int iv=0; iv<grid_size_y; iv++)
                   for (int iu=0; iu<grid_size_x; iu++)
                   {
                              long index = 2*(iu + iv*grid_size_x + iw*grid_size_x*grid_size_y);
    		          fwrite(&gridtot[index], 1, sizeof(double), pFilereal);
    		          fwrite(&gridtot[index+1], 1, sizeof(double), pFileimg);
                              //double v_norm = sqrt(gridtot[index]*gridtot[index]+gridtot[index+1]*gridtot[index+1]);
                              //fprintf (pFile, "%d %d %d %f %f %f\n", iu,iv,iw,gridtot[index],gridtot[index+1],v_norm);
                   }
              #endif
              fclose(pFilereal);
              fclose(pFileimg);
    	}
    
            #ifdef USE_MPI
            MPI_Win_fence(0,slabwin);
            #endif
    #endif //WRITE_DATA
    
    
    #ifdef USE_FFTW
    	// FFT transform the data (using distributed FFTW)
    	
    	if(rank == 0)printf("PERFORMING FFT\n");
            clock_gettime(CLOCK_MONOTONIC, &begin);
            start = clock();
            fftw_plan plan;
            fftw_complex *fftwgrid;
    	ptrdiff_t alloc_local, local_n0, local_0_start;
    	double norm = 1.0/(double)(grid_size_x*grid_size_y);
    
            // map the 1D array of complex visibilities to a 2D array required by FFTW (complex[*][2])
    	// x is the direction of contiguous data and maps to the second parameter
    	// y is the parallelized direction and corresponds to the first parameter (--> n0)
    	// and perform the FFT per w plane
    	alloc_local = fftw_mpi_local_size_2d(grid_size_y, grid_size_x, MPI_COMM_WORLD,&local_n0, &local_0_start);
    	fftwgrid = fftw_alloc_complex(alloc_local);
    	plan = fftw_mpi_plan_dft_2d(grid_size_y, grid_size_x, fftwgrid, fftwgrid, MPI_COMM_WORLD, FFTW_BACKWARD, FFTW_ESTIMATE); 
    
    	long fftwindex = 0;
    	long fftwindex2D = 0;
    	for (int iw=0; iw<num_w_planes; iw++)
            {
                //printf("FFTing plan %d\n",iw);	    
                // select the w-plane to transform
                for (int iv=0; iv<yaxis; iv++)
                {
                   for (int iu=0; iu<xaxis; iu++)
                   {
    		   fftwindex2D = iu + iv*xaxis;
    		   fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis);
                       fftwgrid[fftwindex2D][0] = grid[fftwindex];
                       fftwgrid[fftwindex2D][1] = grid[fftwindex+1];
    	       }
    	    }
    
                // do the transform for each w-plane
    	    fftw_execute(plan);
    
    	    // save the transformed w-plane
                for (int iv=0; iv<yaxis; iv++)
                {
                   for (int iu=0; iu<xaxis; iu++)
                   {
    		   fftwindex2D = iu + iv*xaxis;
    		   fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis);
                       gridss[fftwindex] = norm*fftwgrid[fftwindex2D][0];
                       gridss[fftwindex+1] = norm*fftwgrid[fftwindex2D][1];
                   }
                }
    
    	}
    
            fftw_destroy_plan(plan);
    
            #ifdef USE_MPI
            MPI_Win_fence(0,slabwin);
    	MPI_Barrier(MPI_COMM_WORLD);
            #endif
    
            end = clock();
            clock_gettime(CLOCK_MONOTONIC, &finish);
            fftw_time = ((double) (end - start)) / CLOCKS_PER_SEC;
            fftw_time1 = (finish.tv_sec - begin.tv_sec);
            fftw_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
            clock_gettime(CLOCK_MONOTONIC, &begin);
    
    #ifdef WRITE_DATA
            // Write results
            #ifdef USE_MPI
    	MPI_Win writewin;
            MPI_Win_create(gridss, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &writewin);
    	MPI_Win_fence(0,writewin);
            #endif
            if (rank == 0)
            {
              printf("WRITING FFT TRANSFORMED DATA\n");
              pFilereal = fopen (fftfile2,"wb");
              pFileimg = fopen (fftfile3,"wb");
              #ifdef USE_MPI
              for (int isector=0; isector<nsectors; isector++)
              {
                  MPI_Win_lock(MPI_LOCK_SHARED,isector,0,writewin);
                  MPI_Get(gridss_w,size_of_grid,MPI_DOUBLE,isector,0,size_of_grid,MPI_DOUBLE,writewin);
                  MPI_Win_unlock(isector,writewin);
    	      for (long i=0; i<size_of_grid/2; i++)
    	      {
    		      gridss_real[i] = gridss_w[2*i];
    		      gridss_img[i] = gridss_w[2*i+1];
    	      }
    	      if (num_w_planes > 1)
    	      {
                    for (int iw=0; iw<num_w_planes; iw++)
                    for (int iv=0; iv<yaxis; iv++)
                    for (int iu=0; iu<xaxis; iu++)
                    {
    			  long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*grid_size_x*grid_size_y)*sizeof(double);
                              long index = iu + iv*xaxis + iw*xaxis*yaxis;
    			  fseek(pFilereal, global_index, SEEK_SET);
    			  fwrite(&gridss_real[index], 1, sizeof(double), pFilereal);
                    }
                    for (int iw=0; iw<num_w_planes; iw++)
                    for (int iv=0; iv<yaxis; iv++)
                    for (int iu=0; iu<xaxis; iu++)
                    {         
                              long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*grid_size_x*grid_size_y)*sizeof(double);
                              long index = iu + iv*xaxis + iw*xaxis*yaxis;
                              fseek(pFileimg, global_index, SEEK_SET);
                              fwrite(&gridss_img[index], 1, sizeof(double), pFileimg);
                    }
                  } else {
                              fwrite(gridss_real, size_of_grid/2, sizeof(double), pFilereal);
                              fwrite(gridss_img, size_of_grid/2, sizeof(double), pFileimg);
                  }
    
    	      
              }
              #else
              /*
    	  for (int iw=0; iw<num_w_planes; iw++)
                for (int iv=0; iv<grid_size_y; iv++)
                   for (int iu=0; iu<grid_size_x; iu++)
                   {
                              int isector = 0;
                              long index = 2*(iu + iv*grid_size_x + iw*grid_size_x*grid_size_y);
                              double v_norm = sqrt(gridtot[index]*gridtot[index]+gridtot[index+1]*gridtot[index+1]);
                              fprintf (pFile, "%d %d %d %f %f %f\n", iu,iv,iw,gridtot[index],gridtot[index+1],v_norm);
                   }
    	  */
              #endif
              fclose(pFilereal);
              fclose(pFileimg);
            }
    
            #ifdef USE_MPI
            //MPI_Win_fence(0,writewin);
            MPI_Win_fence(0,writewin);
    	MPI_Win_free(&writewin);
    	MPI_Barrier(MPI_COMM_WORLD);
            #endif
    #endif //WRITE_DATA
     
    	fftw_free(fftwgrid);
    
    	// Phase correction
            clock_gettime(CLOCK_MONOTONIC, &begin);
            start = clock();
    	if(rank == 0)printf("PHASE CORRECTION\n");
            double* image_real = (double*) calloc(xaxis*yaxis,sizeof(double));	
            double* image_imag = (double*) calloc(xaxis*yaxis,sizeof(double));	
            phase_correction(gridss,image_real,image_imag,xaxis,yaxis,num_w_planes,grid_size_x,grid_size_y);
    
            end = clock();
            clock_gettime(CLOCK_MONOTONIC, &finish);
            phase_time = ((double) (end - start)) / CLOCKS_PER_SEC;
            phase_time1 = (finish.tv_sec - begin.tv_sec);
            phase_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
    #ifdef WRITE_IMAGE
    
            if(rank == 0)
            {
                pFilereal = fopen (fftfile2,"wb");
                pFileimg = fopen (fftfile3,"wb");
                fclose(pFilereal);
                fclose(pFileimg);
            }
    	#ifdef USE_MPI
    	MPI_Barrier(MPI_COMM_WORLD);
            #endif
            if(rank == 0)printf("WRITING IMAGE\n");
    	for (int isector=0; isector<size; isector++)
    	{
    	    #ifdef USE_MPI
    	    MPI_Barrier(MPI_COMM_WORLD);
                #endif
    	    if(isector == rank)
                {
    	       printf("%d writing\n",isector);
                   pFilereal = fopen (fftfile2,"ab");
                   pFileimg = fopen (fftfile3,"ab");
    
    	       long global_index = isector*(xaxis*yaxis)*sizeof(double);
    
                   fseek(pFilereal, global_index, SEEK_SET);
                   fwrite(image_real, xaxis*yaxis, sizeof(double), pFilereal);
                   fseek(pFileimg, global_index, SEEK_SET);
                   fwrite(image_imag, xaxis*yaxis, sizeof(double), pFileimg);
    
                   fclose(pFilereal);
                   fclose(pFileimg);
    	    }
    	}
    	#ifdef USE_MPI
    	MPI_Barrier(MPI_COMM_WORLD);
            #endif
    
    #endif //WRITE_IMAGE
    
    
    #endif //USE_FFTW
    
    	end = clock();
    	clock_gettime(CLOCK_MONOTONIC, &finish);
    	tot_time = ((double) (end - start0)) / CLOCKS_PER_SEC;
    	tot_time1 = (finish.tv_sec - begin0.tv_sec);
    	tot_time1 += (finish.tv_nsec - begin0.tv_nsec) / 1000000000.0;
    
            if (rank == 0)
            {
              printf("Setup time:    %f sec\n",setup_time);
              printf("Process time:  %f sec\n",process_time);
              printf("Kernel time = %f, Array Composition time %f, Reduce time: %f sec\n",kernel_time,compose_time,reduce_time);
    #ifdef USE_FFTW
              printf("FFTW time:     %f sec\n",fftw_time);
              printf("Phase time:    %f sec\n",phase_time);
    #endif
              printf("TOT time:      %f sec\n",tot_time);
    	  if(num_threads > 1)
              {
                printf("PSetup time:   %f sec\n",setup_time1);
                printf("PProcess time: %f sec\n",process_time1);
                printf("PKernel time = %f, PArray Composition time %f, PReduce time: %f sec\n",kernel_time1,compose_time1,reduce_time1);
    #ifdef USE_FFTW
                printf("PFFTW time:    %f sec\n",fftw_time1);
                printf("PPhase time:   %f sec\n",phase_time1);
    #endif
                printf("PTOT time:     %f sec\n",tot_time1);
    	  }
            }
    
    
    	// Close MPI environment
    	#ifdef USE_MPI
            MPI_Win_fence(0,slabwin);
    	MPI_Win_free(&slabwin);
    	MPI_Finalize();
    	#endif
    }