diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..f109a98c7a1a1cbe5d406752091814495af52068 --- /dev/null +++ b/Makefile @@ -0,0 +1,47 @@ +OPT += -DUSE_MPI +OPT += -DUSE_FFTW +OPT += -DONE_SIDE +OPT += -DWRITE_DATA + +CC = gcc +CXX = g++ +MPICC = mpicc +MPICXX =mpiCC + +CFLAGS += -O3 -mcpu=native +CFLAGS += -I. +LIBS = -L$(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm + +NVCC = nvcc +NVFLAGS = -arch=sm_70 -c w-stacking.cu -Xcompiler -mno-float128 -std=c++11 +NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda + +DEPS = w-stacking.h +COBJ = w-stacking.o w-stacking-fftw.o + +w-stacking.c: + cp w-stacking.cu w-stacking.c + +%.o: %.c $(DEPS) + $(CC) -c -o $@ $< $(CFLAGS) $(OPT) + +serial: $(COBJ) + $(CC) -o w-stackingCfftw_serial $(CFLAGS) $^ -lm + +serial_cuda: + $(NVCC) $(NVFLAGS) -c w-stacking.cu $(NVLIB) + $(CC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c + $(CXX) $(CFLAGS) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o $(NVLIB) -lm + +mpi: $(COBJ) + $(MPICC) -o w-stackingCfftw $(CFLAGS) $^ $(LIBS) + +mpi_cuda: + $(NVCC) $(NVFLAGS) -c w-stacking.cu $(NVLIB) + $(MPICC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c + $(MPICXX) $(CFLAGS) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o $(NVLIB) $(LIBS) -lm + +clean: + rm *.o + rm w-stacking.c + diff --git a/peano.c b/peano.c new file mode 100644 index 0000000000000000000000000000000000000000..dc831e82253fa928570b17f736e419c36b6bbad3 --- /dev/null +++ b/peano.c @@ -0,0 +1,130 @@ +#include "peano.h" + +//typedef unsigned long long peanokey; + +/* The following rewrite of the original function + * peano_hilbert_key_old() has been written by MARTIN REINECKE. + * It is about a factor 2.3 - 2.5 faster than Volker's old routine! + */ +const unsigned char rottable3[48][8] = { + {36, 28, 25, 27, 10, 10, 25, 27}, + {29, 11, 24, 24, 37, 11, 26, 26}, + {8, 8, 25, 27, 30, 38, 25, 27}, + {9, 39, 24, 24, 9, 31, 26, 26}, + {40, 24, 44, 32, 40, 6, 44, 6}, + {25, 7, 33, 7, 41, 41, 45, 45}, + {4, 42, 4, 46, 26, 42, 34, 46}, + {43, 43, 47, 47, 5, 27, 5, 35}, + {33, 35, 36, 28, 33, 35, 2, 2}, + {32, 32, 29, 3, 34, 34, 37, 3}, + {33, 35, 0, 0, 33, 35, 30, 38}, + {32, 32, 1, 39, 34, 34, 1, 31}, + {24, 42, 32, 46, 14, 42, 14, 46}, + {43, 43, 47, 47, 25, 15, 33, 15}, + {40, 12, 44, 12, 40, 26, 44, 34}, + {13, 27, 13, 35, 41, 41, 45, 45}, + {28, 41, 28, 22, 38, 43, 38, 22}, + {42, 40, 23, 23, 29, 39, 29, 39}, + {41, 36, 20, 36, 43, 30, 20, 30}, + {37, 31, 37, 31, 42, 40, 21, 21}, + {28, 18, 28, 45, 38, 18, 38, 47}, + {19, 19, 46, 44, 29, 39, 29, 39}, + {16, 36, 45, 36, 16, 30, 47, 30}, + {37, 31, 37, 31, 17, 17, 46, 44}, + {12, 4, 1, 3, 34, 34, 1, 3}, + {5, 35, 0, 0, 13, 35, 2, 2}, + {32, 32, 1, 3, 6, 14, 1, 3}, + {33, 15, 0, 0, 33, 7, 2, 2}, + {16, 0, 20, 8, 16, 30, 20, 30}, + {1, 31, 9, 31, 17, 17, 21, 21}, + {28, 18, 28, 22, 2, 18, 10, 22}, + {19, 19, 23, 23, 29, 3, 29, 11}, + {9, 11, 12, 4, 9, 11, 26, 26}, + {8, 8, 5, 27, 10, 10, 13, 27}, + {9, 11, 24, 24, 9, 11, 6, 14}, + {8, 8, 25, 15, 10, 10, 25, 7}, + {0, 18, 8, 22, 38, 18, 38, 22}, + {19, 19, 23, 23, 1, 39, 9, 39}, + {16, 36, 20, 36, 16, 2, 20, 10}, + {37, 3, 37, 11, 17, 17, 21, 21}, + {4, 17, 4, 46, 14, 19, 14, 46}, + {18, 16, 47, 47, 5, 15, 5, 15}, + {17, 12, 44, 12, 19, 6, 44, 6}, + {13, 7, 13, 7, 18, 16, 45, 45}, + {4, 42, 4, 21, 14, 42, 14, 23}, + {43, 43, 22, 20, 5, 15, 5, 15}, + {40, 12, 21, 12, 40, 6, 23, 6}, + {13, 7, 13, 7, 41, 41, 22, 20} +}; + +const unsigned char subpix3[48][8] = { + {0, 7, 1, 6, 3, 4, 2, 5}, + {7, 4, 6, 5, 0, 
3, 1, 2}, + {4, 3, 5, 2, 7, 0, 6, 1}, + {3, 0, 2, 1, 4, 7, 5, 6}, + {1, 0, 6, 7, 2, 3, 5, 4}, + {0, 3, 7, 4, 1, 2, 6, 5}, + {3, 2, 4, 5, 0, 1, 7, 6}, + {2, 1, 5, 6, 3, 0, 4, 7}, + {6, 1, 7, 0, 5, 2, 4, 3}, + {1, 2, 0, 3, 6, 5, 7, 4}, + {2, 5, 3, 4, 1, 6, 0, 7}, + {5, 6, 4, 7, 2, 1, 3, 0}, + {7, 6, 0, 1, 4, 5, 3, 2}, + {6, 5, 1, 2, 7, 4, 0, 3}, + {5, 4, 2, 3, 6, 7, 1, 0}, + {4, 7, 3, 0, 5, 6, 2, 1}, + {6, 7, 5, 4, 1, 0, 2, 3}, + {7, 0, 4, 3, 6, 1, 5, 2}, + {0, 1, 3, 2, 7, 6, 4, 5}, + {1, 6, 2, 5, 0, 7, 3, 4}, + {2, 3, 1, 0, 5, 4, 6, 7}, + {3, 4, 0, 7, 2, 5, 1, 6}, + {4, 5, 7, 6, 3, 2, 0, 1}, + {5, 2, 6, 1, 4, 3, 7, 0}, + {7, 0, 6, 1, 4, 3, 5, 2}, + {0, 3, 1, 2, 7, 4, 6, 5}, + {3, 4, 2, 5, 0, 7, 1, 6}, + {4, 7, 5, 6, 3, 0, 2, 1}, + {6, 7, 1, 0, 5, 4, 2, 3}, + {7, 4, 0, 3, 6, 5, 1, 2}, + {4, 5, 3, 2, 7, 6, 0, 1}, + {5, 6, 2, 1, 4, 7, 3, 0}, + {1, 6, 0, 7, 2, 5, 3, 4}, + {6, 5, 7, 4, 1, 2, 0, 3}, + {5, 2, 4, 3, 6, 1, 7, 0}, + {2, 1, 3, 0, 5, 6, 4, 7}, + {0, 1, 7, 6, 3, 2, 4, 5}, + {1, 2, 6, 5, 0, 3, 7, 4}, + {2, 3, 5, 4, 1, 0, 6, 7}, + {3, 0, 4, 7, 2, 1, 5, 6}, + {1, 0, 2, 3, 6, 7, 5, 4}, + {0, 7, 3, 4, 1, 6, 2, 5}, + {7, 6, 4, 5, 0, 1, 3, 2}, + {6, 1, 5, 2, 7, 0, 4, 3}, + {5, 4, 6, 7, 2, 3, 1, 0}, + {4, 3, 7, 0, 5, 2, 6, 1}, + {3, 2, 0, 1, 4, 5, 7, 6}, + {2, 5, 1, 6, 3, 4, 0, 7} +}; + +/*! This function computes a Peano-Hilbert key for an integer triplet (x,y,z), + * with x,y,z in the range between 0 and 2^bits-1. + */ +peanokey peano_hilbert_key(int x, int y, int z, int bits) +{ + int mask; + unsigned char rotation = 0; + peanokey key = 0; + + for(mask = 1 << (bits - 1); mask > 0; mask >>= 1) + { + unsigned char pix = ((x & mask) ? 4 : 0) | ((y & mask) ? 2 : 0) | ((z & mask) ? 1 : 0); + + key <<= 3; + key |= subpix3[rotation][pix]; + rotation = rottable3[rotation][pix]; + } + + return key; +} diff --git a/peano.h b/peano.h new file mode 100644 index 0000000000000000000000000000000000000000..465ac263aea790dbc9d13619deba287aad6f9dc4 --- /dev/null +++ b/peano.h @@ -0,0 +1,3 @@ +typedef unsigned long long peanokey; + +peanokey peano_hilbert_key(int, int, int, int); diff --git a/w-stacking-fftw.c b/w-stacking-fftw.c new file mode 100644 index 0000000000000000000000000000000000000000..4088543ba021d22d28c5540bd99b5f0b2a9de958 --- /dev/null +++ b/w-stacking-fftw.c @@ -0,0 +1,860 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#ifdef USE_MPI +#include <mpi.h> +#ifdef USE_FFTW +#include <fftw3-mpi.h> +#endif +#endif +#include <math.h> +#include <time.h> +#include "w-stacking.h" +#define PI 3.14159265359 +#define NUM_OF_SECTORS -1 +#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y)) +#define MAX(X, Y) (((X) > (Y)) ? 
(X) : (Y)) +#define NOVERBOSE +#define NFILES 10 + +// Linked List set-up +struct sectorlist { + long index; + struct sectorlist * next; +}; + +void Push(struct sectorlist** headRef, long data) { + struct sectorlist* newNode = malloc(sizeof(struct sectorlist)); + newNode->index = data; + newNode->next = *headRef; + *headRef = newNode; +} + +// Main Code +int main(int argc, char * argv[]) +{ + int rank; + int size; + + FILE * pFile; + FILE * pFile1; + FILE * pFilereal; + FILE * pFileimg; + char filename[1000]; + //char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/old/data/gauss2_t201806301100_SBL180.binMS/"; + //char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/newgauss2noconj_t201806301100_SBL180.binMS/"; + //char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/gauss1_t201806301100_SBL180.binMS/"; + //char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/newgauss4_t201806301100_SBL180.binMS/"; + //char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/gauss4_t201806301100_SBL180.binMS/"; + //char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/hba-8hrs_t201806301100_SBH255i-test.binMS/"; + // + //char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/hba-8hrs_gauss4new.binMS/"; + //char datapath[900] = "/m100_scratch/userexternal/cgheller/Lofar/Observations/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/"; + char datapath[900]; + char datapath_multi[NFILES][900]; + + char ufile[30] = "ucoord.bin"; + char vfile[30] = "vcoord.bin"; + char wfile[30] = "wcoord.bin"; + char weightsfile[30] = "weights.bin"; + char visrealfile[30] = "visibilities_real.bin"; + char visimgfile[30] = "visibilities_img.bin"; + char metafile[30] = "meta.txt"; + char outfile[30] = "grid.txt"; + char outfile1[30] = "coords.txt"; + char outfile2[30] = "grid_real.bin"; + char outfile3[30] = "grid_img.bin"; + char fftfile[30] = "fft.txt"; + char fftfile2[30] = "fft_real.bin"; + char fftfile3[30] = "fft_img.bin"; + char logfile[30] = "run.log"; + char extension[30] = ".txt"; + char srank[4]; + + double * uu; + double * vv; + double * ww; + float * weights; + float * visreal; + float * visimg; + + long Nmeasures; + long Nvis; + long Nweights; + long freq_per_chan; + long polarisations; + long Ntimes; + double dt; + double thours; + long baselines; + double uvmin; + double uvmax; + double wmin; + double wmax; + + + int grid_size_x = 2048; + int grid_size_y = 2048; + int local_grid_size_x;// = 100; + int local_grid_size_y;// = 100; + int xaxis; + int yaxis; + int num_w_planes = 1; + // DAV: the corresponding KernelLen is calculated within the wstack function. 
It can be anyway hardcoded for optimization + int w_support = 7; + int num_threads;// = 4; + double dx = 1.0/(double)grid_size_x; + double dw = 1.0/(double)num_w_planes; + double w_supporth = (double)((w_support-1)/2)*dx; + + clock_t start, end, start0, startk, endk; + double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time; + double setup_time1, process_time1, mpi_time1, fftw_time1, tot_time1, kernel_time1, reduce_time1, compose_time1; + double writetime, writetime1; + + struct timespec begin, finish, begin0, begink, finishk; + double elapsed; + long nsectors; + + clock_gettime(CLOCK_MONOTONIC, &begin0); + start0 = clock(); + // Set the number of OpenMP threads + num_threads = atoi(argv[1]); + + // Intialize MPI environment +#ifdef USE_MPI + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + if(rank == 0)printf("Running with %d MPI tasks\n",size); + #ifdef USE_FFTW + fftw_mpi_init(); + #endif +#else + rank = 0; + size = 1; +#endif + if(rank == 0)printf("Running with %d threads\n",num_threads); + + // set the local size of the image + local_grid_size_x = grid_size_x; + nsectors = NUM_OF_SECTORS; + if (nsectors < 0) nsectors = size; + local_grid_size_y = grid_size_y/nsectors; + //nsectors = size; + + // LOCAL grid size + xaxis = local_grid_size_x; + yaxis = local_grid_size_y; + + clock_gettime(CLOCK_MONOTONIC, &begin); + start = clock(); + + // CLAAAAAA + int ndatasets = 2; + strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/"); + strcpy(datapath_multi[1],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_134MHz.pre-cal.binMS/"); + + strcpy(datapath,datapath_multi[0]); + // Read metadata + strcpy(filename,datapath); + strcat(filename,metafile); + pFile = fopen (filename,"r"); + fscanf(pFile,"%ld",&Nmeasures); + fscanf(pFile,"%ld",&Nvis); + fscanf(pFile,"%ld",&freq_per_chan); + fscanf(pFile,"%ld",&polarisations); + fscanf(pFile,"%ld",&Ntimes); + fscanf(pFile,"%lf",&dt); + fscanf(pFile,"%lf",&thours); + fscanf(pFile,"%ld",&baselines); + fscanf(pFile,"%lf",&uvmin); + fscanf(pFile,"%lf",&uvmax); + fscanf(pFile,"%lf",&wmin); + fscanf(pFile,"%lf",&wmax); + fclose(pFile); + + // WATCH THIS!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + int nsub = 1000; + //int nsub = 10; + printf("Subtracting last %d measurements\n",nsub); + Nmeasures = Nmeasures-nsub; + Nvis = Nmeasures*freq_per_chan*polarisations; + + // calculate the coordinates of the center + double uvshift = uvmin/(uvmax-uvmin); + //printf("UVSHIFT %f %f %f %f %f\n",uvmin, uvmax, wmin, wmax, uvshift); + + if (rank == 0) + { + printf("N. measurements %ld\n",Nmeasures); + printf("N. visibilities %ld\n",Nvis); + } + + // Set temporary local size of points + long nm_pe = (long)(Nmeasures/size); + long remaining = Nmeasures%size; + + long startrow = rank*nm_pe; + if (rank == size-1)nm_pe = nm_pe+remaining; + + long Nmeasures_tot = Nmeasures; + Nmeasures = nm_pe; + long Nvis_tot = Nvis; + Nvis = Nmeasures*freq_per_chan*polarisations; + Nweights = Nmeasures*polarisations; + + #ifdef VERBOSE + printf("N. measurements on %d %ld\n",rank,Nmeasures); + printf("N. visibilities on %d %ld\n",rank,Nvis); + #endif + + // DAV: all these arrays can be allocatate statically for the sake of optimization. 
However, be careful that if MPI is used
+  // all the sizes are rescaled by the number of MPI tasks
+  // Allocate arrays
+  uu = (double*) calloc(Nmeasures,sizeof(double));
+  vv = (double*) calloc(Nmeasures,sizeof(double));
+  ww = (double*) calloc(Nmeasures,sizeof(double));
+  weights = (float*) calloc(Nweights,sizeof(float));
+  visreal = (float*) calloc(Nvis,sizeof(float));
+  visimg = (float*) calloc(Nvis,sizeof(float));
+
+  if(rank == 0)printf("READING DATA\n");
+  // Read data
+  strcpy(filename,datapath);
+  strcat(filename,ufile);
+  //printf("Reading %s\n",filename);
+
+  pFile = fopen (filename,"rb");
+  fseek (pFile,startrow*sizeof(double),SEEK_SET);
+  fread(uu,Nmeasures*sizeof(double),1,pFile);
+  fclose(pFile);
+
+  strcpy(filename,datapath);
+  strcat(filename,vfile);
+  //printf("Reading %s\n",filename);
+
+  pFile = fopen (filename,"rb");
+  fseek (pFile,startrow*sizeof(double),SEEK_SET);
+  fread(vv,Nmeasures*sizeof(double),1,pFile);
+  fclose(pFile);
+
+  strcpy(filename,datapath);
+  strcat(filename,wfile);
+  //printf("Reading %s\n",filename);
+
+  pFile = fopen (filename,"rb");
+  fseek (pFile,startrow*sizeof(double),SEEK_SET);
+  fread(ww,Nmeasures*sizeof(double),1,pFile);
+  fclose(pFile);
+
+  #ifdef USE_MPI
+  MPI_Barrier(MPI_COMM_WORLD);
+  #endif
+
+  clock_gettime(CLOCK_MONOTONIC, &finish);
+  end = clock();
+  setup_time = ((double) (end - start)) / CLOCKS_PER_SEC;
+  setup_time1 = (finish.tv_sec - begin.tv_sec);
+  setup_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
+
+  if(rank == 0)printf("GRIDDING DATA\n");
+
+  // Create histograms and linked lists
+  clock_gettime(CLOCK_MONOTONIC, &begin);
+  start = clock();
+
+  //CLAAA
+  // Initialize linked lists: one dummy-headed list per sector, plus one for spill-over
+  struct sectorlist ** sectorhead;
+  sectorhead = (struct sectorlist **) malloc((nsectors+1) * sizeof(struct sectorlist *));
+  for (int isec=0; isec<=nsectors; isec++)
+  {
+     sectorhead[isec] = malloc(sizeof(struct sectorlist));
+     sectorhead[isec]->index = -1;
+     sectorhead[isec]->next = NULL;
+  }
+
+  long * histo_send = (long*) calloc(nsectors+1,sizeof(long));
+  int * boundary = (int*) calloc(Nmeasures,sizeof(int));
+  double uuh,vvh;
+  for (long iphi = 0; iphi < Nmeasures; iphi++)
+  {
+     boundary[iphi] = -1;
+     uuh = uu[iphi];
+     vvh = vv[iphi];
+     int binphi = (int)(vvh*nsectors);
+     // check if the point also influences neighboring slabs
+     double updist = (double)((binphi+1)*yaxis)*dx - vvh;
+     double downdist = vvh - (double)(binphi*yaxis)*dx;
+     //
+     histo_send[binphi]++;
+     Push(&sectorhead[binphi],iphi);
+     if(updist < w_supporth && updist >= 0.0) {histo_send[binphi+1]++; boundary[iphi] = binphi+1; Push(&sectorhead[binphi+1],iphi);};
+     if(downdist < w_supporth && binphi > 0 && downdist >= 0.0) {histo_send[binphi-1]++; boundary[iphi] = binphi-1; Push(&sectorhead[binphi-1],iphi);};
+  }
+
+#ifdef PIPPO
+  struct sectorlist * current;
+  long iiii = 0;
+  for (int j=0; j<nsectors; j++)
+  {
+     current = sectorhead[j];
+     iiii = 0;
+     while (current->index != -1)
+     {
+        printf("%d %d %ld %ld %ld\n",rank,j,iiii,histo_send[j],current->index);
+        current = current->next;
+        iiii++;
+     }
+  }
+#endif
+
+  #ifdef VERBOSE
+  for (int iii=0; iii<nsectors+1; iii++)printf("HISTO %d %d %ld\n",rank, iii, histo_send[iii]);
+  #endif
+
+  // Create sector grid
+  double * gridss;
+  double * gridss_w;
+  double * gridss_real;
+  double * gridss_img;
+  double * grid;
+  long size_of_grid;
+  size_of_grid = 2*num_w_planes*xaxis*yaxis;
+  gridss = (double*) calloc(size_of_grid,sizeof(double));
+  gridss_w = (double*) calloc(size_of_grid,sizeof(double));
+  gridss_real = (double*) 
calloc(size_of_grid/2,sizeof(double)); + gridss_img = (double*) calloc(size_of_grid/2,sizeof(double)); + // Create destination slab + grid = (double*) calloc(size_of_grid,sizeof(double)); + // Create temporary global grid + #ifndef USE_MPI + double * gridtot = (double*) calloc(2*grid_size_x*grid_size_y*num_w_planes,sizeof(double)); + #endif + + double shift = (double)(dx*yaxis); + + // Open the MPI Memory Window for the slab + #ifdef USE_MPI + MPI_Win slabwin; + MPI_Win_create(grid, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin); + MPI_Win_fence(0,slabwin); + #endif + + #ifndef USE_MPI + pFile1 = fopen (outfile1,"w"); + #endif + + // loop over files + // + kernel_time = 0.0; + kernel_time1 = 0.0; + reduce_time = 0.0; + reduce_time1 = 0.0; + compose_time = 0.0; + compose_time1 = 0.0; + for (int ifiles=0; ifiles<ndatasets; ifiles++) + { + + strcpy(filename,datapath_multi[ifiles]); + printf("Processing %s, %d of %d\n",filename,ifiles+1,ndatasets); + strcat(filename,weightsfile); + + pFile = fopen (filename,"rb"); + fseek (pFile,startrow*polarisations*sizeof(float),SEEK_SET); + fread(weights,(Nweights)*sizeof(float),1,pFile); + fclose(pFile); + + strcpy(filename,datapath); + strcat(filename,visrealfile); + //printf("Reading %s\n",filename); + + pFile = fopen (filename,"rb"); + fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET); + fread(visreal,Nvis*sizeof(float),1,pFile); + fclose(pFile); + + strcpy(filename,datapath); + strcat(filename,visimgfile); + //printf("Reading %s\n",filename); + + pFile = fopen (filename,"rb"); + fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET); + fread(visimg,Nvis*sizeof(float),1,pFile); + fclose(pFile); + + #ifdef USE_MPI + MPI_Barrier(MPI_COMM_WORLD); + #endif + + // Declare temporary arrays for the masking + double * uus; + double * vvs; + double * wws; + float * visreals; + float * visimgs; + float * weightss; + + long isector; + for (long isector_count=0; isector_count<nsectors; isector_count++) + { + clock_gettime(CLOCK_MONOTONIC, &begink); + startk = clock(); + // define local destination sector + //isector = (isector_count+rank)%size; + isector = isector_count; + // allocate sector arrays + long Nsec = histo_send[isector]; + uus = (double*) malloc(Nsec*sizeof(double)); + vvs = (double*) malloc(Nsec*sizeof(double)); + wws = (double*) malloc(Nsec*sizeof(double)); + long Nweightss = Nsec*polarisations; + long Nvissec = Nweightss*freq_per_chan; + weightss = (float*) malloc(Nweightss*sizeof(float)); + visreals = (float*) malloc(Nvissec*sizeof(float)); + visimgs = (float*) malloc(Nvissec*sizeof(float)); + + // select data for this sector + long icount = 0; + long ip = 0; + long inu = 0; +//CLAAAA + struct sectorlist * current; + current = sectorhead[isector]; + while (current->index != -1) + { + long ilocal = current->index; + //double vvh = vv[ilocal]; + //int binphi = (int)(vvh*nsectors); + //if (binphi == isector || boundary[ilocal] == isector) + //{ + uus[icount] = uu[ilocal]; + vvs[icount] = vv[ilocal]-isector*shift; + wws[icount] = ww[ilocal]; + for (long ipol=0; ipol<polarisations; ipol++) + { + weightss[ip] = weights[ilocal*polarisations+ipol]; + ip++; + } + for (long ifreq=0; ifreq<polarisations*freq_per_chan; ifreq++) + { + visreals[inu] = visreal[ilocal*polarisations*freq_per_chan+ifreq]; + visimgs[inu] = visimg[ilocal*polarisations*freq_per_chan+ifreq]; + //if(visimgs[inu]>1e10 || visimgs[inu]<-1e10)printf("%f %f %ld %ld %d %ld 
%ld\n",visreals[inu],visimgs[inu],inu,Nvissec,rank,ilocal*polarisations*freq_per_chan+ifreq,Nvis);
+              inu++;
+           }
+           icount++;
+        //}
+           current = current->next;
+        }
+
+       clock_gettime(CLOCK_MONOTONIC, &finishk);
+       endk = clock();
+       compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
+       compose_time1 += (finishk.tv_sec - begink.tv_sec);
+       compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
+       #ifndef USE_MPI
+       double uumin = 1e20;
+       double vvmin = 1e20;
+       double uumax = -1e20;
+       double vvmax = -1e20;
+
+       for (long ipart=0; ipart<Nsec; ipart++)
+       {
+          uumin = MIN(uumin,uus[ipart]);
+          uumax = MAX(uumax,uus[ipart]);
+          vvmin = MIN(vvmin,vvs[ipart]);
+          vvmax = MAX(vvmax,vvs[ipart]);
+
+          // sample the sector coordinates into coords.txt (pFile1, opened above)
+          if(ipart%10 == 0)fprintf (pFile1, "%ld %f %f %f\n",isector,uus[ipart],vvs[ipart]+isector*shift,wws[ipart]);
+       }
+
+       printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax);
+       #endif
+
+       // Convolve this sector onto the grid
+       #ifdef VERBOSE
+       printf("Processing sector %ld\n",isector);
+       #endif
+       clock_gettime(CLOCK_MONOTONIC, &begink);
+       startk = clock();
+       wstack(num_w_planes,
+              Nsec,
+              freq_per_chan,
+              polarisations,
+              uus,
+              vvs,
+              wws,
+              visreals,
+              visimgs,
+              weightss,
+              dx,
+              dw,
+              w_support,
+              xaxis,
+              yaxis,
+              gridss,
+              num_threads);
+       clock_gettime(CLOCK_MONOTONIC, &finishk);
+       endk = clock();
+       kernel_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
+       kernel_time1 += (finishk.tv_sec - begink.tv_sec);
+       kernel_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
+       #ifdef VERBOSE
+       printf("Processed sector %ld\n",isector);
+       #endif
+       clock_gettime(CLOCK_MONOTONIC, &begink);
+       startk = clock();
+
+       //for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++)printf("--> %f\n",gridss[iii]);
+
+       #ifndef USE_MPI
+       long stride = isector*2*xaxis*yaxis*num_w_planes;
+       for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++)gridtot[stride+iii] = gridss[iii];
+       #endif
+
+       // Write the grid into the corresponding remote slab
+       #ifdef USE_MPI
+       int target_rank = (int)isector;
+       //int target_rank = (int)(size-isector-1);
+       #ifdef ONE_SIDE
+       printf("One Side communication active\n");
+       MPI_Win_lock(MPI_LOCK_SHARED,target_rank,0,slabwin);
+       MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin);
+       MPI_Win_unlock(target_rank,slabwin);
+       //MPI_Put(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,slabwin);
+       #else
+       MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD);
+       #endif //ONE_SIDE
+       #endif //USE_MPI
+
+       clock_gettime(CLOCK_MONOTONIC, &finishk);
+       endk = clock();
+       reduce_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
+       reduce_time1 += (finishk.tv_sec - begink.tv_sec);
+       reduce_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
+       // Reset the sector grid before moving to the next sector
+       for (long inull=0; inull<2*num_w_planes*xaxis*yaxis; inull++)gridss[inull] = 0.0;
+
+       // Deallocate all sector arrays
+       free(uus);
+       free(vvs);
+       free(wws);
+       free(weightss);
+       free(visreals);
+       free(visimgs);
+       // End of loop over sectors
+    }
+    // End of loop over input files
+  }
+
+  // Finalize MPI communication
+  #ifdef USE_MPI
+  MPI_Win_fence(0,slabwin);
+  #endif
+
+  // Swap left and right parts APPARENTLY NOT NECESSARY
+  /*
+  for (long kswap=0; kswap<num_w_planes; kswap++)
+    for (long jswap=0; jswap<yaxis; jswap++)
+      for (long iswap=0; iswap<xaxis; iswap++)
+      {
+         long index_origin = 2*(iswap + jswap*xaxis + kswap*yaxis*xaxis);
+         gridss[index_origin] = grid[index_origin];
+         gridss[index_origin+1] = grid[index_origin+1];
+      }
+ 
for (long kswap=0; kswap<num_w_planes; kswap++) + for (long jswap=0; jswap<yaxis; jswap++) + for (long iswap=0; iswap<xaxis/2; iswap++) + { + long index_origin = 2*(iswap + jswap*xaxis + kswap*yaxis*xaxis); + long index_destination = 2*(iswap+xaxis/2 + jswap*xaxis + kswap*yaxis*xaxis); + grid[index_destination] = gridss[index_origin]; + grid[index_destination+1] = gridss[index_origin+1]; + } + + for (long kswap=0; kswap<num_w_planes; kswap++) + for (long jswap=0; jswap<yaxis; jswap++) + for (long iswap=xaxis/2; iswap<xaxis; iswap++) + { + long index_origin = 2*(iswap + jswap*xaxis + kswap*yaxis*xaxis); + long index_destination = 2*(iswap-xaxis/2 + jswap*xaxis + kswap*yaxis*xaxis); + grid[index_destination] = gridss[index_origin]; + grid[index_destination+1] = gridss[index_origin+1]; + } + */ + + #ifndef USE_MPI + fclose(pFile1); + #endif + #ifdef USE_MPI + MPI_Barrier(MPI_COMM_WORLD); + #endif + + end = clock(); + clock_gettime(CLOCK_MONOTONIC, &finish); + process_time = ((double) (end - start)) / CLOCKS_PER_SEC; + process_time1 = (finish.tv_sec - begin.tv_sec); + process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; + clock_gettime(CLOCK_MONOTONIC, &begin); + + +#ifdef WRITE_DATA + // Write results + if (rank == 0) + { + printf("WRITING GRIDDED DATA\n"); + pFilereal = fopen (outfile2,"wb"); + pFileimg = fopen (outfile3,"wb"); + #ifdef USE_MPI + for (int isector=0; isector<nsectors; isector++) + { + MPI_Win_lock(MPI_LOCK_SHARED,isector,0,slabwin); + MPI_Get(gridss,size_of_grid,MPI_DOUBLE,isector,0,size_of_grid,MPI_DOUBLE,slabwin); + MPI_Win_unlock(isector,slabwin); + for (long i=0; i<size_of_grid/2; i++) + { + gridss_real[i] = gridss[2*i]; + gridss_img[i] = gridss[2*i+1]; + } + if (num_w_planes > 1) + { + for (int iw=0; iw<num_w_planes; iw++) + for (int iv=0; iv<yaxis; iv++) + for (int iu=0; iu<xaxis; iu++) + { + long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*grid_size_x*grid_size_y)*sizeof(double); + long index = iu + iv*xaxis + iw*xaxis*yaxis; + fseek(pFilereal, global_index, SEEK_SET); + fwrite(&gridss_real[index], 1, sizeof(double), pFilereal); + } + for (int iw=0; iw<num_w_planes; iw++) + for (int iv=0; iv<yaxis; iv++) + for (int iu=0; iu<xaxis; iu++) + { + long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*grid_size_x*grid_size_y)*sizeof(double); + long index = iu + iv*xaxis + iw*xaxis*yaxis; + fseek(pFileimg, global_index, SEEK_SET); + fwrite(&gridss_img[index], 1, sizeof(double), pFileimg); + //double v_norm = sqrt(gridss[index]*gridss[index]+gridss[index+1]*gridss[index+1]); + //fprintf (pFile, "%d %d %d %f %f %f\n", iu,isector*yaxis+iv,iw,gridss[index],gridss[index+1],v_norm); + } + + } else { + for (int iw=0; iw<num_w_planes; iw++) + { + long global_index = (xaxis*isector*yaxis + iw*grid_size_x*grid_size_y)*sizeof(double); + long index = iw*xaxis*yaxis; + fseek(pFilereal, global_index, SEEK_SET); + fwrite(&gridss_real[index], xaxis*yaxis, sizeof(double), pFilereal); + fseek(pFileimg, global_index, SEEK_SET); + fwrite(&gridss_img[index], xaxis*yaxis, sizeof(double), pFileimg); + } + } + } + #else + for (int iw=0; iw<num_w_planes; iw++) + for (int iv=0; iv<grid_size_y; iv++) + for (int iu=0; iu<grid_size_x; iu++) + { + long index = 2*(iu + iv*grid_size_x + iw*grid_size_x*grid_size_y); + fwrite(&gridtot[index], 1, sizeof(double), pFilereal); + fwrite(&gridtot[index+1], 1, sizeof(double), pFileimg); + //double v_norm = sqrt(gridtot[index]*gridtot[index]+gridtot[index+1]*gridtot[index+1]); + //fprintf (pFile, "%d %d %d %f %f %f\n", 
iu,iv,iw,gridtot[index],gridtot[index+1],v_norm); + } + #endif + fclose(pFilereal); + fclose(pFileimg); + } + + #ifdef USE_MPI + MPI_Win_fence(0,slabwin); + #endif +#endif //WRITE_DATA + + +#ifdef USE_FFTW + // FFT transform the data (using distributed FFTW) + + if(rank == 0)printf("PERFORMING FFT\n"); + clock_gettime(CLOCK_MONOTONIC, &begin); + start = clock(); + fftw_plan plan; + fftw_complex *fftwgrid; + ptrdiff_t alloc_local, local_n0, local_0_start; + double norm = 1.0/(double)(grid_size_x*grid_size_y); + + // map the 1D array of complex visibilities to a 2D array required by FFTW (complex[*][2]) + // x is the direction of contiguous data and maps to the second parameter + // y is the parallelized direction and corresponds to the first parameter (--> n0) + // and perform the FFT per w plane + alloc_local = fftw_mpi_local_size_2d(grid_size_y, grid_size_x, MPI_COMM_WORLD,&local_n0, &local_0_start); + fftwgrid = fftw_alloc_complex(alloc_local); + plan = fftw_mpi_plan_dft_2d(grid_size_y, grid_size_x, fftwgrid, fftwgrid, MPI_COMM_WORLD, FFTW_BACKWARD, FFTW_ESTIMATE); + + long fftwindex = 0; + long fftwindex2D = 0; + for (int iw=0; iw<num_w_planes; iw++) + { + //printf("FFTing plan %d\n",iw); + // select the w-plane to transform + for (int iv=0; iv<yaxis; iv++) + { + for (int iu=0; iu<xaxis; iu++) + { + fftwindex2D = iu + iv*xaxis; + fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis); + fftwgrid[fftwindex2D][0] = grid[fftwindex]; + fftwgrid[fftwindex2D][1] = grid[fftwindex+1]; + } + } + + // do the transform for each w-plane + fftw_execute(plan); + + // save the transformed w-plane + for (int iv=0; iv<yaxis; iv++) + { + for (int iu=0; iu<xaxis; iu++) + { + fftwindex2D = iu + iv*xaxis; + fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis); + gridss[fftwindex] = norm*fftwgrid[fftwindex2D][0]; + gridss[fftwindex+1] = norm*fftwgrid[fftwindex2D][1]; + } + } + + } + + fftw_destroy_plan(plan); + + #ifdef USE_MPI + MPI_Win_fence(0,slabwin); + MPI_Barrier(MPI_COMM_WORLD); + #endif + + end = clock(); + clock_gettime(CLOCK_MONOTONIC, &finish); + fftw_time = ((double) (end - start)) / CLOCKS_PER_SEC; + fftw_time1 = (finish.tv_sec - begin.tv_sec); + fftw_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; + clock_gettime(CLOCK_MONOTONIC, &begin); + +#ifdef WRITE_DATA + // Write results + #ifdef USE_MPI + MPI_Win writewin; + MPI_Win_create(gridss, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &writewin); + MPI_Win_fence(0,writewin); + #endif + if (rank == 0) + { + printf("WRITING FFT TRANSFORMED DATA\n"); + pFilereal = fopen (fftfile2,"wb"); + pFileimg = fopen (fftfile3,"wb"); + #ifdef USE_MPI + for (int isector=0; isector<nsectors; isector++) + { + MPI_Win_lock(MPI_LOCK_SHARED,isector,0,writewin); + MPI_Get(gridss_w,size_of_grid,MPI_DOUBLE,isector,0,size_of_grid,MPI_DOUBLE,writewin); + MPI_Win_unlock(isector,writewin); + for (long i=0; i<size_of_grid/2; i++) + { + gridss_real[i] = gridss_w[2*i]; + gridss_img[i] = gridss_w[2*i+1]; + } + if (num_w_planes > 1) + { + for (int iw=0; iw<num_w_planes; iw++) + for (int iv=0; iv<yaxis; iv++) + for (int iu=0; iu<xaxis; iu++) + { + long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*grid_size_x*grid_size_y)*sizeof(double); + long index = iu + iv*xaxis + iw*xaxis*yaxis; + fseek(pFilereal, global_index, SEEK_SET); + fwrite(&gridss_real[index], 1, sizeof(double), pFilereal); + } + for (int iw=0; iw<num_w_planes; iw++) + for (int iv=0; iv<yaxis; iv++) + for (int iu=0; iu<xaxis; iu++) + { + long global_index = (iu + (iv+isector*yaxis)*xaxis + 
iw*grid_size_x*grid_size_y)*sizeof(double); + long index = iu + iv*xaxis + iw*xaxis*yaxis; + fseek(pFileimg, global_index, SEEK_SET); + fwrite(&gridss_img[index], 1, sizeof(double), pFileimg); + } + } else { + fwrite(gridss_real, size_of_grid/2, sizeof(double), pFilereal); + fwrite(gridss_img, size_of_grid/2, sizeof(double), pFileimg); + } + + + } + #else + /* + for (int iw=0; iw<num_w_planes; iw++) + for (int iv=0; iv<grid_size_y; iv++) + for (int iu=0; iu<grid_size_x; iu++) + { + int isector = 0; + long index = 2*(iu + iv*grid_size_x + iw*grid_size_x*grid_size_y); + double v_norm = sqrt(gridtot[index]*gridtot[index]+gridtot[index+1]*gridtot[index+1]); + fprintf (pFile, "%d %d %d %f %f %f\n", iu,iv,iw,gridtot[index],gridtot[index+1],v_norm); + } + */ + #endif + fclose(pFilereal); + fclose(pFileimg); + } + + #ifdef USE_MPI + //MPI_Win_fence(0,writewin); + MPI_Win_fence(0,writewin); + MPI_Win_free(&writewin); + MPI_Barrier(MPI_COMM_WORLD); + #endif +#endif //WRITE_DATA + + fftw_free(fftwgrid); + +#endif //USE_FFTW + + end = clock(); + clock_gettime(CLOCK_MONOTONIC, &finish); + tot_time = ((double) (end - start0)) / CLOCKS_PER_SEC; + tot_time1 = (finish.tv_sec - begin0.tv_sec); + tot_time1 += (finish.tv_nsec - begin0.tv_nsec) / 1000000000.0; + + if (rank == 0) + { + printf("Setup time: %f sec\n",setup_time); + printf("Process time: %f sec\n",process_time); + printf("Kernel time = %f, Array Composition time %f, Reduce time: %f sec\n",kernel_time,compose_time,reduce_time); +#ifdef USE_FFTW + printf("FFTW time: %f sec\n",fftw_time); +#endif + printf("TOT time: %f sec\n",tot_time); + if(num_threads > 1) + { + printf("PSetup time: %f sec\n",setup_time1); + printf("PProcess time: %f sec\n",process_time1); + printf("PKernel time = %f, PArray Composition time %f, PReduce time: %f sec\n",kernel_time1,compose_time1,reduce_time1); +#ifdef USE_FFTW + printf("PFFTW time: %f sec\n",fftw_time1); +#endif + printf("PTOT time: %f sec\n",tot_time1); + } + } + + + // Close MPI environment + #ifdef USE_MPI + MPI_Win_fence(0,slabwin); + MPI_Win_free(&slabwin); + MPI_Finalize(); + #endif +} diff --git a/w-stacking.cu b/w-stacking.cu new file mode 100644 index 0000000000000000000000000000000000000000..9d91d76a27627505cd2da7b487eedf57c49fbe96 --- /dev/null +++ b/w-stacking.cu @@ -0,0 +1,306 @@ +#ifdef _OPENMP +#include <omp.h> +#endif +#include "w-stacking.h" +#include <math.h> +#include <stdlib.h> +#include <stdio.h> +#define NWORKERS -1 //100 +#define NTHREADS 32 + +#ifdef __CUDACC__ +double __device__ +#else +double +#endif +gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist) +{ + double conv_weight; + conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22); + return conv_weight; +} + +#ifdef __CUDACC__ +//double __device__ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist) +//{ +// double conv_weight; +// conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22); +// return conv_weight; +//} + +__global__ void convolve_g( + int num_w_planes, + long num_points, + long freq_per_chan, + long polarizations, + double* uu, + double* vv, + double* ww, + float* vis_real, + float* vis_img, + float* weight, + double dx, + double dw, + int KernelLen, + int grid_size_x, + int grid_size_y, + double* grid, + double std22) + +{ + //printf("DENTRO AL KERNEL\n"); + long gid = blockIdx.x*blockDim.x + threadIdx.x; + if(gid < num_points) + { + long i = gid; + long visindex = i*freq_per_chan*polarizations; + double norm = std22/PI; + + int j, k; + + /* Convert UV 
coordinates to grid coordinates. */ + double pos_u = uu[i] / dx; + double pos_v = vv[i] / dx; + double ww_i = ww[i] / dw; + int grid_w = (int)ww_i; + int grid_u = (int)pos_u; + int grid_v = (int)pos_v; + + // check the boundaries + long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; + long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; + long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; + long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1; + //printf("%ld, %ld, %ld, %ld\n",jmin,jmax,kmin,kmax); + + + // Convolve this point onto the grid. + for (k = kmin; k <= kmax; k++) + { + + double v_dist = (double)k+0.5 - pos_v; + + for (j = jmin; j <= jmax; j++) + { + double u_dist = (double)j+0.5 - pos_u; + long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); + //printf("--> %ld %d %d %d\n",iKer,j,k,grid_w); + + double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist); + // Loops over frequencies and polarizations + double add_term_real = 0.0; + double add_term_img = 0.0; + long ifine = visindex; + for (long ifreq=0; ifreq<freq_per_chan; ifreq++) + { + long iweight = visindex/freq_per_chan; + for (long ipol=0; ipol<polarizations; ipol++) + { + double vistest = (double)vis_real[ifine]; + if (!isnan(vistest)) + { + add_term_real += weight[iweight] * vis_real[ifine] * conv_weight; + add_term_img += weight[iweight] * vis_img[ifine] * conv_weight; + } + ifine++; + iweight++; + } + } + atomicAdd(&(grid[iKer]),add_term_real); + atomicAdd(&(grid[iKer+1]),add_term_img); + } + } + } +} +#endif + +void wstack( + int num_w_planes, + long num_points, + long freq_per_chan, + long polarizations, + double* uu, + double* vv, + double* ww, + float* vis_real, + float* vis_img, + float* weight, + double dx, + double dw, + int w_support, + int grid_size_x, + int grid_size_y, + double* grid, + int num_threads) +{ + long i; + long index; + long visindex; + + // initialize the convolution kernel + // gaussian: + int KernelLen = (w_support-1)/2; + double std = 1.0; + double std22 = 1.0/(2.0*std*std); + double norm = std22/PI; + + // Loop over visibilities. 
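+    // Each visibility is spread over a (2*KernelLen+1)^2 patch of (u,v) cells of
+    // its w-plane, weighted by gauss_kernel_norm(): with std = 1 this is a
+    // unit-variance Gaussian (in grid-cell units) truncated at the w_support window.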
+// Switch between CUDA and GPU versions +#ifdef __CUDACC__ + // Define the CUDA set up + int Nth = NTHREADS; + int Nbl = num_points/Nth + 1; + if(NWORKERS == 1) {Nbl = 1; Nth = 1;}; + long Nvis = num_points*freq_per_chan*polarizations; + printf("Running on GPU with %d threads and %d blocks\n",Nth,Nbl); + + // Create GPU arrays and offload them + double * uu_g; + double * vv_g; + double * ww_g; + float * vis_real_g; + float * vis_img_g; + float * weight_g; + double * grid_g; + + //for (int i=0; i<100000; i++)grid[i]=23.0; + cudaError_t mmm; + mmm=cudaMalloc(&uu_g,num_points*sizeof(double)); + mmm=cudaMalloc(&vv_g,num_points*sizeof(double)); + mmm=cudaMalloc(&ww_g,num_points*sizeof(double)); + mmm=cudaMalloc(&vis_real_g,Nvis*sizeof(float)); + mmm=cudaMalloc(&vis_img_g,Nvis*sizeof(float)); + mmm=cudaMalloc(&weight_g,(Nvis/freq_per_chan)*sizeof(float)); + mmm=cudaMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double)); + mmm=cudaMemset(grid_g,0.0,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double)); + + mmm=cudaMemcpy(uu_g, uu, num_points*sizeof(double), cudaMemcpyHostToDevice); + mmm=cudaMemcpy(vv_g, vv, num_points*sizeof(double), cudaMemcpyHostToDevice); + mmm=cudaMemcpy(ww_g, ww, num_points*sizeof(double), cudaMemcpyHostToDevice); + mmm=cudaMemcpy(vis_real_g, vis_real, Nvis*sizeof(float), cudaMemcpyHostToDevice); + mmm=cudaMemcpy(vis_img_g, vis_img, Nvis*sizeof(float), cudaMemcpyHostToDevice); + mmm=cudaMemcpy(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), cudaMemcpyHostToDevice); + + // Call main GPU Kernel + convolve_g <<<Nbl,Nth>>> ( + num_w_planes, + num_points, + freq_per_chan, + polarizations, + uu_g, + vv_g, + ww_g, + vis_real_g, + vis_img_g, + weight_g, + dx, + dw, + KernelLen, + grid_size_x, + grid_size_y, + grid_g, + std22); + + mmm = cudaMemcpy(grid, grid_g, 2*num_w_planes*grid_size_x*grid_size_y*sizeof(double), cudaMemcpyDeviceToHost); + //for (int i=0; i<100000; i++)printf("%f\n",grid[i]); + printf("CUDA ERROR %s\n",cudaGetErrorString(mmm)); + mmm=cudaFree(uu_g); + mmm=cudaFree(vv_g); + mmm=cudaFree(ww_g); + mmm=cudaFree(vis_real_g); + mmm=cudaFree(vis_img_g); + mmm=cudaFree(weight_g); + mmm=cudaFree(grid_g); + +// Switch between CUDA and GPU versions +# else + +#ifdef _OPENMP + omp_set_num_threads(num_threads); +#endif + #pragma omp parallel for private(visindex) + for (i = 0; i < num_points; i++) + { +#ifdef _OPENMP + //int tid; + //tid = omp_get_thread_num(); + //printf("%d\n",tid); +#endif + + visindex = i*freq_per_chan*polarizations; + + double sum = 0.0; + int j, k; + //if (i%1000 == 0)printf("%ld\n",i); + + /* Convert UV coordinates to grid coordinates. */ + double pos_u = uu[i] / dx; + double pos_v = vv[i] / dx; + double ww_i = ww[i] / dw; + int grid_w = (int)ww_i; + int grid_u = (int)pos_u; + int grid_v = (int)pos_v; + + // check the boundaries + long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; + long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; + long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; + long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1; + //printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax); + + + // Convolve this point onto the grid. 
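+	    // The grid stores interleaved (real, imaginary) pairs: cell (j,k) of w-plane
+	    // grid_w sits at grid[2*(j + k*grid_size_x + grid_w*grid_size_x*grid_size_y)]
+	    // and the element after it; the atomic updates below keep the accumulation
+	    // safe when several OpenMP threads hit the same cell.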
+      for (k = kmin; k <= kmax; k++)
+      {
+
+         double v_dist = (double)k+0.5 - pos_v;
+
+         for (j = jmin; j <= jmax; j++)
+         {
+            double u_dist = (double)j+0.5 - pos_u;
+            long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
+
+            double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
+            // Loops over frequencies and polarizations
+            double add_term_real = 0.0;
+            double add_term_img = 0.0;
+            long ifine = visindex;
+            // DAV: the following two loops are performed by each thread separately, so there are no race conditions
+            for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
+            {
+               long iweight = visindex/freq_per_chan;
+               for (long ipol=0; ipol<polarizations; ipol++)
+               {
+                  if (!isnan(vis_real[ifine]))
+                  {
+                     //printf("%f %ld\n",weight[iweight],iweight);
+                     add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
+                     add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
+                     //if(vis_img[ifine]>1e10 || vis_img[ifine]<-1e10)printf("%f %f %f %f %ld %ld\n",vis_real[ifine],vis_img[ifine],weight[iweight],conv_weight,ifine,num_points*freq_per_chan*polarizations);
+                  }
+                  ifine++;
+                  iweight++;
+               }
+            }
+            // DAV: this is the critical call in terms of correctness of the results and of performance
+            #pragma omp atomic
+            grid[iKer] += add_term_real;
+            #pragma omp atomic
+            grid[iKer+1] += add_term_img;
+         }
+      }
+
+   }
+// End switch between CUDA and CPU versions
+#endif
+   //for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
+}
+
+int test(int nnn)
+{
+   int mmm;
+
+   mmm = nnn+1;
+   return mmm;
+}
+
diff --git a/w-stacking.h b/w-stacking.h
new file mode 100644
index 0000000000000000000000000000000000000000..4b8bdc1a10c3212509537eea9e455b39d839e241
--- /dev/null
+++ b/w-stacking.h
@@ -0,0 +1,32 @@
+#ifndef W_PROJECT_H_
+#define W_PROJECT_H_
+
+#define PI 3.14159265359
+#define REAL_TYPE double
+#ifdef __CUDACC__
+extern "C"
+#endif
+void wstack(
+     int,
+     long,
+     long,
+     long,
+     double*,
+     double*,
+     double*,
+     float*,
+     float*,
+     float*,
+     double,
+     double,
+     int,
+     int,
+     int,
+     double*,
+     int);
+#ifdef __CUDACC__
+extern "C"
+#endif
+int test(int nnn);
+
+#endif
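A minimal usage sketch for the wstack() entry point declared in w-stacking.h (not part of the patch: the file name example.c, the synthetic values and the build line are illustrative assumptions). It grids three unit-weight, unit-amplitude visibilities on a single w-plane through the CPU path, assuming w-stacking.c has been generated from w-stacking.cu as the Makefile's w-stacking.c rule does, e.g. gcc -fopenmp example.c w-stacking.c -lm.

/* example.c -- hypothetical driver, not part of this patch */
#include <stdio.h>
#include <stdlib.h>
#include "w-stacking.h"

int main(void)
{
    const int  grid_size     = 64;          /* 64x64 uv grid                 */
    const int  num_w_planes  = 1;           /* single w-plane                */
    const long num_points    = 3;           /* three synthetic samples       */
    const long freq_per_chan = 1, polarizations = 1;
    const int  w_support     = 7;           /* kernel support, as in main()  */
    const double dx = 1.0 / grid_size;      /* cell size in (u,v)            */
    const double dw = 1.0 / num_w_planes;   /* w-plane thickness             */

    /* (u,v,w) are expected in [0,1): uu[i]/dx gives the grid column. */
    double uu[3] = {0.25, 0.50, 0.75};
    double vv[3] = {0.50, 0.50, 0.50};
    double ww[3] = {0.10, 0.10, 0.10};
    float  vis_real[3] = {1.0f, 1.0f, 1.0f};
    float  vis_img[3]  = {0.0f, 0.0f, 0.0f};
    float  weight[3]   = {1.0f, 1.0f, 1.0f};

    /* grid holds interleaved (real, imaginary) pairs per cell and w-plane. */
    long ncells = 2L * num_w_planes * grid_size * grid_size;
    double *grid = (double *) calloc(ncells, sizeof(double));

    wstack(num_w_planes, num_points, freq_per_chan, polarizations,
           uu, vv, ww, vis_real, vis_img, weight,
           dx, dw, w_support, grid_size, grid_size, grid, 2);

    /* Sum the gridded real part over all cells. */
    double sum = 0.0;
    for (long i = 0; i < ncells; i += 2) sum += grid[i];
    printf("sum of gridded real part: %f\n", sum);

    free(grid);
    return 0;
}

With w_support = 7 and std = 1 the truncated Gaussian kernel is approximately normalised, so the printed sum should come out close to 3.0, the total weighted flux of the three synthetic visibilities.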