first commit

9bf1af25 · Claudio Gheller · c103410e · 9bf1af25 · 9bf1af25 · 9bf1af25
Commit 9bf1af25 authored 3 years ago by Claudio Gheller
--- a/Makefile
+++ b/Makefile
+# Feature switches handed to the C/C++ compilers as preprocessor macros.
+OPT += -DUSE_MPI
+OPT += -DUSE_FFTW
+OPT += -DONE_SIDE
+OPT += -DWRITE_DATA
+
+# Plain compilers for the serial builds, MPI wrappers for the mpi builds.
+CC = gcc
+CXX = g++
+MPICC = mpicc
+MPICXX = mpiCC
+
+# NOTE(review): -mcpu=native selects the host CPU with GCC on POWER/ARM; on
+# x86 the usual spelling is -march=native -- confirm against the build host.
+CFLAGS += -O3 -mcpu=native
+CFLAGS += -I.
+# FFTW_LIB is not set in this file: it has to come from the environment or
+# the make command line and name the directory holding the FFTW libraries.
+LIBS = -L$(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm
+
+NVCC = nvcc
+# Options only: each nvcc recipe supplies "-c w-stacking.cu" itself, so the
+# source file is not repeated here (it used to reach nvcc twice).
+NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11
+# Override CUDA_LIB to link against a different CUDA installation.
+CUDA_LIB ?= /cineca/prod/opt/compilers/cuda/10.1/none/lib64/
+NVLIB = -L$(CUDA_LIB) -lcudart -lcuda
+
+# Header every object depends on, and the objects of the pure C builds.
+DEPS = w-stacking.h
+COBJ = w-stacking.o w-stacking-fftw.o
+
+# w-stacking.c is a generated copy of the CUDA source, so that the same code
+# can go through the C compiler. Listing the .cu file as a prerequisite makes
+# the copy refresh whenever the original is edited.
+w-stacking.c: w-stacking.cu
+	cp $< $@
+
+# Generic C compilation: an object is rebuilt when its source file or any
+# header listed in DEPS changes.
+%.o: %.c $(DEPS)
+	$(CC) $(CFLAGS) $(OPT) -c $< -o $@
+
+# CPU build linked without the MPI/FFTW libraries. "serial" names a command,
+# not a file, so it is declared phony: a stray file called "serial" must not
+# suppress the link step.
+.PHONY: serial
+serial: $(COBJ)
+	$(CC) -o w-stackingCfftw_serial $(CFLAGS) $^ -lm
+
+# GPU build without MPI: nvcc compiles w-stacking.cu, the C compiler compiles
+# w-stacking-fftw.c, and the C++ compiler links both against the CUDA runtime.
+# The target is a command name, hence phony.
+# Note: this recipe writes w-stacking.o, the same object name the C builds use.
+.PHONY: serial_cuda
+serial_cuda:
+	$(NVCC) $(NVFLAGS) -c w-stacking.cu $(NVLIB)
+	$(CC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c
+	$(CXX) $(CFLAGS) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o $(NVLIB) -lm
+
+# MPI + FFTW CPU build. The target is a command name, hence phony.
+# The target-specific CC makes the pattern rule compile the prerequisite
+# objects with the MPI wrapper too, matching the compiler used for the link
+# (OPT enables -DUSE_MPI, so the sources are expected to need the MPI headers).
+.PHONY: mpi
+mpi: CC = $(MPICC)
+mpi: $(COBJ)
+	$(MPICC) -o w-stackingCfftw $(CFLAGS) $^ $(LIBS)
+
+# MPI + FFTW GPU build: nvcc compiles w-stacking.cu, the MPI C wrapper compiles
+# w-stacking-fftw.c, and the MPI C++ wrapper links against CUDA and FFTW.
+# The target is a command name, hence phony.
+.PHONY: mpi_cuda
+mpi_cuda:
+	$(NVCC) $(NVFLAGS) -c w-stacking.cu $(NVLIB)
+	$(MPICC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c
+	$(MPICXX) $(CFLAGS) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o $(NVLIB) $(LIBS) -lm
+
+# Remove the objects and the generated C copy of the CUDA source.
+# $(RM) is "rm -f", so cleaning a tree where nothing was built is not an error.
+.PHONY: clean
+clean:
+	$(RM) *.o
+	$(RM) w-stacking.c
+
--- a/peano.c
+++ b/peano.c
+#include "peano.h"
+
+//typedef unsigned long long peanokey;
+
+/*  The following rewrite of the original function
+ *  peano_hilbert_key_old() has been written by MARTIN REINECKE.
+ *  It is about a factor 2.3 - 2.5 faster than Volker's old routine!
+ */
+/* Rotation-state transition table used by peano_hilbert_key():
+ * rottable3[rotation][pix] is the rotation state to use for the next bit
+ * level, given the current state (48 possible) and the 3-bit octant index
+ * pix (8 possible) formed from the current x, y and z bits. */
+const unsigned char rottable3[48][8] = {
+  {36, 28, 25, 27, 10, 10, 25, 27},
+  {29, 11, 24, 24, 37, 11, 26, 26},
+  {8, 8, 25, 27, 30, 38, 25, 27},
+  {9, 39, 24, 24, 9, 31, 26, 26},
+  {40, 24, 44, 32, 40, 6, 44, 6},
+  {25, 7, 33, 7, 41, 41, 45, 45},
+  {4, 42, 4, 46, 26, 42, 34, 46},
+  {43, 43, 47, 47, 5, 27, 5, 35},
+  {33, 35, 36, 28, 33, 35, 2, 2},
+  {32, 32, 29, 3, 34, 34, 37, 3},
+  {33, 35, 0, 0, 33, 35, 30, 38},
+  {32, 32, 1, 39, 34, 34, 1, 31},
+  {24, 42, 32, 46, 14, 42, 14, 46},
+  {43, 43, 47, 47, 25, 15, 33, 15},
+  {40, 12, 44, 12, 40, 26, 44, 34},
+  {13, 27, 13, 35, 41, 41, 45, 45},
+  {28, 41, 28, 22, 38, 43, 38, 22},
+  {42, 40, 23, 23, 29, 39, 29, 39},
+  {41, 36, 20, 36, 43, 30, 20, 30},
+  {37, 31, 37, 31, 42, 40, 21, 21},
+  {28, 18, 28, 45, 38, 18, 38, 47},
+  {19, 19, 46, 44, 29, 39, 29, 39},
+  {16, 36, 45, 36, 16, 30, 47, 30},
+  {37, 31, 37, 31, 17, 17, 46, 44},
+  {12, 4, 1, 3, 34, 34, 1, 3},
+  {5, 35, 0, 0, 13, 35, 2, 2},
+  {32, 32, 1, 3, 6, 14, 1, 3},
+  {33, 15, 0, 0, 33, 7, 2, 2},
+  {16, 0, 20, 8, 16, 30, 20, 30},
+  {1, 31, 9, 31, 17, 17, 21, 21},
+  {28, 18, 28, 22, 2, 18, 10, 22},
+  {19, 19, 23, 23, 29, 3, 29, 11},
+  {9, 11, 12, 4, 9, 11, 26, 26},
+  {8, 8, 5, 27, 10, 10, 13, 27},
+  {9, 11, 24, 24, 9, 11, 6, 14},
+  {8, 8, 25, 15, 10, 10, 25, 7},
+  {0, 18, 8, 22, 38, 18, 38, 22},
+  {19, 19, 23, 23, 1, 39, 9, 39},
+  {16, 36, 20, 36, 16, 2, 20, 10},
+  {37, 3, 37, 11, 17, 17, 21, 21},
+  {4, 17, 4, 46, 14, 19, 14, 46},
+  {18, 16, 47, 47, 5, 15, 5, 15},
+  {17, 12, 44, 12, 19, 6, 44, 6},
+  {13, 7, 13, 7, 18, 16, 45, 45},
+  {4, 42, 4, 21, 14, 42, 14, 23},
+  {43, 43, 22, 20, 5, 15, 5, 15},
+  {40, 12, 21, 12, 40, 6, 23, 6},
+  {13, 7, 13, 7, 41, 41, 22, 20}
+};
+
+/* Key-digit table used by peano_hilbert_key(): subpix3[rotation][pix] is the
+ * 3-bit value (0..7) OR-ed into the key for octant index pix while the curve
+ * is in the given rotation state. */
+const unsigned char subpix3[48][8] = {
+  {0, 7, 1, 6, 3, 4, 2, 5},
+  {7, 4, 6, 5, 0, 3, 1, 2},
+  {4, 3, 5, 2, 7, 0, 6, 1},
+  {3, 0, 2, 1, 4, 7, 5, 6},
+  {1, 0, 6, 7, 2, 3, 5, 4},
+  {0, 3, 7, 4, 1, 2, 6, 5},
+  {3, 2, 4, 5, 0, 1, 7, 6},
+  {2, 1, 5, 6, 3, 0, 4, 7},
+  {6, 1, 7, 0, 5, 2, 4, 3},
+  {1, 2, 0, 3, 6, 5, 7, 4},
+  {2, 5, 3, 4, 1, 6, 0, 7},
+  {5, 6, 4, 7, 2, 1, 3, 0},
+  {7, 6, 0, 1, 4, 5, 3, 2},
+  {6, 5, 1, 2, 7, 4, 0, 3},
+  {5, 4, 2, 3, 6, 7, 1, 0},
+  {4, 7, 3, 0, 5, 6, 2, 1},
+  {6, 7, 5, 4, 1, 0, 2, 3},
+  {7, 0, 4, 3, 6, 1, 5, 2},
+  {0, 1, 3, 2, 7, 6, 4, 5},
+  {1, 6, 2, 5, 0, 7, 3, 4},
+  {2, 3, 1, 0, 5, 4, 6, 7},
+  {3, 4, 0, 7, 2, 5, 1, 6},
+  {4, 5, 7, 6, 3, 2, 0, 1},
+  {5, 2, 6, 1, 4, 3, 7, 0},
+  {7, 0, 6, 1, 4, 3, 5, 2},
+  {0, 3, 1, 2, 7, 4, 6, 5},
+  {3, 4, 2, 5, 0, 7, 1, 6},
+  {4, 7, 5, 6, 3, 0, 2, 1},
+  {6, 7, 1, 0, 5, 4, 2, 3},
+  {7, 4, 0, 3, 6, 5, 1, 2},
+  {4, 5, 3, 2, 7, 6, 0, 1},
+  {5, 6, 2, 1, 4, 7, 3, 0},
+  {1, 6, 0, 7, 2, 5, 3, 4},
+  {6, 5, 7, 4, 1, 2, 0, 3},
+  {5, 2, 4, 3, 6, 1, 7, 0},
+  {2, 1, 3, 0, 5, 6, 4, 7},
+  {0, 1, 7, 6, 3, 2, 4, 5},
+  {1, 2, 6, 5, 0, 3, 7, 4},
+  {2, 3, 5, 4, 1, 0, 6, 7},
+  {3, 0, 4, 7, 2, 1, 5, 6},
+  {1, 0, 2, 3, 6, 7, 5, 4},
+  {0, 7, 3, 4, 1, 6, 2, 5},
+  {7, 6, 4, 5, 0, 1, 3, 2},
+  {6, 1, 5, 2, 7, 0, 4, 3},
+  {5, 4, 6, 7, 2, 3, 1, 0},
+  {4, 3, 7, 0, 5, 2, 6, 1},
+  {3, 2, 0, 1, 4, 5, 7, 6},
+  {2, 5, 1, 6, 3, 4, 0, 7}
+};
+
+/*! This function computes a Peano-Hilbert key for an integer triplet (x,y,z),
+  *  with x,y,z in the range between 0 and 2^bits-1.
+  *
+  *  The coordinates are consumed from bit (bits-1) down to bit 0. Every level
+  *  shifts the key left by three bits and ORs in one 3-bit digit, so a full
+  *  key occupies 3*bits bits of peanokey.
+  *
+  *  A non-positive bits returns the empty key 0 instead of evaluating a shift
+  *  by a negative count, which is undefined behaviour in C.
+  */
+peanokey peano_hilbert_key(int x, int y, int z, int bits)
+{
+  int mask;
+  unsigned char rotation = 0;
+  peanokey key = 0;
+
+  if(bits < 1)
+    return key;
+
+  for(mask = 1 << (bits - 1); mask > 0; mask >>= 1)
+    {
+      /* octant index at this level: x supplies bit 2, y bit 1, z bit 0 */
+      unsigned char pix = ((x & mask) ? 4 : 0) | ((y & mask) ? 2 : 0) | ((z & mask) ? 1 : 0);
+
+      key <<= 3;
+      key |= subpix3[rotation][pix];
+      /* the digit is looked up with the current state before the state advances */
+      rotation = rottable3[rotation][pix];
+    }
+
+  return key;
+}
--- a/peano.h
+++ b/peano.h
+typedef unsigned long long peanokey;
+
+peanokey peano_hilbert_key(int, int, int, int);
--- a/w-stacking-fftw.c
+++ b/w-stacking-fftw.c
--- a/w-stacking.cu
+++ b/w-stacking.cu
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+#include "w-stacking.h"
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+#define NWORKERS -1    //100
+#define NTHREADS 32
+
+/* Gaussian convolution weight:
+ *   norm * exp(-(u_dist^2 + v_dist^2) * std22)
+ * Built as a device function under nvcc and as an ordinary C function otherwise. */
+#ifdef __CUDACC__
+__device__ double
+#else
+double
+#endif
+gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
+{
+     const double dist2 = (u_dist*u_dist)+(v_dist*v_dist);
+     return norm * exp(-dist2*std22);
+}
+
+#ifdef __CUDACC__
+//double __device__ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
+//{
+//     double conv_weight;
+//     conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
+//     return conv_weight;
+//}
+
+/* GPU gridding kernel, one thread per visibility point.
+ *
+ * Thread gid handles point gid (threads with gid >= num_points do nothing).
+ * The point's (u,v,w) is converted to grid coordinates by dividing by dx
+ * (u and v) and dw (w). Its weighted visibilities, summed over all
+ * frequencies and polarizations, are then spread with a Gaussian kernel over
+ * the cells within KernelLen of the point, clipped to the grid edges, on
+ * w-plane grid_w.
+ *
+ * grid stores interleaved (real, imaginary) doubles, x fastest, then y, then
+ * the w-plane. Updates use atomicAdd because several threads can hit the
+ * same cell.
+ *
+ * weight is indexed per point and polarization: the weight index restarts at
+ * the point's first weight for every frequency.
+ *
+ * NOTE(review): num_w_planes is not used and grid_w is not range-checked; the
+ * caller presumably guarantees 0 <= ww/dw < num_w_planes -- confirm.
+ */
+__global__ void convolve_g(
+     int num_w_planes,
+     long num_points,
+     long freq_per_chan,
+     long polarizations,
+     double* uu,
+     double* vv,
+     double* ww,
+     float* vis_real,
+     float* vis_img,
+     float* weight,
+     double dx,
+     double dw,
+     int KernelLen,
+     int grid_size_x,
+     int grid_size_y,
+     double* grid,
+     double std22)
+
+{
+        // widen before multiplying: blockIdx.x*blockDim.x alone is 32-bit unsigned
+        long gid = (long)blockIdx.x*(long)blockDim.x + (long)threadIdx.x;
+        if(gid >= num_points)
+            return;
+
+        long visindex = gid*freq_per_chan*polarizations;
+        double norm = std22/PI;
+
+        /* Convert UV coordinates to grid coordinates. */
+        double pos_u = uu[gid] / dx;
+        double pos_v = vv[gid] / dx;
+        double ww_i  = ww[gid] / dw;
+        int grid_w = (int)ww_i;
+        int grid_u = (int)pos_u;
+        int grid_v = (int)pos_v;
+
+        // clip the kernel footprint to the grid boundaries
+        long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
+        long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
+        long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
+        long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
+
+        // Cell offset of the target w-plane, computed in long: the int product
+        // grid_w*grid_size_x*grid_size_y overflows for large grids.
+        long plane_offset = (long)grid_w * (long)grid_size_x * (long)grid_size_y;
+
+        // Convolve this point onto the grid.
+        for (long k = kmin; k <= kmax; k++)
+        {
+            double v_dist = (double)k+0.5 - pos_v;
+
+            for (long j = jmin; j <= jmax; j++)
+            {
+                double u_dist = (double)j+0.5 - pos_u;
+                // factor 2: each cell holds a (real, imaginary) pair
+                long iKer = 2 * (j + k*(long)grid_size_x + plane_offset);
+
+                double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
+                // Loops over frequencies and polarizations
+                double add_term_real = 0.0;
+                double add_term_img = 0.0;
+                long ifine = visindex;
+                for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
+                {
+                   // weights restart at the point's first weight for every frequency
+                   long iweight = visindex/freq_per_chan;
+                   for (long ipol=0; ipol<polarizations; ipol++)
+                   {
+                      double vistest = (double)vis_real[ifine];
+                      // entries whose real part is NaN are skipped
+                      if (!isnan(vistest))
+                      {
+                         add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
+                         add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
+                      }
+                      ifine++;
+                      iweight++;
+                   }
+                }
+                atomicAdd(&(grid[iKer]),add_term_real);
+                atomicAdd(&(grid[iKer+1]),add_term_img);
+            }
+        }
+}
+#endif
+
+#ifdef __CUDACC__
+/* Reports a failed CUDA runtime call on stderr.
+ * Returns 1 when status is an error, 0 when it is cudaSuccess. */
+static int cuda_call_failed(cudaError_t status, const char *what)
+{
+    if (status == cudaSuccess)
+        return 0;
+    fprintf(stderr, "CUDA ERROR in %s: %s\n", what, cudaGetErrorString(status));
+    return 1;
+}
+#endif
+
+/* Grids num_points visibilities onto a stack of w-planes with a Gaussian
+ * convolution kernel.
+ *
+ *   num_w_planes   number of w-planes in grid
+ *   num_points     number of (u,v,w) points
+ *   freq_per_chan  frequencies per point
+ *   polarizations  polarizations per frequency
+ *   uu, vv, ww     coordinates, num_points doubles each
+ *   vis_real/img   visibilities, num_points*freq_per_chan*polarizations floats
+ *   weight         weights, num_points*polarizations floats
+ *   dx, dw         divisors turning u,v (dx) and w (dw) into grid coordinates
+ *   w_support      kernel support; the half-width used is (w_support-1)/2 cells
+ *   grid_size_x/y  plane dimensions in cells
+ *   grid           2*num_w_planes*grid_size_x*grid_size_y doubles holding
+ *                  interleaved (real, imaginary) pairs
+ *   num_threads    OpenMP thread count (CPU build only)
+ *
+ * Built with nvcc, the work runs on the GPU: inputs are copied to the device,
+ * convolve_g fills a zero-initialised device grid and the result overwrites
+ * grid. Any failing CUDA call is reported on stderr, the remaining device
+ * work is skipped and grid is left untouched.
+ *
+ * Built as plain C, the points are processed by an (optionally OpenMP)
+ * loop that accumulates into grid with atomic updates; grid is not cleared
+ * first.
+ */
+void wstack(
+     int num_w_planes,
+     long num_points,
+     long freq_per_chan,
+     long polarizations,
+     double* uu,
+     double* vv,
+     double* ww,
+     float* vis_real,
+     float* vis_img,
+     float* weight,
+     double dx,
+     double dw,
+     int w_support,
+     int grid_size_x,
+     int grid_size_y,
+     double* grid,
+     int num_threads)
+{
+    // initialize the convolution kernel
+    // gaussian:
+    int KernelLen = (w_support-1)/2;
+    double std = 1.0;
+    double std22 = 1.0/(2.0*std*std);
+
+// Switch between CUDA and CPU versions
+#ifdef __CUDACC__
+    // Define the CUDA set up
+    int Nth = NTHREADS;
+    int Nbl = num_points/Nth + 1;
+    if(NWORKERS == 1) {Nbl = 1; Nth = 1;};
+    long Nvis = num_points*freq_per_chan*polarizations;
+    printf("Running on GPU with %d threads and %d blocks\n",Nth,Nbl);
+
+    // Buffer sizes in size_t: the int product 2*num_w_planes*grid_size_x*grid_size_y
+    // overflows for large grids before it ever meets sizeof(double).
+    size_t coord_bytes  = (size_t)num_points*sizeof(double);
+    size_t vis_bytes    = (size_t)Nvis*sizeof(float);
+    size_t weight_bytes = (size_t)(Nvis/freq_per_chan)*sizeof(float);
+    size_t grid_bytes   = (size_t)2*num_w_planes*grid_size_x*grid_size_y*sizeof(double);
+
+    // Device arrays start as NULL so the cudaFree calls at the end are safe
+    // even if an allocation failed part-way through.
+    double * uu_g = NULL;
+    double * vv_g = NULL;
+    double * ww_g = NULL;
+    float * vis_real_g = NULL;
+    float * vis_img_g = NULL;
+    float * weight_g = NULL;
+    double * grid_g = NULL;
+
+    // "ok &&" short-circuits: after the first failure no further CUDA work is issued
+    int ok = 1;
+    ok = ok && !cuda_call_failed(cudaMalloc(&uu_g,coord_bytes), "cudaMalloc(uu)");
+    ok = ok && !cuda_call_failed(cudaMalloc(&vv_g,coord_bytes), "cudaMalloc(vv)");
+    ok = ok && !cuda_call_failed(cudaMalloc(&ww_g,coord_bytes), "cudaMalloc(ww)");
+    ok = ok && !cuda_call_failed(cudaMalloc(&vis_real_g,vis_bytes), "cudaMalloc(vis_real)");
+    ok = ok && !cuda_call_failed(cudaMalloc(&vis_img_g,vis_bytes), "cudaMalloc(vis_img)");
+    ok = ok && !cuda_call_failed(cudaMalloc(&weight_g,weight_bytes), "cudaMalloc(weight)");
+    ok = ok && !cuda_call_failed(cudaMalloc(&grid_g,grid_bytes), "cudaMalloc(grid)");
+    ok = ok && !cuda_call_failed(cudaMemset(grid_g,0,grid_bytes), "cudaMemset(grid)");
+
+    ok = ok && !cuda_call_failed(cudaMemcpy(uu_g, uu, coord_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(uu)");
+    ok = ok && !cuda_call_failed(cudaMemcpy(vv_g, vv, coord_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(vv)");
+    ok = ok && !cuda_call_failed(cudaMemcpy(ww_g, ww, coord_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(ww)");
+    ok = ok && !cuda_call_failed(cudaMemcpy(vis_real_g, vis_real, vis_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(vis_real)");
+    ok = ok && !cuda_call_failed(cudaMemcpy(vis_img_g, vis_img, vis_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(vis_img)");
+    ok = ok && !cuda_call_failed(cudaMemcpy(weight_g, weight, weight_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(weight)");
+
+    if (ok)
+    {
+        // Call main GPU Kernel
+        convolve_g <<<Nbl,Nth>>> (
+               num_w_planes,
+               num_points,
+               freq_per_chan,
+               polarizations,
+               uu_g,
+               vv_g,
+               ww_g,
+               vis_real_g,
+               vis_img_g,
+               weight_g,
+               dx,
+               dw,
+               KernelLen,
+               grid_size_x,
+               grid_size_y,
+               grid_g,
+               std22);
+        // a bad launch configuration is only visible through cudaGetLastError
+        ok = !cuda_call_failed(cudaGetLastError(), "convolve_g launch");
+    }
+
+    // the copy back also returns errors raised while the kernel was running
+    ok = ok && !cuda_call_failed(cudaMemcpy(grid, grid_g, grid_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(grid)");
+
+    cudaFree(uu_g);
+    cudaFree(vv_g);
+    cudaFree(ww_g);
+    cudaFree(vis_real_g);
+    cudaFree(vis_img_g);
+    cudaFree(weight_g);
+    cudaFree(grid_g);
+
+// Switch between CUDA and CPU versions
+# else
+
+    long i;
+    double norm = std22/PI;
+
+#ifdef _OPENMP
+    omp_set_num_threads(num_threads);
+#endif
+    // Loop over visibilities.
+    #pragma omp parallel for
+    for (i = 0; i < num_points; i++)
+    {
+        long visindex = i*freq_per_chan*polarizations;
+        int j, k;
+
+        /* Convert UV coordinates to grid coordinates. */
+        double pos_u = uu[i] / dx;
+        double pos_v = vv[i] / dx;
+        double ww_i  = ww[i] / dw;
+        int grid_w = (int)ww_i;
+        int grid_u = (int)pos_u;
+        int grid_v = (int)pos_v;
+
+        // clip the kernel footprint to the grid boundaries
+        long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
+        long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
+        long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
+        long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
+
+        // Cell offset of the target w-plane, computed in long: the int product
+        // grid_w*grid_size_x*grid_size_y overflows for large grids.
+        long plane_offset = (long)grid_w * (long)grid_size_x * (long)grid_size_y;
+
+        // Convolve this point onto the grid.
+        for (k = kmin; k <= kmax; k++)
+        {
+
+            double v_dist = (double)k+0.5 - pos_v;
+
+            for (j = jmin; j <= jmax; j++)
+            {
+                double u_dist = (double)j+0.5 - pos_u;
+                // factor 2: each cell holds a (real, imaginary) pair
+                long iKer = 2 * ((long)j + (long)k*grid_size_x + plane_offset);
+
+                double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
+                // Loops over frequencies and polarizations
+                double add_term_real = 0.0;
+                double add_term_img = 0.0;
+                long ifine = visindex;
+                // DAV: the following two loops are performed by each thread separately: no problems of race conditions
+                for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
+                {
+                   // weights restart at the point's first weight for every frequency
+                   long iweight = visindex/freq_per_chan;
+                   for (long ipol=0; ipol<polarizations; ipol++)
+                   {
+                      // entries whose real part is NaN are skipped
+                      if (!isnan(vis_real[ifine]))
+                      {
+                         add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
+                         add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
+                      }
+                      ifine++;
+                      iweight++;
+                   }
+                }
+                // DAV: this is the critical call in terms of correctness of the results and of performance
+                #pragma omp atomic
+                grid[iKer] += add_term_real;
+                #pragma omp atomic
+                grid[iKer+1] += add_term_img;
+            }
+        }
+
+    }
+// End switch between CUDA and CPU versions
+#endif
+}
+
+/* Returns its argument incremented by one. */
+int test(int nnn)
+{
+	return nnn + 1;
+}
+
--- a/w-stacking.h
+++ b/w-stacking.h
+#ifndef W_PROJECT_H_
+#define W_PROJECT_H_
+
+#define PI 3.14159265359
+#define REAL_TYPE double
+
+/* Give both entry points C linkage whenever a C++ compiler reads this header
+ * (nvcc included), so that objects built by the C and the C++/CUDA compilers
+ * agree on the symbol names. */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Grids num_points visibilities onto num_w_planes planes of
+ * grid_size_x * grid_size_y cells. grid receives interleaved
+ * (real, imaginary) doubles. */
+void wstack(
+     int num_w_planes,
+     long num_points,
+     long freq_per_chan,
+     long polarizations,
+     double* uu,
+     double* vv,
+     double* ww,
+     float* vis_real,
+     float* vis_img,
+     float* weight,
+     double dx,
+     double dw,
+     int w_support,
+     int grid_size_x,
+     int grid_size_y,
+     double* grid,
+     int num_threads);
+
+/* Returns nnn + 1. */
+int test(int nnn);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif