From 15e3f465b28ceecf0961ba937db63ad556278882 Mon Sep 17 00:00:00 2001
From: Claudio Gheller <cgheller@login01.m100.cineca.it>
Date: Mon, 2 Jan 2023 10:24:26 +0100
Subject: [PATCH] new degrid function introduced

---
 .gitignore        |   1 +
 Makefile          |  13 +-
 convkernels.c     |  20 +++
 convkernels.cu    |  20 +++
 convkernels.h     |  13 ++
 degrid.c          | 314 ++++++++++++++++++++++++++++++++++++++++++++++
 degrid.cu         | 314 ++++++++++++++++++++++++++++++++++++++++++++++
 degrid.h          |  31 +++++
 inverse-imaging.c | 103 ++++++++-------
 w-stacking.cu     |  15 +--
 10 files changed, 781 insertions(+), 63 deletions(-)
 create mode 100644 convkernels.c
 create mode 100644 convkernels.cu
 create mode 100644 convkernels.h
 create mode 100644 degrid.c
 create mode 100644 degrid.cu
 create mode 100644 degrid.h

diff --git a/.gitignore b/.gitignore
index 8edc32e..c1ad20c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,4 @@ w-stackingCfftw_serial
 w-stackingfftw_serial
 inverse-imaging
 .*
+*.tar
diff --git a/Makefile b/Makefile
index 93ab885..5477aaf 100644
--- a/Makefile
+++ b/Makefile
@@ -42,12 +42,18 @@ ifeq (FITSIO,$(findstring FITSIO,$(OPT)))
 endif
 
 
-DEPS = w-stacking.h w-stacking-fftw.c w-stacking.cu phase_correction.cu
-COBJ = w-stacking.o w-stacking-fftw.o phase_correction.o
+DEPS = w-stacking.h w-stacking-fftw.c w-stacking.cu phase_correction.cu degrid.cu convkernels.cu
+COBJ = w-stacking.o w-stacking-fftw.o phase_correction.o degrid.o convkernels.o
+
+convkernels.c: convkernels.cu
+	cp convkernels.cu convkernels.c
 
 w-stacking.c: w-stacking.cu
 	cp w-stacking.cu w-stacking.c
 
+degrid.c: degrid.cu
+	cp degrid.cu degrid.c
+
 phase_correction.c: phase_correction.cu
 	cp phase_correction.cu phase_correction.c
 
@@ -78,7 +84,7 @@ serial_cuda:
 
 mpi: $(COBJ)
 	$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw   $^  $(CFLAGS) $(LIBS)
-	$(MPICC) $(OPTIMIZE) $(OPT) -o inverse-imaging inverse-imaging.c w-stacking.c $(CFLAGS) $(LIBS)
+	$(MPICC) $(OPTIMIZE) $(OPT) -o inverse-imaging inverse-imaging.c degrid.c convkernels.c $(CFLAGS) $(LIBS)
 
 mpi_cuda:
 	$(NVCC)   $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
@@ -89,3 +95,4 @@ clean:
 	rm *.o
 	rm w-stacking.c
 	rm phase_correction.c
+	rm degrid.c
diff --git a/convkernels.c b/convkernels.c
new file mode 100644
index 0000000..e06872e
--- /dev/null
+++ b/convkernels.c
@@ -0,0 +1,20 @@
+#include "convkernels.h"
+
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#ifdef ACCOMP
+#pragma omp  declare target
+#endif
+#ifdef __CUDACC__
+double __device__
+#else
+double
+#endif
+gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
+{
+     double conv_weight;
+     conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
+     return conv_weight;
+}
diff --git a/convkernels.cu b/convkernels.cu
new file mode 100644
index 0000000..e06872e
--- /dev/null
+++ b/convkernels.cu
@@ -0,0 +1,20 @@
+#include "convkernels.h"
+
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#ifdef ACCOMP
+#pragma omp  declare target
+#endif
+#ifdef __CUDACC__
+double __device__
+#else
+double
+#endif
+gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
+{
+     double conv_weight;
+     conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
+     return conv_weight;
+}
diff --git a/convkernels.h b/convkernels.h
new file mode 100644
index 0000000..8194467
--- /dev/null
+++ b/convkernels.h
@@ -0,0 +1,13 @@
+#ifndef W_KERNELS
+#define W_KERNELS
+
+#ifdef ACCOMP
+#pragma omp  declare target
+#endif
+#ifdef __CUDACC__
+double __device__
+#else
+double
+#endif
+gauss_kernel_norm(double, double, double, double);
+#endif
diff --git a/degrid.c b/degrid.c
new file mode 100644
index 0000000..87dca3d
--- /dev/null
+++ b/degrid.c
@@ -0,0 +1,314 @@
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+#include "convkernels.h"
+#include "degrid.h"
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#ifdef ACCOMP
+#pragma omp end declare target
+#endif
+
+#ifdef __CUDACC__
+//double __device__ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
+//{
+//     double conv_weight;
+//     conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
+//     return conv_weight;
+//}
+
+__global__ void convolve_g(
+     int num_w_planes,
+     long num_points,
+     long freq_per_chan,
+     long polarizations,
+     double* uu,
+     double* vv,
+     double* ww,
+     float* vis_real,
+     float* vis_img,
+     float* weight,
+     double dx,
+     double dw,
+     int KernelLen,
+     int grid_size_x,
+     int grid_size_y,
+     double* grid,
+     double std22)
+
+{
+        long gid = blockIdx.x*blockDim.x + threadIdx.x;
+	if(gid < num_points)
+	{
+	long i = gid;
+        long visindex = i*freq_per_chan*polarizations;
+	double norm = std22/PI;
+
+        int j, k;
+
+        /* Convert UV coordinates to grid coordinates. */
+        double pos_u = uu[i] / dx;
+        double pos_v = vv[i] / dx;
+        double ww_i  = ww[i] / dw;
+        int grid_w = (int)ww_i;
+        int grid_u = (int)pos_u;
+        int grid_v = (int)pos_v;
+
+        // check the boundaries
+        long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
+        long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
+        long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
+        long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
+        //printf("%ld, %ld, %ld, %ld\n",jmin,jmax,kmin,kmax);
+
+
+        // Convolve this point onto the grid.
+        for (k = kmin; k <= kmax; k++)
+        {
+
+            double v_dist = (double)k+0.5 - pos_v;
+
+            for (j = jmin; j <= jmax; j++)
+            {
+                double u_dist = (double)j+0.5 - pos_u;
+                long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
+		//printf("--> %ld %d %d %d\n",iKer,j,k,grid_w);
+
+                double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
+                // Loops over frequencies and polarizations
+                double add_term_real = 0.0;
+                double add_term_img = 0.0;
+                long ifine = visindex;
+		// FOR THE MOMENT ASSUME ONE FREQUENCY PER CHANNEL AND ONE POLARISAZION
+		// FIRST TERM IS CALCULATED, REMAINING TERMS ARE SKIPPED
+	        // COMMENTED DOUBLE LOOP BELOW IS THE ORIGINAL GRIDDING ALGORITHM (FOR REFERENCE)
+                vis_real[ifine] += grid[iKer]*conv_weight;
+                vis_img[ifine] += grid[iKer]*conv_weight;
+
+		/*
+                for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
+		{
+		   long iweight = visindex/freq_per_chan;
+                   for (long ipol=0; ipol<polarizations; ipol++)
+                   {
+                      double vistest = (double)vis_real[ifine];
+                      if (!isnan(vistest))
+                      {
+                         add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
+                         add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
+		      }
+                      ifine++;
+		      iweight++;
+                   }
+		}
+		atomicAdd(&(grid[iKer]),add_term_real);
+		atomicAdd(&(grid[iKer+1]),add_term_img);
+                */
+
+            }
+        }
+	}
+}
+#endif
+#ifdef ACCOMP
+#pragma  omp declare target
+#endif
+void degrid(
+     int num_w_planes,
+     long num_points,
+     long freq_per_chan,
+     long polarizations,
+     double* uu,
+     double* vv,
+     double* ww,
+     float* vis_real,
+     float* vis_img,
+     float* weight,
+     double dx,
+     double dw,
+     int w_support,
+     int grid_size_x,
+     int grid_size_y,
+     double* grid,
+     int num_threads)
+{
+    long i;
+    long index;
+    long visindex;
+
+    // initialize the convolution kernel
+    // gaussian:
+    int KernelLen = (w_support-1)/2;
+    double std = 1.0;
+    double std22 = 1.0/(2.0*std*std);
+    double norm = std22/PI;
+
+    // Loop over visibilities.
+// Switch between CUDA and GPU versions
+#ifdef __CUDACC__
+    // Define the CUDA set up
+    int Nth = NTHREADS;
+    long Nbl = (long)(num_points/Nth) + 1;
+    if(NWORKERS == 1) {Nbl = 1; Nth = 1;};
+    long Nvis = num_points*freq_per_chan*polarizations;
+    printf("Running on GPU with %d threads and %d blocks\n",Nth,Nbl);
+
+    // Create GPU arrays and offload them
+    double * uu_g;
+    double * vv_g;
+    double * ww_g;
+    float * vis_real_g;
+    float * vis_img_g;
+    float * weight_g;
+    double * grid_g;
+
+    //for (int i=0; i<100000; i++)grid[i]=23.0;
+    cudaError_t mmm;
+    mmm=cudaMalloc(&uu_g,num_points*sizeof(double));
+    mmm=cudaMalloc(&vv_g,num_points*sizeof(double));
+    mmm=cudaMalloc(&ww_g,num_points*sizeof(double));
+    mmm=cudaMalloc(&vis_real_g,Nvis*sizeof(float));
+    mmm=cudaMalloc(&vis_img_g,Nvis*sizeof(float));
+    mmm=cudaMalloc(&weight_g,(Nvis/freq_per_chan)*sizeof(float));
+    mmm=cudaMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
+    mmm=cudaMemset(grid_g,0.0,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
+
+    mmm=cudaMemcpy(uu_g, uu, num_points*sizeof(double), cudaMemcpyHostToDevice);
+    mmm=cudaMemcpy(vv_g, vv, num_points*sizeof(double), cudaMemcpyHostToDevice);
+    mmm=cudaMemcpy(ww_g, ww, num_points*sizeof(double), cudaMemcpyHostToDevice);
+    mmm=cudaMemcpy(vis_real_g, vis_real, Nvis*sizeof(float), cudaMemcpyHostToDevice);
+    mmm=cudaMemcpy(vis_img_g, vis_img, Nvis*sizeof(float), cudaMemcpyHostToDevice);
+    mmm=cudaMemcpy(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), cudaMemcpyHostToDevice);
+
+    // Call main GPU Kernel
+    convolve_g <<<Nbl,Nth>>> (
+	       num_w_planes,
+               num_points,
+               freq_per_chan,
+               polarizations,
+               uu_g,
+               vv_g,
+               ww_g,
+               vis_real_g,
+               vis_img_g,
+               weight_g,
+               dx,
+               dw,
+               KernelLen,
+               grid_size_x,
+               grid_size_y,
+               grid_g,
+	       std22);
+
+    mmm = cudaMemcpy(grid, grid_g, 2*num_w_planes*grid_size_x*grid_size_y*sizeof(double), cudaMemcpyDeviceToHost);
+    //for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
+    printf("CUDA ERROR %s\n",cudaGetErrorString(mmm));
+    mmm=cudaFree(uu_g);
+    mmm=cudaFree(vv_g);
+    mmm=cudaFree(ww_g);
+    mmm=cudaFree(vis_real_g);
+    mmm=cudaFree(vis_img_g);
+    mmm=cudaFree(weight_g);
+    mmm=cudaFree(grid_g);
+
+// Switch between CUDA and GPU versions
+# else
+
+#ifdef _OPENMP
+    omp_set_num_threads(num_threads);
+#endif
+
+#ifdef ACCOMP
+    long Nvis = num_points*freq_per_chan*polarizations;
+  //  #pragma omp target data map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan])
+  //  #pragma omp target teams distribute parallel for  map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom: grid[0:2*num_w_planes*grid_size_x*grid_size_y])
+#else
+    #pragma omp parallel for private(visindex)
+#endif
+    for (i = 0; i < num_points; i++)
+    {
+#ifdef _OPENMP
+	//int tid;
+	//tid = omp_get_thread_num();
+	//printf("%d\n",tid);
+#endif
+
+        visindex = i*freq_per_chan*polarizations;
+
+        double sum = 0.0;
+        int j, k;
+	//if (i%1000 == 0)printf("%ld\n",i);
+
+        /* Convert UV coordinates to grid coordinates. */
+        double pos_u = uu[i] / dx;
+        double pos_v = vv[i] / dx;
+        double ww_i  = ww[i] / dw;
+        int grid_w = (int)ww_i;
+        int grid_u = (int)pos_u;
+        int grid_v = (int)pos_v;
+
+	// check the boundaries
+	long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
+	long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
+	long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
+	long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
+        //printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax);
+
+
+        // Convolve this point onto the grid.
+        for (k = kmin; k <= kmax; k++)
+        {
+
+            double v_dist = (double)k+0.5 - pos_v;
+
+            for (j = jmin; j <= jmax; j++)
+            {
+                double u_dist = (double)j+0.5 - pos_u;
+		long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
+
+
+		double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
+		// Loops over frequencies and polarizations
+		double add_term_real = 0.0;
+		double add_term_img = 0.0;
+		long ifine = visindex;
+                // FOR THE MOMENT ASSUME ONE FREQUENCY PER CHANNEL AND ONE POLARISAZION
+                // FIRST TERM IS CALCULATED, REMAINING TERMS ARE SKIPPED
+                // COMMENTED DOUBLE LOOP BELOW IS THE ORIGINAL GRIDDING ALGORITHM (FOR REFERENCE)
+                vis_real[ifine] += grid[iKer]*conv_weight;
+                vis_img[ifine] += grid[iKer]*conv_weight;
+
+                /*
+		// DAV: the following two loops are performend by each thread separately: no problems of race conditions
+		for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
+		{
+		   long iweight = visindex/freq_per_chan;
+	           for (long ipol=0; ipol<polarizations; ipol++)
+	           {
+                      if (!isnan(vis_real[ifine]))
+                      {
+		         //printf("%f %ld\n",weight[iweight],iweight);
+                         add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
+		         add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
+			 //if(vis_img[ifine]>1e10 || vis_img[ifine]<-1e10)printf("%f %f %f %f %ld %ld\n",vis_real[ifine],vis_img[ifine],weight[iweight],conv_weight,ifine,num_points*freq_per_chan*polarizations);
+		      }
+		      ifine++;
+		      iweight++;
+		   }
+	        }
+		// DAV: this is the critical call in terms of correctness of the results and of performance
+		#pragma omp atomic
+		grid[iKer] += add_term_real;
+		#pragma omp atomic
+		grid[iKer+1] += add_term_img;
+		*/
+            }
+        }
+
+    }
+// End switch between CUDA and CPU versions
+#endif
+    //for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
+}
diff --git a/degrid.cu b/degrid.cu
new file mode 100644
index 0000000..87dca3d
--- /dev/null
+++ b/degrid.cu
@@ -0,0 +1,314 @@
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+#include "convkernels.h"
+#include "degrid.h"
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#ifdef ACCOMP
+#pragma omp end declare target
+#endif
+
+#ifdef __CUDACC__
+//double __device__ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
+//{
+//     double conv_weight;
+//     conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
+//     return conv_weight;
+//}
+
+__global__ void convolve_g(
+     int num_w_planes,
+     long num_points,
+     long freq_per_chan,
+     long polarizations,
+     double* uu,
+     double* vv,
+     double* ww,
+     float* vis_real,
+     float* vis_img,
+     float* weight,
+     double dx,
+     double dw,
+     int KernelLen,
+     int grid_size_x,
+     int grid_size_y,
+     double* grid,
+     double std22)
+
+{
+        long gid = blockIdx.x*blockDim.x + threadIdx.x;
+	if(gid < num_points)
+	{
+	long i = gid;
+        long visindex = i*freq_per_chan*polarizations;
+	double norm = std22/PI;
+
+        int j, k;
+
+        /* Convert UV coordinates to grid coordinates. */
+        double pos_u = uu[i] / dx;
+        double pos_v = vv[i] / dx;
+        double ww_i  = ww[i] / dw;
+        int grid_w = (int)ww_i;
+        int grid_u = (int)pos_u;
+        int grid_v = (int)pos_v;
+
+        // check the boundaries
+        long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
+        long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
+        long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
+        long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
+        //printf("%ld, %ld, %ld, %ld\n",jmin,jmax,kmin,kmax);
+
+
+        // Convolve this point onto the grid.
+        for (k = kmin; k <= kmax; k++)
+        {
+
+            double v_dist = (double)k+0.5 - pos_v;
+
+            for (j = jmin; j <= jmax; j++)
+            {
+                double u_dist = (double)j+0.5 - pos_u;
+                long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
+		//printf("--> %ld %d %d %d\n",iKer,j,k,grid_w);
+
+                double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
+                // Loops over frequencies and polarizations
+                double add_term_real = 0.0;
+                double add_term_img = 0.0;
+                long ifine = visindex;
+		// FOR THE MOMENT ASSUME ONE FREQUENCY PER CHANNEL AND ONE POLARISAZION
+		// FIRST TERM IS CALCULATED, REMAINING TERMS ARE SKIPPED
+	        // COMMENTED DOUBLE LOOP BELOW IS THE ORIGINAL GRIDDING ALGORITHM (FOR REFERENCE)
+                vis_real[ifine] += grid[iKer]*conv_weight;
+                vis_img[ifine] += grid[iKer]*conv_weight;
+
+		/*
+                for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
+		{
+		   long iweight = visindex/freq_per_chan;
+                   for (long ipol=0; ipol<polarizations; ipol++)
+                   {
+                      double vistest = (double)vis_real[ifine];
+                      if (!isnan(vistest))
+                      {
+                         add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
+                         add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
+		      }
+                      ifine++;
+		      iweight++;
+                   }
+		}
+		atomicAdd(&(grid[iKer]),add_term_real);
+		atomicAdd(&(grid[iKer+1]),add_term_img);
+                */
+
+            }
+        }
+	}
+}
+#endif
+#ifdef ACCOMP
+#pragma  omp declare target
+#endif
+void degrid(
+     int num_w_planes,
+     long num_points,
+     long freq_per_chan,
+     long polarizations,
+     double* uu,
+     double* vv,
+     double* ww,
+     float* vis_real,
+     float* vis_img,
+     float* weight,
+     double dx,
+     double dw,
+     int w_support,
+     int grid_size_x,
+     int grid_size_y,
+     double* grid,
+     int num_threads)
+{
+    long i;
+    long index;
+    long visindex;
+
+    // initialize the convolution kernel
+    // gaussian:
+    int KernelLen = (w_support-1)/2;
+    double std = 1.0;
+    double std22 = 1.0/(2.0*std*std);
+    double norm = std22/PI;
+
+    // Loop over visibilities.
+// Switch between CUDA and GPU versions
+#ifdef __CUDACC__
+    // Define the CUDA set up
+    int Nth = NTHREADS;
+    long Nbl = (long)(num_points/Nth) + 1;
+    if(NWORKERS == 1) {Nbl = 1; Nth = 1;};
+    long Nvis = num_points*freq_per_chan*polarizations;
+    printf("Running on GPU with %d threads and %d blocks\n",Nth,Nbl);
+
+    // Create GPU arrays and offload them
+    double * uu_g;
+    double * vv_g;
+    double * ww_g;
+    float * vis_real_g;
+    float * vis_img_g;
+    float * weight_g;
+    double * grid_g;
+
+    //for (int i=0; i<100000; i++)grid[i]=23.0;
+    cudaError_t mmm;
+    mmm=cudaMalloc(&uu_g,num_points*sizeof(double));
+    mmm=cudaMalloc(&vv_g,num_points*sizeof(double));
+    mmm=cudaMalloc(&ww_g,num_points*sizeof(double));
+    mmm=cudaMalloc(&vis_real_g,Nvis*sizeof(float));
+    mmm=cudaMalloc(&vis_img_g,Nvis*sizeof(float));
+    mmm=cudaMalloc(&weight_g,(Nvis/freq_per_chan)*sizeof(float));
+    mmm=cudaMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
+    mmm=cudaMemset(grid_g,0.0,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
+
+    mmm=cudaMemcpy(uu_g, uu, num_points*sizeof(double), cudaMemcpyHostToDevice);
+    mmm=cudaMemcpy(vv_g, vv, num_points*sizeof(double), cudaMemcpyHostToDevice);
+    mmm=cudaMemcpy(ww_g, ww, num_points*sizeof(double), cudaMemcpyHostToDevice);
+    mmm=cudaMemcpy(vis_real_g, vis_real, Nvis*sizeof(float), cudaMemcpyHostToDevice);
+    mmm=cudaMemcpy(vis_img_g, vis_img, Nvis*sizeof(float), cudaMemcpyHostToDevice);
+    mmm=cudaMemcpy(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), cudaMemcpyHostToDevice);
+
+    // Call main GPU Kernel
+    convolve_g <<<Nbl,Nth>>> (
+	       num_w_planes,
+               num_points,
+               freq_per_chan,
+               polarizations,
+               uu_g,
+               vv_g,
+               ww_g,
+               vis_real_g,
+               vis_img_g,
+               weight_g,
+               dx,
+               dw,
+               KernelLen,
+               grid_size_x,
+               grid_size_y,
+               grid_g,
+	       std22);
+
+    mmm = cudaMemcpy(grid, grid_g, 2*num_w_planes*grid_size_x*grid_size_y*sizeof(double), cudaMemcpyDeviceToHost);
+    //for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
+    printf("CUDA ERROR %s\n",cudaGetErrorString(mmm));
+    mmm=cudaFree(uu_g);
+    mmm=cudaFree(vv_g);
+    mmm=cudaFree(ww_g);
+    mmm=cudaFree(vis_real_g);
+    mmm=cudaFree(vis_img_g);
+    mmm=cudaFree(weight_g);
+    mmm=cudaFree(grid_g);
+
+// Switch between CUDA and GPU versions
+# else
+
+#ifdef _OPENMP
+    omp_set_num_threads(num_threads);
+#endif
+
+#ifdef ACCOMP
+    long Nvis = num_points*freq_per_chan*polarizations;
+  //  #pragma omp target data map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan])
+  //  #pragma omp target teams distribute parallel for  map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom: grid[0:2*num_w_planes*grid_size_x*grid_size_y])
+#else
+    #pragma omp parallel for private(visindex)
+#endif
+    for (i = 0; i < num_points; i++)
+    {
+#ifdef _OPENMP
+	//int tid;
+	//tid = omp_get_thread_num();
+	//printf("%d\n",tid);
+#endif
+
+        visindex = i*freq_per_chan*polarizations;
+
+        double sum = 0.0;
+        int j, k;
+	//if (i%1000 == 0)printf("%ld\n",i);
+
+        /* Convert UV coordinates to grid coordinates. */
+        double pos_u = uu[i] / dx;
+        double pos_v = vv[i] / dx;
+        double ww_i  = ww[i] / dw;
+        int grid_w = (int)ww_i;
+        int grid_u = (int)pos_u;
+        int grid_v = (int)pos_v;
+
+	// check the boundaries
+	long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
+	long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
+	long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
+	long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
+        //printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax);
+
+
+        // Convolve this point onto the grid.
+        for (k = kmin; k <= kmax; k++)
+        {
+
+            double v_dist = (double)k+0.5 - pos_v;
+
+            for (j = jmin; j <= jmax; j++)
+            {
+                double u_dist = (double)j+0.5 - pos_u;
+		long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
+
+
+		double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
+		// Loops over frequencies and polarizations
+		double add_term_real = 0.0;
+		double add_term_img = 0.0;
+		long ifine = visindex;
+                // FOR THE MOMENT ASSUME ONE FREQUENCY PER CHANNEL AND ONE POLARISAZION
+                // FIRST TERM IS CALCULATED, REMAINING TERMS ARE SKIPPED
+                // COMMENTED DOUBLE LOOP BELOW IS THE ORIGINAL GRIDDING ALGORITHM (FOR REFERENCE)
+                vis_real[ifine] += grid[iKer]*conv_weight;
+                vis_img[ifine] += grid[iKer]*conv_weight;
+
+                /*
+		// DAV: the following two loops are performend by each thread separately: no problems of race conditions
+		for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
+		{
+		   long iweight = visindex/freq_per_chan;
+	           for (long ipol=0; ipol<polarizations; ipol++)
+	           {
+                      if (!isnan(vis_real[ifine]))
+                      {
+		         //printf("%f %ld\n",weight[iweight],iweight);
+                         add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
+		         add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
+			 //if(vis_img[ifine]>1e10 || vis_img[ifine]<-1e10)printf("%f %f %f %f %ld %ld\n",vis_real[ifine],vis_img[ifine],weight[iweight],conv_weight,ifine,num_points*freq_per_chan*polarizations);
+		      }
+		      ifine++;
+		      iweight++;
+		   }
+	        }
+		// DAV: this is the critical call in terms of correctness of the results and of performance
+		#pragma omp atomic
+		grid[iKer] += add_term_real;
+		#pragma omp atomic
+		grid[iKer+1] += add_term_img;
+		*/
+            }
+        }
+
+    }
+// End switch between CUDA and CPU versions
+#endif
+    //for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
+}
diff --git a/degrid.h b/degrid.h
new file mode 100644
index 0000000..ace52e8
--- /dev/null
+++ b/degrid.h
@@ -0,0 +1,31 @@
+#ifndef W_DEGRID_H_
+#define W_DEGRID_H_
+
+#define NWORKERS -1    //100
+#define NTHREADS 32
+#define PI 3.14159265359
+#define REAL_TYPE double
+#ifdef __CUDACC__
+extern "C"
+#endif
+
+void degrid(
+     int,
+     long,
+     long,
+     long,
+     double*,
+     double*,
+     double*,
+     float*,
+     float*,
+     float*,
+     double,
+     double,
+     int,
+     int,
+     int,
+     double*,
+     int);
+
+#endif
diff --git a/inverse-imaging.c b/inverse-imaging.c
index d3e154d..0cadb53 100644
--- a/inverse-imaging.c
+++ b/inverse-imaging.c
@@ -17,7 +17,7 @@
 #ifdef ACCOMP
 #include "w-stacking_omp.h"
 #else
-#include "w-stacking.h"
+#include "degrid.h"
 #endif
 #define PI 3.14159265359
 #define NUM_OF_SECTORS -1
@@ -377,12 +377,12 @@ if(rank == 0){
 	long * inc = (long *) malloc(sizeof(long)*naxis);
 	int anynul;
 
-        fpixel[0] = 1;
-	lpixel[0] = xaxis;
-        inc[0] = 1;
-	fpixel[1] = rank*yaxis+1;
-	lpixel[1] = (rank+1)*yaxis+1;
+        fpixel[1] = 1;
+	lpixel[1] = xaxis;
         inc[1] = 1;
+	fpixel[0] = rank*yaxis+1;
+	lpixel[0] = (rank+1)*yaxis;
+        inc[0] = 1;
 	fits_read_subset(fptr, TDOUBLE, fpixel, lpixel, inc, NULL, image_real, &anynul, &status);
 	//fits_read_img(fptr, TDOUBLE, 1, xaxis*yaxis, NULL, image_real, &anynul, &status);
 	fits_close_file(fptr, &status);
@@ -407,13 +407,15 @@ if(rank == 0){
          
         fftw_plan plan;
         ptrdiff_t alloc_local, local_n0, local_0_start;
-	// FFTW double input variable whose size is = local_n0 x 2(Nx/2+1)
+	// FFTW double input variable whose size is = Ny x 2(Nx/2+1)
 	double *fftwimage;
-	// FFTW complex output variable whose size is = local_n0 x Nx/2 + 1
+	// FFTW complex output variable whose size is = Ny x (Nx/2+1)
 	fftw_complex *fftwgrid;
 
         alloc_local = fftw_mpi_local_size_2d(grid_size_y, grid_size_x, MPI_COMM_WORLD, &local_n0, &local_0_start);
         fftwimage = fftw_alloc_real(2 * alloc_local);
+	// this is equivalent to:
+        //fftwimage = (double*)calloc(yaxis*(2*(xaxis/2+1)),sizeof(double));
         fftwgrid = fftw_alloc_complex(alloc_local);
 
 	// Create plan for r2c DFT
@@ -423,14 +425,16 @@ if(rank == 0){
         // Initialize FFTW arrays
         long fftwindex = 0;
         long fftwindex2D = 0;
-	for (long iv = 0; iv < local_n0; iv++)
-	for (long iu = 0; iu < xaxis; iu++)
+        //for (long iv = 0; iv < yaxis; iv++)
+        for (long iv = 0; iv < local_n0; iv++)
+        for (long iu = 0; iu < xaxis; iu++)
         {
-            fftwindex2D = iu + iv*xaxis;
-	    fftwimage[fftwindex2D] = image_real[fftwindex2D];
-	    fftwgrid[fftwindex2D][0] = 0.0;
-	    fftwgrid[fftwindex2D][1] = 0.0;
-	}
+            fftwindex2D = iu + iv*(2*(xaxis/2+1));
+            fftwimage[fftwindex2D] = image_real[fftwindex];
+            //fftwgrid[fftwindex2D][0] = 0.0;
+            //fftwgrid[fftwindex2D][1] = 0.0;
+	    fftwindex++;
+        }
 
         // Perform the FFT
         fftw_execute(plan);
@@ -441,44 +445,31 @@ if(rank == 0){
 	double * grid;
 	long size_of_grid = 2*num_w_planes*xaxis*yaxis;
         grid = (double*) calloc(size_of_grid,sizeof(double));
+	fftwindex = 0;
+	long fftwindex2;
         for (int iw=0; iw<num_w_planes; iw++)
         {
             for (int iv=0; iv<yaxis; iv++)
             {
-               for (int iu=0; iu<xaxis; iu++)
+               for (int iu=0; iu<xaxis/2; iu++)
                {
-                   fftwindex2D = iu + iv*xaxis;
-                   fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis);
-                   grid[fftwindex] = fftwgrid[fftwindex2D][0];
-                   grid[fftwindex+1] = fftwgrid[fftwindex2D][1];
+                   fftwindex2D = iu + iv*(xaxis/2+1);
+		   fftwindex2 = 2*(fftwindex);
+                   //fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis);
+                   grid[fftwindex2] = fftwgrid[fftwindex2D][0];
+                   grid[fftwindex2+1] = fftwgrid[fftwindex2D][1];
                    #ifdef WRITE_DATA
-		   grid_real[fftwindex2D] = fftwgrid[fftwindex2D][0];
-		   grid_img[fftwindex2D] = fftwgrid[fftwindex2D][1];
+		   grid_real[fftwindex] = fftwgrid[fftwindex2D][0];
+		   grid_img[fftwindex] = fftwgrid[fftwindex2D][1];
                    #endif
+		   fftwindex++;
                }
             }
         }
 	fftw_free(fftwgrid);
-	fftw_free(fftwimage);
+	free(fftwimage);
 
 
-/*	
-	// This may be moved inside the WRITE_DATA section: TO BE CHECKED
-        // Create sector grid
-        double * gridss;
-        double * gridss_w;
-        double * gridss_real;
-        double * gridss_img;
-        size_of_grid = 2*num_w_planes*xaxis*yaxis;
-        gridss = (double*) calloc(size_of_grid,sizeof(double));
-        gridss_w = (double*) calloc(size_of_grid,sizeof(double));
-        gridss_real = (double*) calloc(size_of_grid/2,sizeof(double));
-        gridss_img = (double*) calloc(size_of_grid/2,sizeof(double));
-        // Create temporary global grid
-#ifndef USE_MPI
-        double * gridtot = (double*) calloc(2*grid_size_x*grid_size_y*num_w_planes,sizeof(double));
-#endif
-*/
 
         // Open the MPI Memory Window for the slab
 #ifdef USE_MPI
@@ -528,7 +519,7 @@ if(rank == 0){
         #endif
 
 #endif //WRITE_DATA
-	exit(3);
+	//exit(3);
 
         if(rank == 0)printf("CREATING LINKED LISTS\n");
 
@@ -669,11 +660,12 @@ if(rank == 0){
 	compose_time1 += (finishk.tv_sec - begink.tv_sec);
 	compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
 
-    #ifndef USE_MPI
+        //#ifndef USE_MPI
 	double uumin = 1e20;
 	double vvmin = 1e20;
 	double uumax = -1e20;
 	double vvmax = -1e20;
+	pFile = fopen (outfile1,"w");
 
         for (long ipart=0; ipart<Nsec; ipart++)
         {
@@ -687,20 +679,38 @@ if(rank == 0){
         }
 
 	printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax);
-        #endif
+	fclose(pFile);
+	printf("============> 3\n");
+        //#endif
 
 
-#ifdef STOPHERE
 
 
-        // Make convolution on the grid
+        // Make degriddig on the visibilities
         #ifdef VERBOSE
 	printf("Processing sector %ld\n",isector);
 	#endif
         clock_gettime(CLOCK_MONOTONIC, &begink);
         startk = clock();
+/*	
+        // Create sector grid
+        double * gridss_w;
+        double * gridss_real;
+        double * gridss_img;
+        size_of_grid = 2*num_w_planes*xaxis*yaxis;
+        gridss_w = (double*) calloc(size_of_grid,sizeof(double));
+        gridss_real = (double*) calloc(size_of_grid/2,sizeof(double));
+        gridss_img = (double*) calloc(size_of_grid/2,sizeof(double));
+        // Create temporary global grid
+#ifndef USE_MPI
+        double * gridtot = (double*) calloc(2*grid_size_x*grid_size_y*num_w_planes,sizeof(double));
+#endif
+*/
 
-        wstack(num_w_planes,
+        double * gridss;
+        gridss = (double*) calloc(size_of_grid,sizeof(double));
+
+        degrid(num_w_planes,
                Nsec,
                freq_per_chan,
                polarisations,
@@ -718,6 +728,7 @@ if(rank == 0){
                gridss,
                num_threads);
 
+#ifdef STOPHERE
 /*
 int z =0 ;
 #pragma omp target map(to:test_i_gpu) map(from:z)
diff --git a/w-stacking.cu b/w-stacking.cu
index 4fe3625..5845cd9 100644
--- a/w-stacking.cu
+++ b/w-stacking.cu
@@ -1,25 +1,12 @@
 #ifdef _OPENMP
 #include <omp.h>
 #endif
+#include "convkernels.h"
 #include "w-stacking.h"
 #include <math.h>
 #include <stdlib.h>
 #include <stdio.h>
 
-#ifdef ACCOMP
-#pragma omp  declare target
-#endif
-#ifdef __CUDACC__
-double __device__
-#else
-double
-#endif
-gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
-{
-     double conv_weight;
-     conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
-     return conv_weight;
-}
 #ifdef ACCOMP
 #pragma omp end declare target
 #endif
-- 
GitLab