From 15e3f465b28ceecf0961ba937db63ad556278882 Mon Sep 17 00:00:00 2001 From: Claudio Gheller <cgheller@login01.m100.cineca.it> Date: Mon, 2 Jan 2023 10:24:26 +0100 Subject: [PATCH] new degrid function introduced --- .gitignore | 1 + Makefile | 13 +- convkernels.c | 20 +++ convkernels.cu | 20 +++ convkernels.h | 13 ++ degrid.c | 314 ++++++++++++++++++++++++++++++++++++++++++++++ degrid.cu | 314 ++++++++++++++++++++++++++++++++++++++++++++++ degrid.h | 31 +++++ inverse-imaging.c | 103 ++++++++------- w-stacking.cu | 15 +-- 10 files changed, 781 insertions(+), 63 deletions(-) create mode 100644 convkernels.c create mode 100644 convkernels.cu create mode 100644 convkernels.h create mode 100644 degrid.c create mode 100644 degrid.cu create mode 100644 degrid.h diff --git a/.gitignore b/.gitignore index 8edc32e..c1ad20c 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ w-stackingCfftw_serial w-stackingfftw_serial inverse-imaging .* +*.tar diff --git a/Makefile b/Makefile index 93ab885..5477aaf 100644 --- a/Makefile +++ b/Makefile @@ -42,12 +42,18 @@ ifeq (FITSIO,$(findstring FITSIO,$(OPT))) endif -DEPS = w-stacking.h w-stacking-fftw.c w-stacking.cu phase_correction.cu -COBJ = w-stacking.o w-stacking-fftw.o phase_correction.o +DEPS = w-stacking.h w-stacking-fftw.c w-stacking.cu phase_correction.cu degrid.cu convkernels.cu +COBJ = w-stacking.o w-stacking-fftw.o phase_correction.o degrid.o convkernels.o + +convkernels.c: convkernels.cu + cp convkernels.cu convkernels.c w-stacking.c: w-stacking.cu cp w-stacking.cu w-stacking.c +degrid.c: degrid.cu + cp degrid.cu degrid.c + phase_correction.c: phase_correction.cu cp phase_correction.cu phase_correction.c @@ -78,7 +84,7 @@ serial_cuda: mpi: $(COBJ) $(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw $^ $(CFLAGS) $(LIBS) - $(MPICC) $(OPTIMIZE) $(OPT) -o inverse-imaging inverse-imaging.c w-stacking.c $(CFLAGS) $(LIBS) + $(MPICC) $(OPTIMIZE) $(OPT) -o inverse-imaging inverse-imaging.c degrid.c convkernels.c $(CFLAGS) $(LIBS) mpi_cuda: $(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB) @@ -89,3 +95,4 @@ clean: rm *.o rm w-stacking.c rm phase_correction.c + rm degrid.c diff --git a/convkernels.c b/convkernels.c new file mode 100644 index 0000000..e06872e --- /dev/null +++ b/convkernels.c @@ -0,0 +1,20 @@ +#include "convkernels.h" + +#include <math.h> +#include <stdlib.h> +#include <stdio.h> + +#ifdef ACCOMP +#pragma omp declare target +#endif +#ifdef __CUDACC__ +double __device__ +#else +double +#endif +gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist) +{ + double conv_weight; + conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22); + return conv_weight; +} diff --git a/convkernels.cu b/convkernels.cu new file mode 100644 index 0000000..e06872e --- /dev/null +++ b/convkernels.cu @@ -0,0 +1,20 @@ +#include "convkernels.h" + +#include <math.h> +#include <stdlib.h> +#include <stdio.h> + +#ifdef ACCOMP +#pragma omp declare target +#endif +#ifdef __CUDACC__ +double __device__ +#else +double +#endif +gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist) +{ + double conv_weight; + conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22); + return conv_weight; +} diff --git a/convkernels.h b/convkernels.h new file mode 100644 index 0000000..8194467 --- /dev/null +++ b/convkernels.h @@ -0,0 +1,13 @@ +#ifndef W_KERNELS +#define W_KERNELS + +#ifdef ACCOMP +#pragma omp declare target +#endif +#ifdef __CUDACC__ +double __device__ +#else +double +#endif +gauss_kernel_norm(double, double, double, double); +#endif diff --git a/degrid.c b/degrid.c new file mode 100644 index 0000000..87dca3d --- /dev/null +++ b/degrid.c @@ -0,0 +1,314 @@ +#ifdef _OPENMP +#include <omp.h> +#endif +#include "convkernels.h" +#include "degrid.h" +#include <math.h> +#include <stdlib.h> +#include <stdio.h> + +#ifdef ACCOMP +#pragma omp end declare target +#endif + +#ifdef __CUDACC__ +//double __device__ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist) +//{ +// double conv_weight; +// conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22); +// return conv_weight; +//} + +__global__ void convolve_g( + int num_w_planes, + long num_points, + long freq_per_chan, + long polarizations, + double* uu, + double* vv, + double* ww, + float* vis_real, + float* vis_img, + float* weight, + double dx, + double dw, + int KernelLen, + int grid_size_x, + int grid_size_y, + double* grid, + double std22) + +{ + long gid = blockIdx.x*blockDim.x + threadIdx.x; + if(gid < num_points) + { + long i = gid; + long visindex = i*freq_per_chan*polarizations; + double norm = std22/PI; + + int j, k; + + /* Convert UV coordinates to grid coordinates. */ + double pos_u = uu[i] / dx; + double pos_v = vv[i] / dx; + double ww_i = ww[i] / dw; + int grid_w = (int)ww_i; + int grid_u = (int)pos_u; + int grid_v = (int)pos_v; + + // check the boundaries + long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; + long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; + long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; + long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1; + //printf("%ld, %ld, %ld, %ld\n",jmin,jmax,kmin,kmax); + + + // Convolve this point onto the grid. + for (k = kmin; k <= kmax; k++) + { + + double v_dist = (double)k+0.5 - pos_v; + + for (j = jmin; j <= jmax; j++) + { + double u_dist = (double)j+0.5 - pos_u; + long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); + //printf("--> %ld %d %d %d\n",iKer,j,k,grid_w); + + double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist); + // Loops over frequencies and polarizations + double add_term_real = 0.0; + double add_term_img = 0.0; + long ifine = visindex; + // FOR THE MOMENT ASSUME ONE FREQUENCY PER CHANNEL AND ONE POLARISAZION + // FIRST TERM IS CALCULATED, REMAINING TERMS ARE SKIPPED + // COMMENTED DOUBLE LOOP BELOW IS THE ORIGINAL GRIDDING ALGORITHM (FOR REFERENCE) + vis_real[ifine] += grid[iKer]*conv_weight; + vis_img[ifine] += grid[iKer]*conv_weight; + + /* + for (long ifreq=0; ifreq<freq_per_chan; ifreq++) + { + long iweight = visindex/freq_per_chan; + for (long ipol=0; ipol<polarizations; ipol++) + { + double vistest = (double)vis_real[ifine]; + if (!isnan(vistest)) + { + add_term_real += weight[iweight] * vis_real[ifine] * conv_weight; + add_term_img += weight[iweight] * vis_img[ifine] * conv_weight; + } + ifine++; + iweight++; + } + } + atomicAdd(&(grid[iKer]),add_term_real); + atomicAdd(&(grid[iKer+1]),add_term_img); + */ + + } + } + } +} +#endif +#ifdef ACCOMP +#pragma omp declare target +#endif +void degrid( + int num_w_planes, + long num_points, + long freq_per_chan, + long polarizations, + double* uu, + double* vv, + double* ww, + float* vis_real, + float* vis_img, + float* weight, + double dx, + double dw, + int w_support, + int grid_size_x, + int grid_size_y, + double* grid, + int num_threads) +{ + long i; + long index; + long visindex; + + // initialize the convolution kernel + // gaussian: + int KernelLen = (w_support-1)/2; + double std = 1.0; + double std22 = 1.0/(2.0*std*std); + double norm = std22/PI; + + // Loop over visibilities. +// Switch between CUDA and GPU versions +#ifdef __CUDACC__ + // Define the CUDA set up + int Nth = NTHREADS; + long Nbl = (long)(num_points/Nth) + 1; + if(NWORKERS == 1) {Nbl = 1; Nth = 1;}; + long Nvis = num_points*freq_per_chan*polarizations; + printf("Running on GPU with %d threads and %d blocks\n",Nth,Nbl); + + // Create GPU arrays and offload them + double * uu_g; + double * vv_g; + double * ww_g; + float * vis_real_g; + float * vis_img_g; + float * weight_g; + double * grid_g; + + //for (int i=0; i<100000; i++)grid[i]=23.0; + cudaError_t mmm; + mmm=cudaMalloc(&uu_g,num_points*sizeof(double)); + mmm=cudaMalloc(&vv_g,num_points*sizeof(double)); + mmm=cudaMalloc(&ww_g,num_points*sizeof(double)); + mmm=cudaMalloc(&vis_real_g,Nvis*sizeof(float)); + mmm=cudaMalloc(&vis_img_g,Nvis*sizeof(float)); + mmm=cudaMalloc(&weight_g,(Nvis/freq_per_chan)*sizeof(float)); + mmm=cudaMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double)); + mmm=cudaMemset(grid_g,0.0,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double)); + + mmm=cudaMemcpy(uu_g, uu, num_points*sizeof(double), cudaMemcpyHostToDevice); + mmm=cudaMemcpy(vv_g, vv, num_points*sizeof(double), cudaMemcpyHostToDevice); + mmm=cudaMemcpy(ww_g, ww, num_points*sizeof(double), cudaMemcpyHostToDevice); + mmm=cudaMemcpy(vis_real_g, vis_real, Nvis*sizeof(float), cudaMemcpyHostToDevice); + mmm=cudaMemcpy(vis_img_g, vis_img, Nvis*sizeof(float), cudaMemcpyHostToDevice); + mmm=cudaMemcpy(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), cudaMemcpyHostToDevice); + + // Call main GPU Kernel + convolve_g <<<Nbl,Nth>>> ( + num_w_planes, + num_points, + freq_per_chan, + polarizations, + uu_g, + vv_g, + ww_g, + vis_real_g, + vis_img_g, + weight_g, + dx, + dw, + KernelLen, + grid_size_x, + grid_size_y, + grid_g, + std22); + + mmm = cudaMemcpy(grid, grid_g, 2*num_w_planes*grid_size_x*grid_size_y*sizeof(double), cudaMemcpyDeviceToHost); + //for (int i=0; i<100000; i++)printf("%f\n",grid[i]); + printf("CUDA ERROR %s\n",cudaGetErrorString(mmm)); + mmm=cudaFree(uu_g); + mmm=cudaFree(vv_g); + mmm=cudaFree(ww_g); + mmm=cudaFree(vis_real_g); + mmm=cudaFree(vis_img_g); + mmm=cudaFree(weight_g); + mmm=cudaFree(grid_g); + +// Switch between CUDA and GPU versions +# else + +#ifdef _OPENMP + omp_set_num_threads(num_threads); +#endif + +#ifdef ACCOMP + long Nvis = num_points*freq_per_chan*polarizations; + // #pragma omp target data map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) + // #pragma omp target teams distribute parallel for map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom: grid[0:2*num_w_planes*grid_size_x*grid_size_y]) +#else + #pragma omp parallel for private(visindex) +#endif + for (i = 0; i < num_points; i++) + { +#ifdef _OPENMP + //int tid; + //tid = omp_get_thread_num(); + //printf("%d\n",tid); +#endif + + visindex = i*freq_per_chan*polarizations; + + double sum = 0.0; + int j, k; + //if (i%1000 == 0)printf("%ld\n",i); + + /* Convert UV coordinates to grid coordinates. */ + double pos_u = uu[i] / dx; + double pos_v = vv[i] / dx; + double ww_i = ww[i] / dw; + int grid_w = (int)ww_i; + int grid_u = (int)pos_u; + int grid_v = (int)pos_v; + + // check the boundaries + long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; + long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; + long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; + long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1; + //printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax); + + + // Convolve this point onto the grid. + for (k = kmin; k <= kmax; k++) + { + + double v_dist = (double)k+0.5 - pos_v; + + for (j = jmin; j <= jmax; j++) + { + double u_dist = (double)j+0.5 - pos_u; + long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); + + + double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist); + // Loops over frequencies and polarizations + double add_term_real = 0.0; + double add_term_img = 0.0; + long ifine = visindex; + // FOR THE MOMENT ASSUME ONE FREQUENCY PER CHANNEL AND ONE POLARISAZION + // FIRST TERM IS CALCULATED, REMAINING TERMS ARE SKIPPED + // COMMENTED DOUBLE LOOP BELOW IS THE ORIGINAL GRIDDING ALGORITHM (FOR REFERENCE) + vis_real[ifine] += grid[iKer]*conv_weight; + vis_img[ifine] += grid[iKer]*conv_weight; + + /* + // DAV: the following two loops are performend by each thread separately: no problems of race conditions + for (long ifreq=0; ifreq<freq_per_chan; ifreq++) + { + long iweight = visindex/freq_per_chan; + for (long ipol=0; ipol<polarizations; ipol++) + { + if (!isnan(vis_real[ifine])) + { + //printf("%f %ld\n",weight[iweight],iweight); + add_term_real += weight[iweight] * vis_real[ifine] * conv_weight; + add_term_img += weight[iweight] * vis_img[ifine] * conv_weight; + //if(vis_img[ifine]>1e10 || vis_img[ifine]<-1e10)printf("%f %f %f %f %ld %ld\n",vis_real[ifine],vis_img[ifine],weight[iweight],conv_weight,ifine,num_points*freq_per_chan*polarizations); + } + ifine++; + iweight++; + } + } + // DAV: this is the critical call in terms of correctness of the results and of performance + #pragma omp atomic + grid[iKer] += add_term_real; + #pragma omp atomic + grid[iKer+1] += add_term_img; + */ + } + } + + } +// End switch between CUDA and CPU versions +#endif + //for (int i=0; i<100000; i++)printf("%f\n",grid[i]); +} diff --git a/degrid.cu b/degrid.cu new file mode 100644 index 0000000..87dca3d --- /dev/null +++ b/degrid.cu @@ -0,0 +1,314 @@ +#ifdef _OPENMP +#include <omp.h> +#endif +#include "convkernels.h" +#include "degrid.h" +#include <math.h> +#include <stdlib.h> +#include <stdio.h> + +#ifdef ACCOMP +#pragma omp end declare target +#endif + +#ifdef __CUDACC__ +//double __device__ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist) +//{ +// double conv_weight; +// conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22); +// return conv_weight; +//} + +__global__ void convolve_g( + int num_w_planes, + long num_points, + long freq_per_chan, + long polarizations, + double* uu, + double* vv, + double* ww, + float* vis_real, + float* vis_img, + float* weight, + double dx, + double dw, + int KernelLen, + int grid_size_x, + int grid_size_y, + double* grid, + double std22) + +{ + long gid = blockIdx.x*blockDim.x + threadIdx.x; + if(gid < num_points) + { + long i = gid; + long visindex = i*freq_per_chan*polarizations; + double norm = std22/PI; + + int j, k; + + /* Convert UV coordinates to grid coordinates. */ + double pos_u = uu[i] / dx; + double pos_v = vv[i] / dx; + double ww_i = ww[i] / dw; + int grid_w = (int)ww_i; + int grid_u = (int)pos_u; + int grid_v = (int)pos_v; + + // check the boundaries + long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; + long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; + long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; + long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1; + //printf("%ld, %ld, %ld, %ld\n",jmin,jmax,kmin,kmax); + + + // Convolve this point onto the grid. + for (k = kmin; k <= kmax; k++) + { + + double v_dist = (double)k+0.5 - pos_v; + + for (j = jmin; j <= jmax; j++) + { + double u_dist = (double)j+0.5 - pos_u; + long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); + //printf("--> %ld %d %d %d\n",iKer,j,k,grid_w); + + double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist); + // Loops over frequencies and polarizations + double add_term_real = 0.0; + double add_term_img = 0.0; + long ifine = visindex; + // FOR THE MOMENT ASSUME ONE FREQUENCY PER CHANNEL AND ONE POLARISAZION + // FIRST TERM IS CALCULATED, REMAINING TERMS ARE SKIPPED + // COMMENTED DOUBLE LOOP BELOW IS THE ORIGINAL GRIDDING ALGORITHM (FOR REFERENCE) + vis_real[ifine] += grid[iKer]*conv_weight; + vis_img[ifine] += grid[iKer]*conv_weight; + + /* + for (long ifreq=0; ifreq<freq_per_chan; ifreq++) + { + long iweight = visindex/freq_per_chan; + for (long ipol=0; ipol<polarizations; ipol++) + { + double vistest = (double)vis_real[ifine]; + if (!isnan(vistest)) + { + add_term_real += weight[iweight] * vis_real[ifine] * conv_weight; + add_term_img += weight[iweight] * vis_img[ifine] * conv_weight; + } + ifine++; + iweight++; + } + } + atomicAdd(&(grid[iKer]),add_term_real); + atomicAdd(&(grid[iKer+1]),add_term_img); + */ + + } + } + } +} +#endif +#ifdef ACCOMP +#pragma omp declare target +#endif +void degrid( + int num_w_planes, + long num_points, + long freq_per_chan, + long polarizations, + double* uu, + double* vv, + double* ww, + float* vis_real, + float* vis_img, + float* weight, + double dx, + double dw, + int w_support, + int grid_size_x, + int grid_size_y, + double* grid, + int num_threads) +{ + long i; + long index; + long visindex; + + // initialize the convolution kernel + // gaussian: + int KernelLen = (w_support-1)/2; + double std = 1.0; + double std22 = 1.0/(2.0*std*std); + double norm = std22/PI; + + // Loop over visibilities. +// Switch between CUDA and GPU versions +#ifdef __CUDACC__ + // Define the CUDA set up + int Nth = NTHREADS; + long Nbl = (long)(num_points/Nth) + 1; + if(NWORKERS == 1) {Nbl = 1; Nth = 1;}; + long Nvis = num_points*freq_per_chan*polarizations; + printf("Running on GPU with %d threads and %d blocks\n",Nth,Nbl); + + // Create GPU arrays and offload them + double * uu_g; + double * vv_g; + double * ww_g; + float * vis_real_g; + float * vis_img_g; + float * weight_g; + double * grid_g; + + //for (int i=0; i<100000; i++)grid[i]=23.0; + cudaError_t mmm; + mmm=cudaMalloc(&uu_g,num_points*sizeof(double)); + mmm=cudaMalloc(&vv_g,num_points*sizeof(double)); + mmm=cudaMalloc(&ww_g,num_points*sizeof(double)); + mmm=cudaMalloc(&vis_real_g,Nvis*sizeof(float)); + mmm=cudaMalloc(&vis_img_g,Nvis*sizeof(float)); + mmm=cudaMalloc(&weight_g,(Nvis/freq_per_chan)*sizeof(float)); + mmm=cudaMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double)); + mmm=cudaMemset(grid_g,0.0,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double)); + + mmm=cudaMemcpy(uu_g, uu, num_points*sizeof(double), cudaMemcpyHostToDevice); + mmm=cudaMemcpy(vv_g, vv, num_points*sizeof(double), cudaMemcpyHostToDevice); + mmm=cudaMemcpy(ww_g, ww, num_points*sizeof(double), cudaMemcpyHostToDevice); + mmm=cudaMemcpy(vis_real_g, vis_real, Nvis*sizeof(float), cudaMemcpyHostToDevice); + mmm=cudaMemcpy(vis_img_g, vis_img, Nvis*sizeof(float), cudaMemcpyHostToDevice); + mmm=cudaMemcpy(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), cudaMemcpyHostToDevice); + + // Call main GPU Kernel + convolve_g <<<Nbl,Nth>>> ( + num_w_planes, + num_points, + freq_per_chan, + polarizations, + uu_g, + vv_g, + ww_g, + vis_real_g, + vis_img_g, + weight_g, + dx, + dw, + KernelLen, + grid_size_x, + grid_size_y, + grid_g, + std22); + + mmm = cudaMemcpy(grid, grid_g, 2*num_w_planes*grid_size_x*grid_size_y*sizeof(double), cudaMemcpyDeviceToHost); + //for (int i=0; i<100000; i++)printf("%f\n",grid[i]); + printf("CUDA ERROR %s\n",cudaGetErrorString(mmm)); + mmm=cudaFree(uu_g); + mmm=cudaFree(vv_g); + mmm=cudaFree(ww_g); + mmm=cudaFree(vis_real_g); + mmm=cudaFree(vis_img_g); + mmm=cudaFree(weight_g); + mmm=cudaFree(grid_g); + +// Switch between CUDA and GPU versions +# else + +#ifdef _OPENMP + omp_set_num_threads(num_threads); +#endif + +#ifdef ACCOMP + long Nvis = num_points*freq_per_chan*polarizations; + // #pragma omp target data map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) + // #pragma omp target teams distribute parallel for map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom: grid[0:2*num_w_planes*grid_size_x*grid_size_y]) +#else + #pragma omp parallel for private(visindex) +#endif + for (i = 0; i < num_points; i++) + { +#ifdef _OPENMP + //int tid; + //tid = omp_get_thread_num(); + //printf("%d\n",tid); +#endif + + visindex = i*freq_per_chan*polarizations; + + double sum = 0.0; + int j, k; + //if (i%1000 == 0)printf("%ld\n",i); + + /* Convert UV coordinates to grid coordinates. */ + double pos_u = uu[i] / dx; + double pos_v = vv[i] / dx; + double ww_i = ww[i] / dw; + int grid_w = (int)ww_i; + int grid_u = (int)pos_u; + int grid_v = (int)pos_v; + + // check the boundaries + long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; + long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; + long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; + long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1; + //printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax); + + + // Convolve this point onto the grid. + for (k = kmin; k <= kmax; k++) + { + + double v_dist = (double)k+0.5 - pos_v; + + for (j = jmin; j <= jmax; j++) + { + double u_dist = (double)j+0.5 - pos_u; + long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); + + + double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist); + // Loops over frequencies and polarizations + double add_term_real = 0.0; + double add_term_img = 0.0; + long ifine = visindex; + // FOR THE MOMENT ASSUME ONE FREQUENCY PER CHANNEL AND ONE POLARISAZION + // FIRST TERM IS CALCULATED, REMAINING TERMS ARE SKIPPED + // COMMENTED DOUBLE LOOP BELOW IS THE ORIGINAL GRIDDING ALGORITHM (FOR REFERENCE) + vis_real[ifine] += grid[iKer]*conv_weight; + vis_img[ifine] += grid[iKer]*conv_weight; + + /* + // DAV: the following two loops are performend by each thread separately: no problems of race conditions + for (long ifreq=0; ifreq<freq_per_chan; ifreq++) + { + long iweight = visindex/freq_per_chan; + for (long ipol=0; ipol<polarizations; ipol++) + { + if (!isnan(vis_real[ifine])) + { + //printf("%f %ld\n",weight[iweight],iweight); + add_term_real += weight[iweight] * vis_real[ifine] * conv_weight; + add_term_img += weight[iweight] * vis_img[ifine] * conv_weight; + //if(vis_img[ifine]>1e10 || vis_img[ifine]<-1e10)printf("%f %f %f %f %ld %ld\n",vis_real[ifine],vis_img[ifine],weight[iweight],conv_weight,ifine,num_points*freq_per_chan*polarizations); + } + ifine++; + iweight++; + } + } + // DAV: this is the critical call in terms of correctness of the results and of performance + #pragma omp atomic + grid[iKer] += add_term_real; + #pragma omp atomic + grid[iKer+1] += add_term_img; + */ + } + } + + } +// End switch between CUDA and CPU versions +#endif + //for (int i=0; i<100000; i++)printf("%f\n",grid[i]); +} diff --git a/degrid.h b/degrid.h new file mode 100644 index 0000000..ace52e8 --- /dev/null +++ b/degrid.h @@ -0,0 +1,31 @@ +#ifndef W_DEGRID_H_ +#define W_DEGRID_H_ + +#define NWORKERS -1 //100 +#define NTHREADS 32 +#define PI 3.14159265359 +#define REAL_TYPE double +#ifdef __CUDACC__ +extern "C" +#endif + +void degrid( + int, + long, + long, + long, + double*, + double*, + double*, + float*, + float*, + float*, + double, + double, + int, + int, + int, + double*, + int); + +#endif diff --git a/inverse-imaging.c b/inverse-imaging.c index d3e154d..0cadb53 100644 --- a/inverse-imaging.c +++ b/inverse-imaging.c @@ -17,7 +17,7 @@ #ifdef ACCOMP #include "w-stacking_omp.h" #else -#include "w-stacking.h" +#include "degrid.h" #endif #define PI 3.14159265359 #define NUM_OF_SECTORS -1 @@ -377,12 +377,12 @@ if(rank == 0){ long * inc = (long *) malloc(sizeof(long)*naxis); int anynul; - fpixel[0] = 1; - lpixel[0] = xaxis; - inc[0] = 1; - fpixel[1] = rank*yaxis+1; - lpixel[1] = (rank+1)*yaxis+1; + fpixel[1] = 1; + lpixel[1] = xaxis; inc[1] = 1; + fpixel[0] = rank*yaxis+1; + lpixel[0] = (rank+1)*yaxis; + inc[0] = 1; fits_read_subset(fptr, TDOUBLE, fpixel, lpixel, inc, NULL, image_real, &anynul, &status); //fits_read_img(fptr, TDOUBLE, 1, xaxis*yaxis, NULL, image_real, &anynul, &status); fits_close_file(fptr, &status); @@ -407,13 +407,15 @@ if(rank == 0){ fftw_plan plan; ptrdiff_t alloc_local, local_n0, local_0_start; - // FFTW double input variable whose size is = local_n0 x 2(Nx/2+1) + // FFTW double input variable whose size is = Ny x 2(Nx/2+1) double *fftwimage; - // FFTW complex output variable whose size is = local_n0 x Nx/2 + 1 + // FFTW complex output variable whose size is = Ny x (Nx/2+1) fftw_complex *fftwgrid; alloc_local = fftw_mpi_local_size_2d(grid_size_y, grid_size_x, MPI_COMM_WORLD, &local_n0, &local_0_start); fftwimage = fftw_alloc_real(2 * alloc_local); + // this is equivalent to: + //fftwimage = (double*)calloc(yaxis*(2*(xaxis/2+1)),sizeof(double)); fftwgrid = fftw_alloc_complex(alloc_local); // Create plan for r2c DFT @@ -423,14 +425,16 @@ if(rank == 0){ // Initialize FFTW arrays long fftwindex = 0; long fftwindex2D = 0; - for (long iv = 0; iv < local_n0; iv++) - for (long iu = 0; iu < xaxis; iu++) + //for (long iv = 0; iv < yaxis; iv++) + for (long iv = 0; iv < local_n0; iv++) + for (long iu = 0; iu < xaxis; iu++) { - fftwindex2D = iu + iv*xaxis; - fftwimage[fftwindex2D] = image_real[fftwindex2D]; - fftwgrid[fftwindex2D][0] = 0.0; - fftwgrid[fftwindex2D][1] = 0.0; - } + fftwindex2D = iu + iv*(2*(xaxis/2+1)); + fftwimage[fftwindex2D] = image_real[fftwindex]; + //fftwgrid[fftwindex2D][0] = 0.0; + //fftwgrid[fftwindex2D][1] = 0.0; + fftwindex++; + } // Perform the FFT fftw_execute(plan); @@ -441,44 +445,31 @@ if(rank == 0){ double * grid; long size_of_grid = 2*num_w_planes*xaxis*yaxis; grid = (double*) calloc(size_of_grid,sizeof(double)); + fftwindex = 0; + long fftwindex2; for (int iw=0; iw<num_w_planes; iw++) { for (int iv=0; iv<yaxis; iv++) { - for (int iu=0; iu<xaxis; iu++) + for (int iu=0; iu<xaxis/2; iu++) { - fftwindex2D = iu + iv*xaxis; - fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis); - grid[fftwindex] = fftwgrid[fftwindex2D][0]; - grid[fftwindex+1] = fftwgrid[fftwindex2D][1]; + fftwindex2D = iu + iv*(xaxis/2+1); + fftwindex2 = 2*(fftwindex); + //fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis); + grid[fftwindex2] = fftwgrid[fftwindex2D][0]; + grid[fftwindex2+1] = fftwgrid[fftwindex2D][1]; #ifdef WRITE_DATA - grid_real[fftwindex2D] = fftwgrid[fftwindex2D][0]; - grid_img[fftwindex2D] = fftwgrid[fftwindex2D][1]; + grid_real[fftwindex] = fftwgrid[fftwindex2D][0]; + grid_img[fftwindex] = fftwgrid[fftwindex2D][1]; #endif + fftwindex++; } } } fftw_free(fftwgrid); - fftw_free(fftwimage); + free(fftwimage); -/* - // This may be moved inside the WRITE_DATA section: TO BE CHECKED - // Create sector grid - double * gridss; - double * gridss_w; - double * gridss_real; - double * gridss_img; - size_of_grid = 2*num_w_planes*xaxis*yaxis; - gridss = (double*) calloc(size_of_grid,sizeof(double)); - gridss_w = (double*) calloc(size_of_grid,sizeof(double)); - gridss_real = (double*) calloc(size_of_grid/2,sizeof(double)); - gridss_img = (double*) calloc(size_of_grid/2,sizeof(double)); - // Create temporary global grid -#ifndef USE_MPI - double * gridtot = (double*) calloc(2*grid_size_x*grid_size_y*num_w_planes,sizeof(double)); -#endif -*/ // Open the MPI Memory Window for the slab #ifdef USE_MPI @@ -528,7 +519,7 @@ if(rank == 0){ #endif #endif //WRITE_DATA - exit(3); + //exit(3); if(rank == 0)printf("CREATING LINKED LISTS\n"); @@ -669,11 +660,12 @@ if(rank == 0){ compose_time1 += (finishk.tv_sec - begink.tv_sec); compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0; - #ifndef USE_MPI + //#ifndef USE_MPI double uumin = 1e20; double vvmin = 1e20; double uumax = -1e20; double vvmax = -1e20; + pFile = fopen (outfile1,"w"); for (long ipart=0; ipart<Nsec; ipart++) { @@ -687,20 +679,38 @@ if(rank == 0){ } printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax); - #endif + fclose(pFile); + printf("============> 3\n"); + //#endif -#ifdef STOPHERE - // Make convolution on the grid + // Make degriddig on the visibilities #ifdef VERBOSE printf("Processing sector %ld\n",isector); #endif clock_gettime(CLOCK_MONOTONIC, &begink); startk = clock(); +/* + // Create sector grid + double * gridss_w; + double * gridss_real; + double * gridss_img; + size_of_grid = 2*num_w_planes*xaxis*yaxis; + gridss_w = (double*) calloc(size_of_grid,sizeof(double)); + gridss_real = (double*) calloc(size_of_grid/2,sizeof(double)); + gridss_img = (double*) calloc(size_of_grid/2,sizeof(double)); + // Create temporary global grid +#ifndef USE_MPI + double * gridtot = (double*) calloc(2*grid_size_x*grid_size_y*num_w_planes,sizeof(double)); +#endif +*/ - wstack(num_w_planes, + double * gridss; + gridss = (double*) calloc(size_of_grid,sizeof(double)); + + degrid(num_w_planes, Nsec, freq_per_chan, polarisations, @@ -718,6 +728,7 @@ if(rank == 0){ gridss, num_threads); +#ifdef STOPHERE /* int z =0 ; #pragma omp target map(to:test_i_gpu) map(from:z) diff --git a/w-stacking.cu b/w-stacking.cu index 4fe3625..5845cd9 100644 --- a/w-stacking.cu +++ b/w-stacking.cu @@ -1,25 +1,12 @@ #ifdef _OPENMP #include <omp.h> #endif +#include "convkernels.h" #include "w-stacking.h" #include <math.h> #include <stdlib.h> #include <stdio.h> -#ifdef ACCOMP -#pragma omp declare target -#endif -#ifdef __CUDACC__ -double __device__ -#else -double -#endif -gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist) -{ - double conv_weight; - conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22); - return conv_weight; -} #ifdef ACCOMP #pragma omp end declare target #endif -- GitLab