Skip to content
Snippets Groups Projects
Commit 15e3f465 authored by Claudio Gheller's avatar Claudio Gheller
Browse files

new degrid function introduced

parent 733cebcf
No related branches found
No related tags found
No related merge requests found
......@@ -9,3 +9,4 @@ w-stackingCfftw_serial
w-stackingfftw_serial
inverse-imaging
.*
*.tar
......@@ -42,12 +42,18 @@ ifeq (FITSIO,$(findstring FITSIO,$(OPT)))
endif
DEPS = w-stacking.h w-stacking-fftw.c w-stacking.cu phase_correction.cu
COBJ = w-stacking.o w-stacking-fftw.o phase_correction.o
DEPS = w-stacking.h w-stacking-fftw.c w-stacking.cu phase_correction.cu degrid.cu convkernels.cu
COBJ = w-stacking.o w-stacking-fftw.o phase_correction.o degrid.o convkernels.o
convkernels.c: convkernels.cu
cp convkernels.cu convkernels.c
w-stacking.c: w-stacking.cu
cp w-stacking.cu w-stacking.c
degrid.c: degrid.cu
cp degrid.cu degrid.c
phase_correction.c: phase_correction.cu
cp phase_correction.cu phase_correction.c
......@@ -78,7 +84,7 @@ serial_cuda:
mpi: $(COBJ)
$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw $^ $(CFLAGS) $(LIBS)
$(MPICC) $(OPTIMIZE) $(OPT) -o inverse-imaging inverse-imaging.c w-stacking.c $(CFLAGS) $(LIBS)
$(MPICC) $(OPTIMIZE) $(OPT) -o inverse-imaging inverse-imaging.c degrid.c convkernels.c $(CFLAGS) $(LIBS)
mpi_cuda:
$(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
......@@ -89,3 +95,4 @@ clean:
rm *.o
rm w-stacking.c
rm phase_correction.c
rm degrid.c
#include "convkernels.h"
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#ifdef ACCOMP
#pragma omp declare target
#endif
#ifdef __CUDACC__
double __device__
#else
double
#endif
gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
{
double conv_weight;
conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
return conv_weight;
}
#include "convkernels.h"
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#ifdef ACCOMP
#pragma omp declare target
#endif
#ifdef __CUDACC__
double __device__
#else
double
#endif
gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
{
double conv_weight;
conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
return conv_weight;
}
#ifndef W_KERNELS
#define W_KERNELS
#ifdef ACCOMP
#pragma omp declare target
#endif
#ifdef __CUDACC__
double __device__
#else
double
#endif
gauss_kernel_norm(double, double, double, double);
#endif
degrid.c 0 → 100644
#ifdef _OPENMP
#include <omp.h>
#endif
#include "convkernels.h"
#include "degrid.h"
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#ifdef ACCOMP
#pragma omp end declare target
#endif
#ifdef __CUDACC__
//double __device__ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
//{
// double conv_weight;
// conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
// return conv_weight;
//}
__global__ void convolve_g(
int num_w_planes,
long num_points,
long freq_per_chan,
long polarizations,
double* uu,
double* vv,
double* ww,
float* vis_real,
float* vis_img,
float* weight,
double dx,
double dw,
int KernelLen,
int grid_size_x,
int grid_size_y,
double* grid,
double std22)
{
long gid = blockIdx.x*blockDim.x + threadIdx.x;
if(gid < num_points)
{
long i = gid;
long visindex = i*freq_per_chan*polarizations;
double norm = std22/PI;
int j, k;
/* Convert UV coordinates to grid coordinates. */
double pos_u = uu[i] / dx;
double pos_v = vv[i] / dx;
double ww_i = ww[i] / dw;
int grid_w = (int)ww_i;
int grid_u = (int)pos_u;
int grid_v = (int)pos_v;
// check the boundaries
long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
//printf("%ld, %ld, %ld, %ld\n",jmin,jmax,kmin,kmax);
// Convolve this point onto the grid.
for (k = kmin; k <= kmax; k++)
{
double v_dist = (double)k+0.5 - pos_v;
for (j = jmin; j <= jmax; j++)
{
double u_dist = (double)j+0.5 - pos_u;
long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
//printf("--> %ld %d %d %d\n",iKer,j,k,grid_w);
double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
// Loops over frequencies and polarizations
double add_term_real = 0.0;
double add_term_img = 0.0;
long ifine = visindex;
// FOR THE MOMENT ASSUME ONE FREQUENCY PER CHANNEL AND ONE POLARISAZION
// FIRST TERM IS CALCULATED, REMAINING TERMS ARE SKIPPED
// COMMENTED DOUBLE LOOP BELOW IS THE ORIGINAL GRIDDING ALGORITHM (FOR REFERENCE)
vis_real[ifine] += grid[iKer]*conv_weight;
vis_img[ifine] += grid[iKer]*conv_weight;
/*
for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
{
long iweight = visindex/freq_per_chan;
for (long ipol=0; ipol<polarizations; ipol++)
{
double vistest = (double)vis_real[ifine];
if (!isnan(vistest))
{
add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
}
ifine++;
iweight++;
}
}
atomicAdd(&(grid[iKer]),add_term_real);
atomicAdd(&(grid[iKer+1]),add_term_img);
*/
}
}
}
}
#endif
#ifdef ACCOMP
#pragma omp declare target
#endif
void degrid(
int num_w_planes,
long num_points,
long freq_per_chan,
long polarizations,
double* uu,
double* vv,
double* ww,
float* vis_real,
float* vis_img,
float* weight,
double dx,
double dw,
int w_support,
int grid_size_x,
int grid_size_y,
double* grid,
int num_threads)
{
long i;
long index;
long visindex;
// initialize the convolution kernel
// gaussian:
int KernelLen = (w_support-1)/2;
double std = 1.0;
double std22 = 1.0/(2.0*std*std);
double norm = std22/PI;
// Loop over visibilities.
// Switch between CUDA and GPU versions
#ifdef __CUDACC__
// Define the CUDA set up
int Nth = NTHREADS;
long Nbl = (long)(num_points/Nth) + 1;
if(NWORKERS == 1) {Nbl = 1; Nth = 1;};
long Nvis = num_points*freq_per_chan*polarizations;
printf("Running on GPU with %d threads and %d blocks\n",Nth,Nbl);
// Create GPU arrays and offload them
double * uu_g;
double * vv_g;
double * ww_g;
float * vis_real_g;
float * vis_img_g;
float * weight_g;
double * grid_g;
//for (int i=0; i<100000; i++)grid[i]=23.0;
cudaError_t mmm;
mmm=cudaMalloc(&uu_g,num_points*sizeof(double));
mmm=cudaMalloc(&vv_g,num_points*sizeof(double));
mmm=cudaMalloc(&ww_g,num_points*sizeof(double));
mmm=cudaMalloc(&vis_real_g,Nvis*sizeof(float));
mmm=cudaMalloc(&vis_img_g,Nvis*sizeof(float));
mmm=cudaMalloc(&weight_g,(Nvis/freq_per_chan)*sizeof(float));
mmm=cudaMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
mmm=cudaMemset(grid_g,0.0,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
mmm=cudaMemcpy(uu_g, uu, num_points*sizeof(double), cudaMemcpyHostToDevice);
mmm=cudaMemcpy(vv_g, vv, num_points*sizeof(double), cudaMemcpyHostToDevice);
mmm=cudaMemcpy(ww_g, ww, num_points*sizeof(double), cudaMemcpyHostToDevice);
mmm=cudaMemcpy(vis_real_g, vis_real, Nvis*sizeof(float), cudaMemcpyHostToDevice);
mmm=cudaMemcpy(vis_img_g, vis_img, Nvis*sizeof(float), cudaMemcpyHostToDevice);
mmm=cudaMemcpy(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), cudaMemcpyHostToDevice);
// Call main GPU Kernel
convolve_g <<<Nbl,Nth>>> (
num_w_planes,
num_points,
freq_per_chan,
polarizations,
uu_g,
vv_g,
ww_g,
vis_real_g,
vis_img_g,
weight_g,
dx,
dw,
KernelLen,
grid_size_x,
grid_size_y,
grid_g,
std22);
mmm = cudaMemcpy(grid, grid_g, 2*num_w_planes*grid_size_x*grid_size_y*sizeof(double), cudaMemcpyDeviceToHost);
//for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
printf("CUDA ERROR %s\n",cudaGetErrorString(mmm));
mmm=cudaFree(uu_g);
mmm=cudaFree(vv_g);
mmm=cudaFree(ww_g);
mmm=cudaFree(vis_real_g);
mmm=cudaFree(vis_img_g);
mmm=cudaFree(weight_g);
mmm=cudaFree(grid_g);
// Switch between CUDA and GPU versions
# else
#ifdef _OPENMP
omp_set_num_threads(num_threads);
#endif
#ifdef ACCOMP
long Nvis = num_points*freq_per_chan*polarizations;
// #pragma omp target data map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan])
// #pragma omp target teams distribute parallel for map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom: grid[0:2*num_w_planes*grid_size_x*grid_size_y])
#else
#pragma omp parallel for private(visindex)
#endif
for (i = 0; i < num_points; i++)
{
#ifdef _OPENMP
//int tid;
//tid = omp_get_thread_num();
//printf("%d\n",tid);
#endif
visindex = i*freq_per_chan*polarizations;
double sum = 0.0;
int j, k;
//if (i%1000 == 0)printf("%ld\n",i);
/* Convert UV coordinates to grid coordinates. */
double pos_u = uu[i] / dx;
double pos_v = vv[i] / dx;
double ww_i = ww[i] / dw;
int grid_w = (int)ww_i;
int grid_u = (int)pos_u;
int grid_v = (int)pos_v;
// check the boundaries
long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
//printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax);
// Convolve this point onto the grid.
for (k = kmin; k <= kmax; k++)
{
double v_dist = (double)k+0.5 - pos_v;
for (j = jmin; j <= jmax; j++)
{
double u_dist = (double)j+0.5 - pos_u;
long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
// Loops over frequencies and polarizations
double add_term_real = 0.0;
double add_term_img = 0.0;
long ifine = visindex;
// FOR THE MOMENT ASSUME ONE FREQUENCY PER CHANNEL AND ONE POLARISAZION
// FIRST TERM IS CALCULATED, REMAINING TERMS ARE SKIPPED
// COMMENTED DOUBLE LOOP BELOW IS THE ORIGINAL GRIDDING ALGORITHM (FOR REFERENCE)
vis_real[ifine] += grid[iKer]*conv_weight;
vis_img[ifine] += grid[iKer]*conv_weight;
/*
// DAV: the following two loops are performend by each thread separately: no problems of race conditions
for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
{
long iweight = visindex/freq_per_chan;
for (long ipol=0; ipol<polarizations; ipol++)
{
if (!isnan(vis_real[ifine]))
{
//printf("%f %ld\n",weight[iweight],iweight);
add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
//if(vis_img[ifine]>1e10 || vis_img[ifine]<-1e10)printf("%f %f %f %f %ld %ld\n",vis_real[ifine],vis_img[ifine],weight[iweight],conv_weight,ifine,num_points*freq_per_chan*polarizations);
}
ifine++;
iweight++;
}
}
// DAV: this is the critical call in terms of correctness of the results and of performance
#pragma omp atomic
grid[iKer] += add_term_real;
#pragma omp atomic
grid[iKer+1] += add_term_img;
*/
}
}
}
// End switch between CUDA and CPU versions
#endif
//for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
}
degrid.cu 0 → 100644
#ifdef _OPENMP
#include <omp.h>
#endif
#include "convkernels.h"
#include "degrid.h"
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#ifdef ACCOMP
#pragma omp end declare target
#endif
#ifdef __CUDACC__
//double __device__ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
//{
// double conv_weight;
// conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
// return conv_weight;
//}
__global__ void convolve_g(
int num_w_planes,
long num_points,
long freq_per_chan,
long polarizations,
double* uu,
double* vv,
double* ww,
float* vis_real,
float* vis_img,
float* weight,
double dx,
double dw,
int KernelLen,
int grid_size_x,
int grid_size_y,
double* grid,
double std22)
{
long gid = blockIdx.x*blockDim.x + threadIdx.x;
if(gid < num_points)
{
long i = gid;
long visindex = i*freq_per_chan*polarizations;
double norm = std22/PI;
int j, k;
/* Convert UV coordinates to grid coordinates. */
double pos_u = uu[i] / dx;
double pos_v = vv[i] / dx;
double ww_i = ww[i] / dw;
int grid_w = (int)ww_i;
int grid_u = (int)pos_u;
int grid_v = (int)pos_v;
// check the boundaries
long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
//printf("%ld, %ld, %ld, %ld\n",jmin,jmax,kmin,kmax);
// Convolve this point onto the grid.
for (k = kmin; k <= kmax; k++)
{
double v_dist = (double)k+0.5 - pos_v;
for (j = jmin; j <= jmax; j++)
{
double u_dist = (double)j+0.5 - pos_u;
long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
//printf("--> %ld %d %d %d\n",iKer,j,k,grid_w);
double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
// Loops over frequencies and polarizations
double add_term_real = 0.0;
double add_term_img = 0.0;
long ifine = visindex;
// FOR THE MOMENT ASSUME ONE FREQUENCY PER CHANNEL AND ONE POLARISAZION
// FIRST TERM IS CALCULATED, REMAINING TERMS ARE SKIPPED
// COMMENTED DOUBLE LOOP BELOW IS THE ORIGINAL GRIDDING ALGORITHM (FOR REFERENCE)
vis_real[ifine] += grid[iKer]*conv_weight;
vis_img[ifine] += grid[iKer]*conv_weight;
/*
for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
{
long iweight = visindex/freq_per_chan;
for (long ipol=0; ipol<polarizations; ipol++)
{
double vistest = (double)vis_real[ifine];
if (!isnan(vistest))
{
add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
}
ifine++;
iweight++;
}
}
atomicAdd(&(grid[iKer]),add_term_real);
atomicAdd(&(grid[iKer+1]),add_term_img);
*/
}
}
}
}
#endif
#ifdef ACCOMP
#pragma omp declare target
#endif
void degrid(
int num_w_planes,
long num_points,
long freq_per_chan,
long polarizations,
double* uu,
double* vv,
double* ww,
float* vis_real,
float* vis_img,
float* weight,
double dx,
double dw,
int w_support,
int grid_size_x,
int grid_size_y,
double* grid,
int num_threads)
{
long i;
long index;
long visindex;
// initialize the convolution kernel
// gaussian:
int KernelLen = (w_support-1)/2;
double std = 1.0;
double std22 = 1.0/(2.0*std*std);
double norm = std22/PI;
// Loop over visibilities.
// Switch between CUDA and GPU versions
#ifdef __CUDACC__
// Define the CUDA set up
int Nth = NTHREADS;
long Nbl = (long)(num_points/Nth) + 1;
if(NWORKERS == 1) {Nbl = 1; Nth = 1;};
long Nvis = num_points*freq_per_chan*polarizations;
printf("Running on GPU with %d threads and %d blocks\n",Nth,Nbl);
// Create GPU arrays and offload them
double * uu_g;
double * vv_g;
double * ww_g;
float * vis_real_g;
float * vis_img_g;
float * weight_g;
double * grid_g;
//for (int i=0; i<100000; i++)grid[i]=23.0;
cudaError_t mmm;
mmm=cudaMalloc(&uu_g,num_points*sizeof(double));
mmm=cudaMalloc(&vv_g,num_points*sizeof(double));
mmm=cudaMalloc(&ww_g,num_points*sizeof(double));
mmm=cudaMalloc(&vis_real_g,Nvis*sizeof(float));
mmm=cudaMalloc(&vis_img_g,Nvis*sizeof(float));
mmm=cudaMalloc(&weight_g,(Nvis/freq_per_chan)*sizeof(float));
mmm=cudaMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
mmm=cudaMemset(grid_g,0.0,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
mmm=cudaMemcpy(uu_g, uu, num_points*sizeof(double), cudaMemcpyHostToDevice);
mmm=cudaMemcpy(vv_g, vv, num_points*sizeof(double), cudaMemcpyHostToDevice);
mmm=cudaMemcpy(ww_g, ww, num_points*sizeof(double), cudaMemcpyHostToDevice);
mmm=cudaMemcpy(vis_real_g, vis_real, Nvis*sizeof(float), cudaMemcpyHostToDevice);
mmm=cudaMemcpy(vis_img_g, vis_img, Nvis*sizeof(float), cudaMemcpyHostToDevice);
mmm=cudaMemcpy(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), cudaMemcpyHostToDevice);
// Call main GPU Kernel
convolve_g <<<Nbl,Nth>>> (
num_w_planes,
num_points,
freq_per_chan,
polarizations,
uu_g,
vv_g,
ww_g,
vis_real_g,
vis_img_g,
weight_g,
dx,
dw,
KernelLen,
grid_size_x,
grid_size_y,
grid_g,
std22);
mmm = cudaMemcpy(grid, grid_g, 2*num_w_planes*grid_size_x*grid_size_y*sizeof(double), cudaMemcpyDeviceToHost);
//for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
printf("CUDA ERROR %s\n",cudaGetErrorString(mmm));
mmm=cudaFree(uu_g);
mmm=cudaFree(vv_g);
mmm=cudaFree(ww_g);
mmm=cudaFree(vis_real_g);
mmm=cudaFree(vis_img_g);
mmm=cudaFree(weight_g);
mmm=cudaFree(grid_g);
// Switch between CUDA and GPU versions
# else
#ifdef _OPENMP
omp_set_num_threads(num_threads);
#endif
#ifdef ACCOMP
long Nvis = num_points*freq_per_chan*polarizations;
// #pragma omp target data map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan])
// #pragma omp target teams distribute parallel for map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom: grid[0:2*num_w_planes*grid_size_x*grid_size_y])
#else
#pragma omp parallel for private(visindex)
#endif
for (i = 0; i < num_points; i++)
{
#ifdef _OPENMP
//int tid;
//tid = omp_get_thread_num();
//printf("%d\n",tid);
#endif
visindex = i*freq_per_chan*polarizations;
double sum = 0.0;
int j, k;
//if (i%1000 == 0)printf("%ld\n",i);
/* Convert UV coordinates to grid coordinates. */
double pos_u = uu[i] / dx;
double pos_v = vv[i] / dx;
double ww_i = ww[i] / dw;
int grid_w = (int)ww_i;
int grid_u = (int)pos_u;
int grid_v = (int)pos_v;
// check the boundaries
long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
//printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax);
// Convolve this point onto the grid.
for (k = kmin; k <= kmax; k++)
{
double v_dist = (double)k+0.5 - pos_v;
for (j = jmin; j <= jmax; j++)
{
double u_dist = (double)j+0.5 - pos_u;
long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
// Loops over frequencies and polarizations
double add_term_real = 0.0;
double add_term_img = 0.0;
long ifine = visindex;
// FOR THE MOMENT ASSUME ONE FREQUENCY PER CHANNEL AND ONE POLARISAZION
// FIRST TERM IS CALCULATED, REMAINING TERMS ARE SKIPPED
// COMMENTED DOUBLE LOOP BELOW IS THE ORIGINAL GRIDDING ALGORITHM (FOR REFERENCE)
vis_real[ifine] += grid[iKer]*conv_weight;
vis_img[ifine] += grid[iKer]*conv_weight;
/*
// DAV: the following two loops are performend by each thread separately: no problems of race conditions
for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
{
long iweight = visindex/freq_per_chan;
for (long ipol=0; ipol<polarizations; ipol++)
{
if (!isnan(vis_real[ifine]))
{
//printf("%f %ld\n",weight[iweight],iweight);
add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
//if(vis_img[ifine]>1e10 || vis_img[ifine]<-1e10)printf("%f %f %f %f %ld %ld\n",vis_real[ifine],vis_img[ifine],weight[iweight],conv_weight,ifine,num_points*freq_per_chan*polarizations);
}
ifine++;
iweight++;
}
}
// DAV: this is the critical call in terms of correctness of the results and of performance
#pragma omp atomic
grid[iKer] += add_term_real;
#pragma omp atomic
grid[iKer+1] += add_term_img;
*/
}
}
}
// End switch between CUDA and CPU versions
#endif
//for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
}
degrid.h 0 → 100644
#ifndef W_DEGRID_H_
#define W_DEGRID_H_
#define NWORKERS -1 //100
#define NTHREADS 32
#define PI 3.14159265359
#define REAL_TYPE double
#ifdef __CUDACC__
extern "C"
#endif
void degrid(
int,
long,
long,
long,
double*,
double*,
double*,
float*,
float*,
float*,
double,
double,
int,
int,
int,
double*,
int);
#endif
......@@ -17,7 +17,7 @@
#ifdef ACCOMP
#include "w-stacking_omp.h"
#else
#include "w-stacking.h"
#include "degrid.h"
#endif
#define PI 3.14159265359
#define NUM_OF_SECTORS -1
......@@ -377,12 +377,12 @@ if(rank == 0){
long * inc = (long *) malloc(sizeof(long)*naxis);
int anynul;
fpixel[0] = 1;
lpixel[0] = xaxis;
inc[0] = 1;
fpixel[1] = rank*yaxis+1;
lpixel[1] = (rank+1)*yaxis+1;
fpixel[1] = 1;
lpixel[1] = xaxis;
inc[1] = 1;
fpixel[0] = rank*yaxis+1;
lpixel[0] = (rank+1)*yaxis;
inc[0] = 1;
fits_read_subset(fptr, TDOUBLE, fpixel, lpixel, inc, NULL, image_real, &anynul, &status);
//fits_read_img(fptr, TDOUBLE, 1, xaxis*yaxis, NULL, image_real, &anynul, &status);
fits_close_file(fptr, &status);
......@@ -407,13 +407,15 @@ if(rank == 0){
fftw_plan plan;
ptrdiff_t alloc_local, local_n0, local_0_start;
// FFTW double input variable whose size is = local_n0 x 2(Nx/2+1)
// FFTW double input variable whose size is = Ny x 2(Nx/2+1)
double *fftwimage;
// FFTW complex output variable whose size is = local_n0 x Nx/2 + 1
// FFTW complex output variable whose size is = Ny x (Nx/2+1)
fftw_complex *fftwgrid;
alloc_local = fftw_mpi_local_size_2d(grid_size_y, grid_size_x, MPI_COMM_WORLD, &local_n0, &local_0_start);
fftwimage = fftw_alloc_real(2 * alloc_local);
// this is equivalent to:
//fftwimage = (double*)calloc(yaxis*(2*(xaxis/2+1)),sizeof(double));
fftwgrid = fftw_alloc_complex(alloc_local);
// Create plan for r2c DFT
......@@ -423,13 +425,15 @@ if(rank == 0){
// Initialize FFTW arrays
long fftwindex = 0;
long fftwindex2D = 0;
//for (long iv = 0; iv < yaxis; iv++)
for (long iv = 0; iv < local_n0; iv++)
for (long iu = 0; iu < xaxis; iu++)
{
fftwindex2D = iu + iv*xaxis;
fftwimage[fftwindex2D] = image_real[fftwindex2D];
fftwgrid[fftwindex2D][0] = 0.0;
fftwgrid[fftwindex2D][1] = 0.0;
fftwindex2D = iu + iv*(2*(xaxis/2+1));
fftwimage[fftwindex2D] = image_real[fftwindex];
//fftwgrid[fftwindex2D][0] = 0.0;
//fftwgrid[fftwindex2D][1] = 0.0;
fftwindex++;
}
// Perform the FFT
......@@ -441,44 +445,31 @@ if(rank == 0){
double * grid;
long size_of_grid = 2*num_w_planes*xaxis*yaxis;
grid = (double*) calloc(size_of_grid,sizeof(double));
fftwindex = 0;
long fftwindex2;
for (int iw=0; iw<num_w_planes; iw++)
{
for (int iv=0; iv<yaxis; iv++)
{
for (int iu=0; iu<xaxis; iu++)
for (int iu=0; iu<xaxis/2; iu++)
{
fftwindex2D = iu + iv*xaxis;
fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis);
grid[fftwindex] = fftwgrid[fftwindex2D][0];
grid[fftwindex+1] = fftwgrid[fftwindex2D][1];
fftwindex2D = iu + iv*(xaxis/2+1);
fftwindex2 = 2*(fftwindex);
//fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis);
grid[fftwindex2] = fftwgrid[fftwindex2D][0];
grid[fftwindex2+1] = fftwgrid[fftwindex2D][1];
#ifdef WRITE_DATA
grid_real[fftwindex2D] = fftwgrid[fftwindex2D][0];
grid_img[fftwindex2D] = fftwgrid[fftwindex2D][1];
grid_real[fftwindex] = fftwgrid[fftwindex2D][0];
grid_img[fftwindex] = fftwgrid[fftwindex2D][1];
#endif
fftwindex++;
}
}
}
fftw_free(fftwgrid);
fftw_free(fftwimage);
free(fftwimage);
/*
// This may be moved inside the WRITE_DATA section: TO BE CHECKED
// Create sector grid
double * gridss;
double * gridss_w;
double * gridss_real;
double * gridss_img;
size_of_grid = 2*num_w_planes*xaxis*yaxis;
gridss = (double*) calloc(size_of_grid,sizeof(double));
gridss_w = (double*) calloc(size_of_grid,sizeof(double));
gridss_real = (double*) calloc(size_of_grid/2,sizeof(double));
gridss_img = (double*) calloc(size_of_grid/2,sizeof(double));
// Create temporary global grid
#ifndef USE_MPI
double * gridtot = (double*) calloc(2*grid_size_x*grid_size_y*num_w_planes,sizeof(double));
#endif
*/
// Open the MPI Memory Window for the slab
#ifdef USE_MPI
......@@ -528,7 +519,7 @@ if(rank == 0){
#endif
#endif //WRITE_DATA
exit(3);
//exit(3);
if(rank == 0)printf("CREATING LINKED LISTS\n");
......@@ -669,11 +660,12 @@ if(rank == 0){
compose_time1 += (finishk.tv_sec - begink.tv_sec);
compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
#ifndef USE_MPI
//#ifndef USE_MPI
double uumin = 1e20;
double vvmin = 1e20;
double uumax = -1e20;
double vvmax = -1e20;
pFile = fopen (outfile1,"w");
for (long ipart=0; ipart<Nsec; ipart++)
{
......@@ -687,20 +679,38 @@ if(rank == 0){
}
printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax);
#endif
fclose(pFile);
printf("============> 3\n");
//#endif
#ifdef STOPHERE
// Make convolution on the grid
// Make degriddig on the visibilities
#ifdef VERBOSE
printf("Processing sector %ld\n",isector);
#endif
clock_gettime(CLOCK_MONOTONIC, &begink);
startk = clock();
/*
// Create sector grid
double * gridss_w;
double * gridss_real;
double * gridss_img;
size_of_grid = 2*num_w_planes*xaxis*yaxis;
gridss_w = (double*) calloc(size_of_grid,sizeof(double));
gridss_real = (double*) calloc(size_of_grid/2,sizeof(double));
gridss_img = (double*) calloc(size_of_grid/2,sizeof(double));
// Create temporary global grid
#ifndef USE_MPI
double * gridtot = (double*) calloc(2*grid_size_x*grid_size_y*num_w_planes,sizeof(double));
#endif
*/
wstack(num_w_planes,
double * gridss;
gridss = (double*) calloc(size_of_grid,sizeof(double));
degrid(num_w_planes,
Nsec,
freq_per_chan,
polarisations,
......@@ -718,6 +728,7 @@ if(rank == 0){
gridss,
num_threads);
#ifdef STOPHERE
/*
int z =0 ;
#pragma omp target map(to:test_i_gpu) map(from:z)
......
#ifdef _OPENMP
#include <omp.h>
#endif
#include "convkernels.h"
#include "w-stacking.h"
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#ifdef ACCOMP
#pragma omp declare target
#endif
#ifdef __CUDACC__
double __device__
#else
double
#endif
gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
{
double conv_weight;
conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
return conv_weight;
}
#ifdef ACCOMP
#pragma omp end declare target
#endif
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment