Skip to content
Snippets Groups Projects
Commit 9bf1af25 authored by Claudio Gheller's avatar Claudio Gheller
Browse files

first commit

parent c103410e
No related branches found
No related tags found
No related merge requests found
Makefile 0 → 100644
OPT += -DUSE_MPI
OPT += -DUSE_FFTW
OPT += -DONE_SIDE
OPT += -DWRITE_DATA
CC = gcc
CXX = g++
MPICC = mpicc
MPICXX =mpiCC
CFLAGS += -O3 -mcpu=native
CFLAGS += -I.
LIBS = -L$(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm
NVCC = nvcc
NVFLAGS = -arch=sm_70 -c w-stacking.cu -Xcompiler -mno-float128 -std=c++11
NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda
DEPS = w-stacking.h
COBJ = w-stacking.o w-stacking-fftw.o
w-stacking.c:
cp w-stacking.cu w-stacking.c
%.o: %.c $(DEPS)
$(CC) -c -o $@ $< $(CFLAGS) $(OPT)
serial: $(COBJ)
$(CC) -o w-stackingCfftw_serial $(CFLAGS) $^ -lm
serial_cuda:
$(NVCC) $(NVFLAGS) -c w-stacking.cu $(NVLIB)
$(CC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c
$(CXX) $(CFLAGS) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o $(NVLIB) -lm
mpi: $(COBJ)
$(MPICC) -o w-stackingCfftw $(CFLAGS) $^ $(LIBS)
mpi_cuda:
$(NVCC) $(NVFLAGS) -c w-stacking.cu $(NVLIB)
$(MPICC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c
$(MPICXX) $(CFLAGS) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o $(NVLIB) $(LIBS) -lm
clean:
rm *.o
rm w-stacking.c
peano.c 0 → 100644
#include "peano.h"
//typedef unsigned long long peanokey;
/* The following rewrite of the original function
* peano_hilbert_key_old() has been written by MARTIN REINECKE.
* It is about a factor 2.3 - 2.5 faster than Volker's old routine!
*/
const unsigned char rottable3[48][8] = {
{36, 28, 25, 27, 10, 10, 25, 27},
{29, 11, 24, 24, 37, 11, 26, 26},
{8, 8, 25, 27, 30, 38, 25, 27},
{9, 39, 24, 24, 9, 31, 26, 26},
{40, 24, 44, 32, 40, 6, 44, 6},
{25, 7, 33, 7, 41, 41, 45, 45},
{4, 42, 4, 46, 26, 42, 34, 46},
{43, 43, 47, 47, 5, 27, 5, 35},
{33, 35, 36, 28, 33, 35, 2, 2},
{32, 32, 29, 3, 34, 34, 37, 3},
{33, 35, 0, 0, 33, 35, 30, 38},
{32, 32, 1, 39, 34, 34, 1, 31},
{24, 42, 32, 46, 14, 42, 14, 46},
{43, 43, 47, 47, 25, 15, 33, 15},
{40, 12, 44, 12, 40, 26, 44, 34},
{13, 27, 13, 35, 41, 41, 45, 45},
{28, 41, 28, 22, 38, 43, 38, 22},
{42, 40, 23, 23, 29, 39, 29, 39},
{41, 36, 20, 36, 43, 30, 20, 30},
{37, 31, 37, 31, 42, 40, 21, 21},
{28, 18, 28, 45, 38, 18, 38, 47},
{19, 19, 46, 44, 29, 39, 29, 39},
{16, 36, 45, 36, 16, 30, 47, 30},
{37, 31, 37, 31, 17, 17, 46, 44},
{12, 4, 1, 3, 34, 34, 1, 3},
{5, 35, 0, 0, 13, 35, 2, 2},
{32, 32, 1, 3, 6, 14, 1, 3},
{33, 15, 0, 0, 33, 7, 2, 2},
{16, 0, 20, 8, 16, 30, 20, 30},
{1, 31, 9, 31, 17, 17, 21, 21},
{28, 18, 28, 22, 2, 18, 10, 22},
{19, 19, 23, 23, 29, 3, 29, 11},
{9, 11, 12, 4, 9, 11, 26, 26},
{8, 8, 5, 27, 10, 10, 13, 27},
{9, 11, 24, 24, 9, 11, 6, 14},
{8, 8, 25, 15, 10, 10, 25, 7},
{0, 18, 8, 22, 38, 18, 38, 22},
{19, 19, 23, 23, 1, 39, 9, 39},
{16, 36, 20, 36, 16, 2, 20, 10},
{37, 3, 37, 11, 17, 17, 21, 21},
{4, 17, 4, 46, 14, 19, 14, 46},
{18, 16, 47, 47, 5, 15, 5, 15},
{17, 12, 44, 12, 19, 6, 44, 6},
{13, 7, 13, 7, 18, 16, 45, 45},
{4, 42, 4, 21, 14, 42, 14, 23},
{43, 43, 22, 20, 5, 15, 5, 15},
{40, 12, 21, 12, 40, 6, 23, 6},
{13, 7, 13, 7, 41, 41, 22, 20}
};
const unsigned char subpix3[48][8] = {
{0, 7, 1, 6, 3, 4, 2, 5},
{7, 4, 6, 5, 0, 3, 1, 2},
{4, 3, 5, 2, 7, 0, 6, 1},
{3, 0, 2, 1, 4, 7, 5, 6},
{1, 0, 6, 7, 2, 3, 5, 4},
{0, 3, 7, 4, 1, 2, 6, 5},
{3, 2, 4, 5, 0, 1, 7, 6},
{2, 1, 5, 6, 3, 0, 4, 7},
{6, 1, 7, 0, 5, 2, 4, 3},
{1, 2, 0, 3, 6, 5, 7, 4},
{2, 5, 3, 4, 1, 6, 0, 7},
{5, 6, 4, 7, 2, 1, 3, 0},
{7, 6, 0, 1, 4, 5, 3, 2},
{6, 5, 1, 2, 7, 4, 0, 3},
{5, 4, 2, 3, 6, 7, 1, 0},
{4, 7, 3, 0, 5, 6, 2, 1},
{6, 7, 5, 4, 1, 0, 2, 3},
{7, 0, 4, 3, 6, 1, 5, 2},
{0, 1, 3, 2, 7, 6, 4, 5},
{1, 6, 2, 5, 0, 7, 3, 4},
{2, 3, 1, 0, 5, 4, 6, 7},
{3, 4, 0, 7, 2, 5, 1, 6},
{4, 5, 7, 6, 3, 2, 0, 1},
{5, 2, 6, 1, 4, 3, 7, 0},
{7, 0, 6, 1, 4, 3, 5, 2},
{0, 3, 1, 2, 7, 4, 6, 5},
{3, 4, 2, 5, 0, 7, 1, 6},
{4, 7, 5, 6, 3, 0, 2, 1},
{6, 7, 1, 0, 5, 4, 2, 3},
{7, 4, 0, 3, 6, 5, 1, 2},
{4, 5, 3, 2, 7, 6, 0, 1},
{5, 6, 2, 1, 4, 7, 3, 0},
{1, 6, 0, 7, 2, 5, 3, 4},
{6, 5, 7, 4, 1, 2, 0, 3},
{5, 2, 4, 3, 6, 1, 7, 0},
{2, 1, 3, 0, 5, 6, 4, 7},
{0, 1, 7, 6, 3, 2, 4, 5},
{1, 2, 6, 5, 0, 3, 7, 4},
{2, 3, 5, 4, 1, 0, 6, 7},
{3, 0, 4, 7, 2, 1, 5, 6},
{1, 0, 2, 3, 6, 7, 5, 4},
{0, 7, 3, 4, 1, 6, 2, 5},
{7, 6, 4, 5, 0, 1, 3, 2},
{6, 1, 5, 2, 7, 0, 4, 3},
{5, 4, 6, 7, 2, 3, 1, 0},
{4, 3, 7, 0, 5, 2, 6, 1},
{3, 2, 0, 1, 4, 5, 7, 6},
{2, 5, 1, 6, 3, 4, 0, 7}
};
/*! This function computes a Peano-Hilbert key for an integer triplet (x,y,z),
* with x,y,z in the range between 0 and 2^bits-1.
*/
peanokey peano_hilbert_key(int x, int y, int z, int bits)
{
int mask;
unsigned char rotation = 0;
peanokey key = 0;
for(mask = 1 << (bits - 1); mask > 0; mask >>= 1)
{
unsigned char pix = ((x & mask) ? 4 : 0) | ((y & mask) ? 2 : 0) | ((z & mask) ? 1 : 0);
key <<= 3;
key |= subpix3[rotation][pix];
rotation = rottable3[rotation][pix];
}
return key;
}
typedef unsigned long long peanokey;
peanokey peano_hilbert_key(int, int, int, int);
This diff is collapsed.
#ifdef _OPENMP
#include <omp.h>
#endif
#include "w-stacking.h"
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#define NWORKERS -1 //100
#define NTHREADS 32
#ifdef __CUDACC__
double __device__
#else
double
#endif
gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
{
double conv_weight;
conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
return conv_weight;
}
#ifdef __CUDACC__
//double __device__ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
//{
// double conv_weight;
// conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
// return conv_weight;
//}
__global__ void convolve_g(
int num_w_planes,
long num_points,
long freq_per_chan,
long polarizations,
double* uu,
double* vv,
double* ww,
float* vis_real,
float* vis_img,
float* weight,
double dx,
double dw,
int KernelLen,
int grid_size_x,
int grid_size_y,
double* grid,
double std22)
{
//printf("DENTRO AL KERNEL\n");
long gid = blockIdx.x*blockDim.x + threadIdx.x;
if(gid < num_points)
{
long i = gid;
long visindex = i*freq_per_chan*polarizations;
double norm = std22/PI;
int j, k;
/* Convert UV coordinates to grid coordinates. */
double pos_u = uu[i] / dx;
double pos_v = vv[i] / dx;
double ww_i = ww[i] / dw;
int grid_w = (int)ww_i;
int grid_u = (int)pos_u;
int grid_v = (int)pos_v;
// check the boundaries
long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
//printf("%ld, %ld, %ld, %ld\n",jmin,jmax,kmin,kmax);
// Convolve this point onto the grid.
for (k = kmin; k <= kmax; k++)
{
double v_dist = (double)k+0.5 - pos_v;
for (j = jmin; j <= jmax; j++)
{
double u_dist = (double)j+0.5 - pos_u;
long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
//printf("--> %ld %d %d %d\n",iKer,j,k,grid_w);
double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
// Loops over frequencies and polarizations
double add_term_real = 0.0;
double add_term_img = 0.0;
long ifine = visindex;
for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
{
long iweight = visindex/freq_per_chan;
for (long ipol=0; ipol<polarizations; ipol++)
{
double vistest = (double)vis_real[ifine];
if (!isnan(vistest))
{
add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
}
ifine++;
iweight++;
}
}
atomicAdd(&(grid[iKer]),add_term_real);
atomicAdd(&(grid[iKer+1]),add_term_img);
}
}
}
}
#endif
void wstack(
int num_w_planes,
long num_points,
long freq_per_chan,
long polarizations,
double* uu,
double* vv,
double* ww,
float* vis_real,
float* vis_img,
float* weight,
double dx,
double dw,
int w_support,
int grid_size_x,
int grid_size_y,
double* grid,
int num_threads)
{
long i;
long index;
long visindex;
// initialize the convolution kernel
// gaussian:
int KernelLen = (w_support-1)/2;
double std = 1.0;
double std22 = 1.0/(2.0*std*std);
double norm = std22/PI;
// Loop over visibilities.
// Switch between CUDA and GPU versions
#ifdef __CUDACC__
// Define the CUDA set up
int Nth = NTHREADS;
int Nbl = num_points/Nth + 1;
if(NWORKERS == 1) {Nbl = 1; Nth = 1;};
long Nvis = num_points*freq_per_chan*polarizations;
printf("Running on GPU with %d threads and %d blocks\n",Nth,Nbl);
// Create GPU arrays and offload them
double * uu_g;
double * vv_g;
double * ww_g;
float * vis_real_g;
float * vis_img_g;
float * weight_g;
double * grid_g;
//for (int i=0; i<100000; i++)grid[i]=23.0;
cudaError_t mmm;
mmm=cudaMalloc(&uu_g,num_points*sizeof(double));
mmm=cudaMalloc(&vv_g,num_points*sizeof(double));
mmm=cudaMalloc(&ww_g,num_points*sizeof(double));
mmm=cudaMalloc(&vis_real_g,Nvis*sizeof(float));
mmm=cudaMalloc(&vis_img_g,Nvis*sizeof(float));
mmm=cudaMalloc(&weight_g,(Nvis/freq_per_chan)*sizeof(float));
mmm=cudaMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
mmm=cudaMemset(grid_g,0.0,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
mmm=cudaMemcpy(uu_g, uu, num_points*sizeof(double), cudaMemcpyHostToDevice);
mmm=cudaMemcpy(vv_g, vv, num_points*sizeof(double), cudaMemcpyHostToDevice);
mmm=cudaMemcpy(ww_g, ww, num_points*sizeof(double), cudaMemcpyHostToDevice);
mmm=cudaMemcpy(vis_real_g, vis_real, Nvis*sizeof(float), cudaMemcpyHostToDevice);
mmm=cudaMemcpy(vis_img_g, vis_img, Nvis*sizeof(float), cudaMemcpyHostToDevice);
mmm=cudaMemcpy(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), cudaMemcpyHostToDevice);
// Call main GPU Kernel
convolve_g <<<Nbl,Nth>>> (
num_w_planes,
num_points,
freq_per_chan,
polarizations,
uu_g,
vv_g,
ww_g,
vis_real_g,
vis_img_g,
weight_g,
dx,
dw,
KernelLen,
grid_size_x,
grid_size_y,
grid_g,
std22);
mmm = cudaMemcpy(grid, grid_g, 2*num_w_planes*grid_size_x*grid_size_y*sizeof(double), cudaMemcpyDeviceToHost);
//for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
printf("CUDA ERROR %s\n",cudaGetErrorString(mmm));
mmm=cudaFree(uu_g);
mmm=cudaFree(vv_g);
mmm=cudaFree(ww_g);
mmm=cudaFree(vis_real_g);
mmm=cudaFree(vis_img_g);
mmm=cudaFree(weight_g);
mmm=cudaFree(grid_g);
// Switch between CUDA and GPU versions
# else
#ifdef _OPENMP
omp_set_num_threads(num_threads);
#endif
#pragma omp parallel for private(visindex)
for (i = 0; i < num_points; i++)
{
#ifdef _OPENMP
//int tid;
//tid = omp_get_thread_num();
//printf("%d\n",tid);
#endif
visindex = i*freq_per_chan*polarizations;
double sum = 0.0;
int j, k;
//if (i%1000 == 0)printf("%ld\n",i);
/* Convert UV coordinates to grid coordinates. */
double pos_u = uu[i] / dx;
double pos_v = vv[i] / dx;
double ww_i = ww[i] / dw;
int grid_w = (int)ww_i;
int grid_u = (int)pos_u;
int grid_v = (int)pos_v;
// check the boundaries
long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
//printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax);
// Convolve this point onto the grid.
for (k = kmin; k <= kmax; k++)
{
double v_dist = (double)k+0.5 - pos_v;
for (j = jmin; j <= jmax; j++)
{
double u_dist = (double)j+0.5 - pos_u;
long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
// Loops over frequencies and polarizations
double add_term_real = 0.0;
double add_term_img = 0.0;
long ifine = visindex;
// DAV: the following two loops are performend by each thread separately: no problems of race conditions
for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
{
long iweight = visindex/freq_per_chan;
for (long ipol=0; ipol<polarizations; ipol++)
{
if (!isnan(vis_real[ifine]))
{
//printf("%f %ld\n",weight[iweight],iweight);
add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
//if(vis_img[ifine]>1e10 || vis_img[ifine]<-1e10)printf("%f %f %f %f %ld %ld\n",vis_real[ifine],vis_img[ifine],weight[iweight],conv_weight,ifine,num_points*freq_per_chan*polarizations);
}
ifine++;
iweight++;
}
}
// DAV: this is the critical call in terms of correctness of the results and of performance
#pragma omp atomic
grid[iKer] += add_term_real;
#pragma omp atomic
grid[iKer+1] += add_term_img;
}
}
}
// End switch between CUDA and CPU versions
#endif
//for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
}
int test(int nnn)
{
int mmm;
mmm = nnn+1;
return mmm;
}
#ifndef W_PROJECT_H_
#define W_PROJECT_H_
#define PI 3.14159265359
#define REAL_TYPE double
#ifdef __CUDACC__
extern "C"
#endif
void wstack(
int,
long,
long,
long,
double*,
double*,
double*,
float*,
float*,
float*,
double,
double,
int,
int,
int,
double*,
int);
#ifdef __CUDACC__
extern "C"
#endif
int test(int nnn);
#endif
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment