diff --git a/Build/Makefile.leo b/Build/Makefile.leo index 5e9aaab67d7c1fe678075a19f37669894b150121..ce350a18f3de19fa7b3f91c9238bfd7d6f0a3d1a 100644 --- a/Build/Makefile.leo +++ b/Build/Makefile.leo @@ -9,8 +9,8 @@ OPT_PURE_MPI = -O4 -march=native -mavx -mavx2 OMP_GPU = -mp=multicore,gpu -gpu=cuda11.8 -gpu=cc80 -CUDA_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/cuda/11.8/include -CUDA_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/cuda/11.8/lib64 -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/cuda/11.8/targets/x86_64-linux/lib/stubs +CUDA_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/include +CUDA_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/lib64 -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/targets/x86_64-linux/lib/stubs FFTW_INCL= FFTW_LIB= @@ -19,18 +19,18 @@ FFTW_LIB= ########################################################## #NVIDIA CUFFTMP -CUFFTMP_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/math_libs/11.8/lib64 -CUFFTMP_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/math_libs/11.8/include/cufftmp +CUFFTMP_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/lib64 +CUFFTMP_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/include/cufftmp ########################################################## -NVSHMEM_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/comm_libs/11.8/nvshmem/include -NVSHMEM_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/comm_libs/11.8/nvshmem/lib +NVSHMEM_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/include/ +NVSHMEM_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/lib/ ########################################################## #NVIDIA NCCL REDUCE -NCCL_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/comm_libs/11.8/nccl/include -NCCL_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/comm_libs/11.8/nccl/lib +NCCL_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nccl/include +NCCL_LIB = 
-L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nccl/lib ########################################################## NVC = nvc @@ -44,7 +44,7 @@ NVLIB_2 = $(CUDA_INC) $(CUDA_LIB) $(MPI_INC) $(MPI_LIB) $(CUFFTMP_INC) $(CUFFTMP NVLIB_3 = $(CUDA_INC) $(CUDA_LIB) $(MPI_INC) $(MPI_LIB) $(NCCL_INC) $(NCCL_LIB) -lcudart -lnccl -NVCC = /opt/nvidia/hpc_sdk/Linux_x86_64/23.1/cuda/11.8/bin/nvcc +NVCC = nvcc OPT_NVCC = -std=c++17 --generate-code arch=compute_80,code=sm_80 CFLAGS += diff --git a/Makefile b/Makefile index f6c49b1c92ec6b10b98f6fa72069319815bf6c6c..50b8a23224aa403909a35bd58d044f26010b5cbc 100644 --- a/Makefile +++ b/Makefile @@ -35,10 +35,10 @@ FFTWLIBS = OPT += -DUSE_FFTW # use omp-ized version of fftw routines -OPT += -DHYBRID_FFTW +#OPT += -DHYBRID_FFTW # switch on the OpenMP parallelization -OPT += -DUSE_OMP +#OPT += -DUSE_OMP # write the full 3D cube of gridded visibilities and its FFT transform #OPT += -DWRITE_DATA @@ -53,7 +53,7 @@ OPT += -DPHASE_ON #OPT += -DFITSIO # Perform true parallel images writing -#OPT += -DPARALLELIO +OPT += -DPARALLELIO # Normalize uvw in case it is not done in the binMS #OPT += -DNORMALIZE_UVW @@ -74,13 +74,13 @@ OPT += -DGAUSS #OPT += -DNVIDIA #use cuda for GPUs -#OPT += -DCUDACC +OPT += -DCUDACC # use GPU acceleration via OMP #OPT += -DACCOMP # perform stacking on GPUs -#OPT += -DGPU_STACKING +OPT += -DGPU_STACKING # use NVIDIA GPU to perform the reduce #OPT += -DNCCL_REDUCE @@ -89,7 +89,7 @@ OPT += -DGAUSS #OPT += -DRCCL_REDUCE # use GPU to perform FFT -#OPT += -DCUFFTMP +OPT += -DCUFFTMP #support for AMD GPUs #OPT += __HIP_PLATFORM_AMD__ diff --git a/README.md b/README.md index 6f3ed770a30573b8176025bd1d41457bb1be937c..6f33ddf542c284b21fb71b52a2e1f69731977cf4 100644 --- a/README.md +++ b/README.md @@ -1,54 +1,131 @@ -# HPC_Imaging +# Radio Imaging Code Kernels -To compile the code, feel free to activate and deactivate options in the Makefile. -You will find the code options before and then the acceleration options. +Radio Imaging Code Kernels (RICK) is a software that is able to exploit parallelism and accelerators for radio astronomy imaging. **This software is currently under development**, feel free to contact us for issues, suggestions, information.
-You can simply run the code with the command:
+RICK is written in C/C++ and can perform the following routines:
+- gridding
+- Fast Fourier Transform (FFT)
+- w-correction
-############################################
+It exploits the Message Passing Interface (MPI) and OpenMP for parallelism, and can run on both NVIDIA and AMD GPUs using CUDA, HIP, and OpenMP for GPU offloading.
-make w-stacking
+## Why and where to use RICK?
-############################################
+Although still under development, RICK can already be used to test its performance on your own data. It has been tested on data from several radio interferometers:
+- Low Frequency Array (LOFAR)
+- Jansky Very Large Array (JVLA)
+- Atacama Large Millimeter Array (ALMA)
-It will redirect you to the file Build/Makefile.local, which is complete enough
-apart from different library paths, feel free to use it or to change SYSTYPE.
-My aim was to make compilation as simple as possible.
+
-When you use GPU offloading with OpenMP, please do not compile the CPU part with NVC.
-This can be easily fixed by setting the environment variable:
+
+## What can RICK *not* do?
+
+The following features have not yet been implemented in RICK:
+
+- the Stokes parameters cannot be separated
+- the flux scale is not physical
+- the weighting step is currently under development
+
+
+
+## Preparing the dataset
+
+To be digested by RICK, the input Measurement Set must first be converted to binary. A Python script named `create_binMS.py` is provided in the `/scripts` directory for this purpose. The supported input format is the Measurement Set V2.0 standard. Since the script needs to read some columns of the input data, we recommend using a Singularity image to handle the `casacore` dependencies; you can find one [here](https://lofar-webdav.grid.sara.nl/software/shub_mirror/tikk3r/lofar-grid-hpccloud/lofar_sksp_v3.5_x86-64_generic_noavx512_ddf.sif?action=show). The script has to be edited to point to your input Measurement Set.
+
+## How to compile the code
+
+The latest version of the code is in the *merge* branch. To compile it, you first need to specify the required configuration for the RICK execution in the Makefile.
+Here is a brief explanation of each macro contained in the Makefile (a reference combination is sketched right after the list).
+- USE_FFTW: enables the usage of the FFTW library with pure MPI
+- HYBRID_FFTW: enables the hybrid MPI+OpenMP FFT implementation using the FFTW library
+- USE_OMP: switches on the OpenMP parallelization
+- WRITE_DATA: writes the full 3D cube of gridded visibilities and its FFT transform (impacts performance, use only for debugging or tests)
+- WRITE_IMAGE: writes the final images as output
+- PHASE_ON: performs the w-stacking phase correction
+- FITSIO: enables the CFITSIO library to write outputs in FITS format
+- PARALLELIO: enables parallel writing of the final images (will be deprecated)
+- NORMALIZE_UVW: normalizes $u$, $v$, and $w$ in case this is not done in the binMS
+- GAUSS: selects a Gaussian gridding kernel
+- GAUSS_HI_PRECISION: selects a high-precision Gaussian gridding kernel
+- KAISER_BESSEL: selects a Kaiser-Bessel gridding kernel
+- VERBOSE: enables verbose output (impacts performance, only for debugging or tests)
+- NVIDIA: NVIDIA macro, required for NVIDIA-specific calls
+- CUDACC: enables CUDA for GPU offloading
+- ACCOMP: enables OpenMP for GPU offloading
+- GPU_STACKING: enables the w-stacking step on GPUs. This can be required because, at the moment, RICK cannot perform both the reduce and the w-stacking on GPUs
+- NCCL_REDUCE: uses the NCCL implementation of the reduce on NVIDIA GPUs
+- RCCL_REDUCE: uses the RCCL accelerated implementation of the reduce on AMD GPUs
+- CUFFTMP: enables the distributed FFT on GPUs
+- DEBUG: heavily impacts performance, to be used only for development
-export OMPI_CC = gcc
+**Be careful: some macros are obviously not compatible with each other. More detailed instructions on choosing the best combination of macros for your usage or architecture will be provided.**
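+As a concrete reference (one possible choice, not the only valid one), the GPU-oriented combination selected by the Makefile in this commit boils down to the following active lines (further macros such as WRITE_IMAGE can be enabled on top of it):
+```
+# FFTW on the host, w-stacking phase correction, Gaussian kernel
+OPT += -DUSE_FFTW
+OPT += -DPHASE_ON
+OPT += -DGAUSS
+# parallel writing of the final images
+OPT += -DPARALLELIO
+# CUDA offloading, stacking on the GPUs, distributed GPU FFT via cuFFTMp
+OPT += -DCUDACC
+OPT += -DGPU_STACKING
+OPT += -DCUFFTMP
+```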
-###########################################
+Once you have selected the desired macros, you need to choose the architecture on which you want to run the code. The directory `Build/` contains several configuration files adapted to different machines. To select your architecture, define the following environment variable:
+```
+> export SYSTYPE=architecture_name
+```
+
+The latest version of RICK has been tested on the following architectures:
+- Marconi 100 (CINECA) (`M100`)
+- Leonardo (CINECA) (`leo`)
+- MacOSX (`Macosx`)
+
+If your machine is not on the list, you can define `SYSTYPE=local` and modify that configuration according to your needs, or create an ad-hoc one for your system. An example environment setup for Leonardo is sketched below.
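+For example, on Leonardo the setup might look as follows (the module names are taken from `scripts/bench.go_cufftmp`; exact module versions depend on the site software stack, and `SYSTYPE=leo` is assumed to select `Build/Makefile.leo`):
+```
+> module load openmpi fftw nvhpc/23.5 cuda
+> export SYSTYPE=leo
+```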
+
+When you use GPU offloading with OpenMP, please **do not compile the CPU part with NVC**.
+This can be easily fixed by setting the environment variable:
+```
+> export OMPI_CC=gcc
+```
This applies in case the default compiler is NVC. The Makefile knows which parts have to be
compiled with NVC for the OpenMP offloading; the final linker in this case will nevertheless be NVC/NVC++.
-The problem does not raise on AMD platforms, because you use clang/clang++ for both CPUs
-and GPUs
+The problem does not arise on AMD platforms, because clang/clang++ is used for both CPUs and GPUs.
-The extensions of the executable will be changed depending on the different acceleration
-options.
+At this point you can compile the code with the command:
+```
+> make w-stacking
+```
-To run the code, the data/paramfile.txt is available. Feel free to change the paramers,
-i.e. the path of visibilities, which reduce implementation to use, the number of pixels,
-the number of OpenMP threads and so on.
+An executable will be created whose name depends on the implementations requested at compile time: its extension changes according to the selected acceleration options.
+
+## How to execute the code
-Once you have compiled the code, run it simply with the command:
+To run the code, the parameter file `data/paramfile.txt` is available. Feel free to change the parameters according to your configuration and your desired output. Here we briefly list some of them:
+- Datapath: path of the input Measurement Set
+- num_threads: number of OpenMP threads
+- reduce_method: selects the type of reduce used for the execution. 0 corresponds to the MPI Reduce, 1 corresponds to the Reduce Ring
+- grid_size_x, grid_size_y: size of the output image
+- num_w_planes: number of w-planes for the w-stacking
+- fftfile2, fftfile3: names of the output `.bin` real and imaginary files
+- logfile: name of the log file
-###########################################
-mpirun -np [n] [executable] data/paramfile.txt
+If the code has been compiled without the -fopenmp or -D_OPENMP options, it is forced to use the standard MPI_Reduce implementation, since our reduce works only with OpenMP.
-###########################################
+To use **cuFFTMp** with **nvhpc 23.5** you have to add the following paths to the `LD_LIBRARY_PATH` environment variable:
+```
+> export LD_LIBRARY_PATH="/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/lib/:$LD_LIBRARY_PATH"
-In the case in which the code has been compiled without either -fopenmp or -D_OPENMP options,
-the code is forced to use the standard MPI_Reduce implementation, since our reduce works
-only with OpenMP.
+> export LD_LIBRARY_PATH="/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/lib64/:$LD_LIBRARY_PATH" +``` +When CPU hybrid MPI+OpenMP parallelism is requested, you have to select the number of threads by setting **--cpus-per-task=** in your bash script, then add the following lines: +``` +> export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK} +> export OMP_PLACES=cores +> export OMP_PROC_BIND=close +``` +Once you have compiled the code, run it simply with the command: +``` +> mpirun -np [n] [executable] data/paramfile.txt +``` +Then, to run the code in order to fully fill all the available cores in the node please add **--ntasks-per-socket=** in your bash script and then run: +``` +> mpirun -np [n] --bind-to core --map-by ppr:${SLURM_NTASKS_PER_SOCKET}:socket:pe=${SLURM_CPUS_PER_TASK} -x OMP_NUM_THREADS [executable] data/paramfile.txt +``` +## Contacts +For feedbacks, suggestions, and requests [contact us](mailto:emanuele.derubeis2@unibo.it)! diff --git a/scripts/bench.go_cufftmp b/scripts/bench.go_cufftmp new file mode 100644 index 0000000000000000000000000000000000000000..31f926323a527fcf98d8082c85dc621ce45fd0eb --- /dev/null +++ b/scripts/bench.go_cufftmp @@ -0,0 +1,111 @@ +#!/bin/bash +#SBATCH -A IscrC_RICK +#SBATCH -p boost_usr_prod +##SBATCH --qos boost_qos_bprod +#SBATCH -J RICK +### number of nodes +#SBATCH -N 1 +### number of MPI tasks per node +#SBATCH --ntasks-per-node=4 +#SBATCH -n 4 +### number of openmp threads +#SBATCH --cpus-per-task=8 +### number of allocated GPUs per node +#SBATCH --gpus-per-node=4 +#SBATCH --mem=450G +#SBATCH -o test.out +#SBATCH -e test.err +#SBATCH -t 03:00:00 + + +module load openmpi/ +module load fftw/ +module load nvhpc/23.5 +module load cuda/ + +export LD_LIBRARY_PATH="/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/lib/:$LD_LIBRARY_PATH" + +export LD_LIBRARY_PATH="/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/lib64/:$LD_LIBRARY_PATH" + +export OMPI_CC=gcc +export OMPI_CXX=g++ + +export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK} +export OMP_PLACES=cores + +cd ../ +make -j1 clean +rm -f w-stacking_fftw_acc-omp_acc-fft +make -j1 w-stacking + +export typestring=omp_gpu_cufftmp +export exe=w-stacking_fftw_acc-omp_acc-fft + +OUT_SHM=result_${SLURM_NTASKS}_${typestring}_${SLURM_CPUS_PER_TASK} +OUT_SHM_RES=/leonardo_scratch/large/userexternal/glacopo0/hpc_imaging/scripts/Tests/times_${SLURM_NTASKS}_${typestring}_${SLURM_CPUS_PER_TASK}_large + +rm -f ${OUT_SHM} ${OUT_SHM_RES} + +export logdir=mpi_${SLURM_NTASKS}_${typestring}_${SLURM_CPUS_PER_TASK} +echo "Creating $logdir" +rm -fr $logdir +mkdir $logdir + +for itest in {1..3} +do + export logfile=test_${itest}_${logdir}.log + echo "time mpirun -np ${SLURM_NTASKS} --bind-to core --map-by ppr:${SLURM_NTASKS_PER_NODE}:node:pe=${SLURM_CPUS_PER_TASK} -x OMP_NUM_THREADS data/paramfile.txt" > $logfile + time mpirun -np ${SLURM_NTASKS} --bind-to core --map-by ppr:${SLURM_NTASKS_PER_NODE}:node:pe=${SLURM_CPUS_PER_TASK} -x OMP_NUM_THREADS --mca btl self,vader /leonardo_scratch/large/userexternal/glacopo0/hpc_imaging/${exe} data/paramfile.txt >> $logfile + mv $logfile $logdir + mv timings.dat ${logdir}/timings_${itest}.dat + cat ${logdir}/timings_all.dat ${logdir}/timings_${itest}.dat >> ${logdir}/timings_all.dat + + 
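+  # The greps below parse the per-phase timings that RICK prints in its log file
+  # (reduce, cuFFTMp, array composition, image writing, total, plus setup/kernel/phase)
+  # and collect one row per repetition into ${OUT_SHM}; averages and standard
+  # deviations over the three runs are computed after the loop.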
Reduce_time=$( grep -w 'Reduce time :' $logdir/$logfile | gawk '{print $4}' ) + FFTW_time=$( grep -w 'cufftMP time :' $logdir/$logfile | gawk '{print $4}' ) + Composition_time=$( grep -w 'Array Composition time :' $logdir/$logfile | gawk '{print $5}' ) + Writing_time=$( grep -w ' Image writing time :' $logdir/$logfile | gawk '{print $5}' ) + Total_time=$( grep -w 'TOT time :' $logdir/$logfile | gawk '{print $4}' ) + + #Not relevant for the paper + Setup_time=$( grep -w 'Setup time:' $logdir/$logfile | gawk '{print $3}' ) + Kernel_time=$( grep -w 'Kernel time :' $logdir/$logfile | gawk '{print $4}' ) + Phase_time=$( grep -w 'Phase time :' $logdir/$logfile | gawk '{print $4}' ) + ########################## + + + echo $itest $Reduce_time $FFTW_time $Composition_time $Writing_time $Total_time $Setup_time $Kernel_time $Phase_time >> ${OUT_SHM} +done + +echo -e "\n\n" >> ${OUT_SHM} +avg_red=$( awk '{sum+=$2} END { print sum/3 }' ${OUT_SHM} ) +avg_fftw=$( awk '{sum+=$3} END { print sum/3 }' ${OUT_SHM} ) +avg_comp=$( awk '{sum+=$4} END { print sum/3 }' ${OUT_SHM} ) +avg_write=$( awk '{sum+=$5} END { print sum/3 }' ${OUT_SHM} ) +avg_tot=$( awk '{sum+=$6} END { print sum/3 }' ${OUT_SHM} ) + +std_red=$( awk '{if($2!=""){count++;sum+=$2};y+=$2^2} END{sq=sqrt(y/3-(sum/3)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} ) +std_fftw=$( awk '{if($3!=""){count++;sum+=$3};y+=$3^2} END{sq=sqrt(y/3-(sum/3)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} ) +std_comp=$( awk '{if($4!=""){count++;sum+=$4};y+=$4^2} END{sq=sqrt(y/3-(sum/3)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} ) +std_write=$( awk '{if($5!=""){count++;sum+=$5};y+=$5^2} END{sq=sqrt(y/3-(sum/3)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} ) +std_tot=$( awk '{if($6!=""){count++;sum+=$6};y+=$6^2} END{sq=sqrt(y/3-(sum/3)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} ) + + +#Not relevant for the paper +avg_setup=$( awk '{sum+=$7} END { print sum/3 }' ${OUT_SHM} ) +avg_ker=$( awk '{sum+=$8} END { print sum/3 }' ${OUT_SHM} ) +avg_phase=$( awk '{sum+=$9} END { print sum/3 }' ${OUT_SHM} ) + +std_setup=$( awk '{if($7!=""){count++;sum+=$7};y+=$7^2} END{sq=sqrt(y/3-(sum/3)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} ) +std_ker=$( awk '{if($8!=""){count++;sum+=$8};y+=$8^2} END{sq=sqrt(y/3-(sum/3)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} ) +std_phase=$( awk '{if($9!=""){count++;sum+=$9};y+=$9^2} END{sq=sqrt(y/3-(sum/3)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} ) +########################## + + +echo "Averages and standard deviations over 3 shots" >> ${OUT_SHM_RES} +echo -e "\n" ${OUT_SHM_RES} +echo "${SLURM_NTASKS} MPI tasks; ${SLURM_CPUS_PER_TASK} OpenMP threads per task; ${SLURM_GPUS_PER_NODE} GPUs per node;" >> ${OUT_SHM_RES} +echo -e "\n\n" ${OUT_SHM_RES} +echo $avg_red $std_red $avg_fftw $std_fftw $avg_comp $std_comp $avg_write $std_write $avg_tot $std_tot >> ${OUT_SHM_RES} +echo -e "\n\n" ${OUT_SHM_RES} +echo $avg_setup $std_setup $avg_ker $std_ker $avg_phase $std_phase >> ${OUT_SHM_RES} +rm -f ${OUT_SHM} diff --git a/scripts/create_binMS.py b/scripts/create_binMS.py new file mode 100644 index 0000000000000000000000000000000000000000..f5c2937dc3f1a31dd0d317c86ac365a74898d8bd --- /dev/null +++ b/scripts/create_binMS.py @@ -0,0 +1,218 @@ +USE_MPI = 0 + +import numpy as np +import casacore.tables as pt +import time +import sys +import os + +#outpath = '/data/gridding/data/shortgauss_t201806301100_SBH255.binMS/' +print(sys.argv[1]) +outpath = "/data/gridding/data/Lofarbig/"+sys.argv[1]+".binMS/" +os.mkdir(outpath) + + +ufile = 'ucoord.bin' +vfile = 'vcoord.bin' +wfile = 'wcoord.bin' +weights = 'weights.bin' +visrealfile = 
'visibilities_real.bin' +visimgfile = 'visibilities_img.bin' +metafile = 'meta.txt' + +offset = 0.0 + +if USE_MPI == 1: + from mpi4py import MPI + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + size = comm.Get_size() + print(rank,size) +else: + comm = 0 + rank = 0 + size = 1 + +num_threads = 1 + +# input MS +readtime0 = time.time() +#msfile = "/data/Lofar-data/results/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.ms" +msfile = "/data/Lofar-Luca/results/"+sys.argv[1]+".ms/" +ms = pt.table(msfile, readonly=True, ack=False) + +if rank == 0: + print("Reading ", msfile) + print("Writing ", outpath) +# load data and metadata + with pt.table(msfile + '::SPECTRAL_WINDOW', ack=False) as freqtab: + freq = freqtab.getcol('REF_FREQUENCY')[0] / 1000000.0 + freqpersample = np.mean(freqtab.getcol('RESOLUTION')) + timepersample = ms.getcell('INTERVAL',0) + + print("Frequencies (MHz) : ",freq) + print("Time interval (sec) : ",timepersample) + + with pt.taql("SELECT ANTENNA1,ANTENNA2,sqrt(sumsqr(UVW)),GCOUNT() FROM $ms GROUPBY ANTENNA1,ANTENNA2") as BL: + ants1, ants2 = BL.getcol('ANTENNA1'), BL.getcol('ANTENNA2') + Ntime = BL.getcol('Col_4')[0] # number of timesteps + Nbaselines = len(ants1) + + print("Number of timesteps : ",Ntime) + print("Total obs time (hrs): ",timepersample*Ntime/3600) + print("Number of baselines : ",Nbaselines) + +#sp = pt.table(msfile+'::LOFAR_ANTENNA_FIELD', readonly=True, ack=False, memorytable=True).getcol('POSITION') + + ant1, ant2 = ms.getcol('ANTENNA1'), ms.getcol('ANTENNA2') + + number_of_measures = Ntime * Nbaselines + #nm_pe_aux = int(number_of_measures / size) + #remaining_aux = number_of_measures % size + nm_pe = np.array(0) + nm_pe = int(number_of_measures / size) + remaining = np.array(0) + remaining = number_of_measures % size + print(nm_pe,remaining) + +else: + nm_pe = None + remaining = None + +if USE_MPI == 1: + nm_pe = comm.bcast(nm_pe, root=0) + remaining = comm.bcast(remaining, root=0) + +# set the data domain for each MPI rank +startrow = rank*nm_pe + +if rank == size-1: + nm_pe = nm_pe+remaining +print(rank,nm_pe,remaining) + +nrow = nm_pe + +# read data +uvw = ms.getcol('UVW',startrow,nrow) +vis = ms.getcol('DATA',startrow,nrow) +weight = ms.getcol('WEIGHT_SPECTRUM',startrow,nrow) +print("Freqs per channel : ",vis.shape[1]) +print("Polarizations : ",vis.shape[2]) +print("Number of observations : ",uvw.shape[0]) +print("Data size (MB) : ",uvw.shape[0]*vis.shape[1]*vis.shape[2]*2*4/1024.0/1024.0) + +# set parameters +num_points = uvw.shape[0] +num_w_planes = 1 +grid_size = 100 # number of cells of the grid + +# serialize arrays +vis_ser_real = vis.real.flatten() +vis_ser_img = vis.imag.flatten() +print("data types: uvw = ",uvw.dtype," vis = ",vis_ser_real.dtype) +#vis_ser = np.zeros(2*vis_ser_real.size) +#for i in range(vis_ser_real.size): +# vis_ser[2*i]=vis_ser_real[i] +# vis_ser[2*i+1]=vis_ser_img[i] + +uu_ser = uvw[:,0].flatten() +vv_ser = uvw[:,1].flatten() +ww_ser = uvw[:,2].flatten() +weight_ser = weight.flatten() +grid = np.zeros(2*num_w_planes*grid_size*grid_size) # complex! +gridtot = np.zeros(2*num_w_planes*grid_size*grid_size) # complex! 
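+# grid and gridtot hold the gridded visibilities as interleaved real/imaginary
+# pairs (hence the factor 2 in their size), for num_w_planes planes of
+# grid_size x grid_size cells each.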
+peanokeys = np.empty(vis_ser_real.size,dtype=np.uint64) +gsize = grid.size + +hist, bin_edges = np.histogram(ww_ser,num_w_planes) +print(hist) + +print(vis_ser_real.dtype) + +# normalize uv +minu = np.amin(uu_ser) +maxu = np.amax(abs(uu_ser)) +minv = np.amin(vv_ser) +maxv = np.amax(abs(vv_ser)) +minw = np.amin(ww_ser) +maxw = np.amax(ww_ser) + +if USE_MPI == 1: + maxu_all = np.array(0,dtype=np.float) + maxv_all = np.array(0,dtype=np.float) + maxw_all = np.array(0,dtype=np.float) + minu_all = np.array(0,dtype=np.float) + minv_all = np.array(0,dtype=np.float) + minw_all = np.array(0,dtype=np.float) + comm.Allreduce(maxu, maxu_all, op=MPI.MAX) + comm.Allreduce(maxv, maxv_all, op=MPI.MAX) + comm.Allreduce(maxw, maxw_all, op=MPI.MAX) + comm.Allreduce(minu, minu_all, op=MPI.MIN) + comm.Allreduce(minv, minv_all, op=MPI.MIN) + comm.Allreduce(minw, minw_all, op=MPI.MIN) + + ming = min(minu_all,minv_all) + maxg = max(maxu_all,maxv_all) + minw = minw_all + maxw = maxw_all + ming = ming-offset*ming + maxg = maxg+offset*maxg + minw = minw + maxw = maxw +else: + ming = min(minu,minv) + maxg = max(maxu,maxv) + ming = ming-offset*ming + maxg = maxg+offset*maxg + minw = minw + maxw = maxw + +print(maxu,maxv,maxg) +#uu_ser = (uu_ser-ming)/(maxg-ming) +#vv_ser = (vv_ser-ming)/(maxg-ming) +uu_ser = (uu_ser+maxg)/(2*maxg) +vv_ser = (vv_ser+maxg)/(2*maxg) +ww_ser = (ww_ser-minw)/(maxw-minw) +#print(uu_ser.shape, vv_ser.dtype, ww_ser.dtype, vis_ser_real.shape, vis_ser_img.dtype, weight_ser.dtype, grid.dtype) +print(np.amin(uu_ser),np.amax(uu_ser)) +print(np.amin(vv_ser),np.amax(vv_ser)) +print(np.amin(ww_ser),np.amax(ww_ser)) + +# set normalized uvw - mesh conversion factors +dx = 1.0/grid_size +dw = 1.0/num_w_planes + +readtime1 = time.time() + +if rank == 0: + outfile = outpath+ufile + uu_ser.tofile(outfile,sep='') + outfile = outpath+vfile + vv_ser.tofile(outfile,sep='') + outfile = outpath+wfile + ww_ser.tofile(outfile,sep='') + outfile = outpath+weights + weight_ser.tofile(outfile,sep='') + outfile = outpath+weights + weight_ser.tofile(outfile,sep='') + outfile = outpath+visrealfile + vis_ser_real.tofile(outfile,sep='') + outfile = outpath+visimgfile + vis_ser_img.tofile(outfile,sep='') + outfile = outpath+metafile + f = open(outfile, 'w') + f.writelines(str(uu_ser.size)+"\n") + f.writelines(str(vis_ser_real.size)+"\n") + f.writelines(str(vis.shape[1])+"\n") + f.writelines(str(vis.shape[2])+"\n") + f.writelines(str(Ntime)+"\n") + f.writelines(str(timepersample)+"\n") + f.writelines(str(timepersample*Ntime/3600)+"\n") + f.writelines(str(Nbaselines)+"\n") + f.writelines(str(ming)+"\n") + f.writelines(str(maxg)+"\n") + f.writelines(str(minw)+"\n") + f.writelines(str(maxw)+"\n") + f.close() + +
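+
+# Usage sketch (the MS name is passed as the only command-line argument; the
+# hard-coded input/output paths above must be adapted first), e.g. from within
+# the Singularity image suggested in the README to satisfy the casacore dependency:
+#
+#   singularity exec <casacore_image>.sif python create_binMS.py <ms_name>
+#
+# The resulting <ms_name>.binMS/ directory contains ucoord.bin, vcoord.bin,
+# wcoord.bin, weights.bin, visibilities_real.bin, visibilities_img.bin and
+# meta.txt; this is the binary dataset that the Datapath parameter in
+# data/paramfile.txt should point to.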