From e658f6bb07ecd910ca1f62f607e5fbe73e1f0453 Mon Sep 17 00:00:00 2001 From: Emanuele De Rubeis Date: Fri, 22 Sep 2023 19:20:58 +0200 Subject: [PATCH] NVHPC 23.5 updates, cufftMp working, correct WEIGHTS from MS --- Build/Makefile.leo | 18 ++-- Makefile | 14 +-- README.md | 7 ++ scripts/bench.go_cufftmp | 111 ++++++++++++++++++++ scripts/create_binMS.py | 218 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 352 insertions(+), 16 deletions(-) create mode 100644 scripts/bench.go_cufftmp create mode 100644 scripts/create_binMS.py diff --git a/Build/Makefile.leo b/Build/Makefile.leo index 5e9aaab..ce350a1 100644 --- a/Build/Makefile.leo +++ b/Build/Makefile.leo @@ -9,8 +9,8 @@ OPT_PURE_MPI = -O4 -march=native -mavx -mavx2 OMP_GPU = -mp=multicore,gpu -gpu=cuda11.8 -gpu=cc80 -CUDA_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/cuda/11.8/include -CUDA_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/cuda/11.8/lib64 -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/cuda/11.8/targets/x86_64-linux/lib/stubs +CUDA_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/include +CUDA_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/lib64 -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/targets/x86_64-linux/lib/stubs FFTW_INCL= FFTW_LIB= @@ -19,18 +19,18 @@ FFTW_LIB= ########################################################## #NVIDIA CUFFTMP -CUFFTMP_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/math_libs/11.8/lib64 -CUFFTMP_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/math_libs/11.8/include/cufftmp +CUFFTMP_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/lib64 +CUFFTMP_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/include/cufftmp ########################################################## -NVSHMEM_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/comm_libs/11.8/nvshmem/include -NVSHMEM_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/comm_libs/11.8/nvshmem/lib +NVSHMEM_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/include/ +NVSHMEM_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/lib/ ########################################################## #NVIDIA NCCL REDUCE -NCCL_INC = 
-I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/comm_libs/11.8/nccl/include -NCCL_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/comm_libs/11.8/nccl/lib +NCCL_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nccl/include +NCCL_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nccl/lib ########################################################## NVC = nvc @@ -44,7 +44,7 @@ NVLIB_2 = $(CUDA_INC) $(CUDA_LIB) $(MPI_INC) $(MPI_LIB) $(CUFFTMP_INC) $(CUFFTMP NVLIB_3 = $(CUDA_INC) $(CUDA_LIB) $(MPI_INC) $(MPI_LIB) $(NCCL_INC) $(NCCL_LIB) -lcudart -lnccl -NVCC = /opt/nvidia/hpc_sdk/Linux_x86_64/23.1/cuda/11.8/bin/nvcc +NVCC = nvcc OPT_NVCC = -std=c++17 --generate-code arch=compute_80,code=sm_80 CFLAGS += diff --git a/Makefile b/Makefile index fde90c6..cee8b8b 100644 --- a/Makefile +++ b/Makefile @@ -35,10 +35,10 @@ FFTWLIBS = OPT += -DUSE_FFTW # use omp-ized version of fftw routines -OPT += -DHYBRID_FFTW +#OPT += -DHYBRID_FFTW # switch on the OpenMP parallelization -OPT += -DUSE_OMP +#OPT += -DUSE_OMP # write the full 3D cube of gridded visibilities and its FFT transform #OPT += -DWRITE_DATA @@ -53,7 +53,7 @@ OPT += -DPHASE_ON #OPT += -DFITSIO # Perform true parallel images writing -#OPT += -DPARALLELIO +OPT += -DPARALLELIO # Normalize uvw in case it is not done in the binMS #OPT += -DNORMALIZE_UVW @@ -74,13 +74,13 @@ OPT += -DGAUSS_HI_PRECISION #OPT += -DNVIDIA #use cuda for GPUs -#OPT += -DCUDACC +OPT += -DCUDACC # use GPU acceleration via OMP -OPT += -DACCOMP +#OPT += -DACCOMP # perform stacking on GPUs -#OPT += -DGPU_STACKING +OPT += -DGPU_STACKING # use NVIDIA GPU to perform the reduce #OPT += -DNCCL_REDUCE @@ -89,7 +89,7 @@ OPT += -DACCOMP #OPT += -DRCCL_REDUCE # use GPU to perform FFT -#OPT += -DCUFFTMP +OPT += -DCUFFTMP #support for AMD GPUs #OPT += __HIP_PLATFORM_AMD__ diff --git a/README.md b/README.md index 6f3ed77..48c4431 100644 --- a/README.md +++ b/README.md @@ -50,5 +50,12 @@ In the case in which the code has been compiled without either -fopenmp or -D_OP the code is forced to use the standard MPI_Reduce implementation, since our reduce works only with OpenMP. 
+To use cufftMp with NVHPC 23.5, add the following paths to the environment variable `LD_LIBRARY_PATH`:
+###########################################
+
+export LD_LIBRARY_PATH="/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/lib/:$LD_LIBRARY_PATH"
+
+export LD_LIBRARY_PATH="/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/lib64/:$LD_LIBRARY_PATH"
+###########################################
\ No newline at end of file
diff --git a/scripts/bench.go_cufftmp b/scripts/bench.go_cufftmp
new file mode 100644
index 0000000..31f9263
--- /dev/null
+++ b/scripts/bench.go_cufftmp
@@ -0,0 +1,111 @@
+#!/bin/bash
+#SBATCH -A IscrC_RICK
+#SBATCH -p boost_usr_prod
+##SBATCH --qos boost_qos_bprod
+#SBATCH -J RICK
+### number of nodes
+#SBATCH -N 1
+### number of MPI tasks per node
+#SBATCH --ntasks-per-node=4
+#SBATCH -n 4
+### number of openmp threads
+#SBATCH --cpus-per-task=8
+### number of allocated GPUs per node
+#SBATCH --gpus-per-node=4
+#SBATCH --mem=450G
+#SBATCH -o test.out
+#SBATCH -e test.err
+#SBATCH -t 03:00:00
+
+
+module load openmpi/
+module load fftw/
+module load nvhpc/23.5
+module load cuda/
+
+export LD_LIBRARY_PATH="/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/lib/:$LD_LIBRARY_PATH"
+
+export LD_LIBRARY_PATH="/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/lib64/:$LD_LIBRARY_PATH"
+
+export OMPI_CC=gcc
+export OMPI_CXX=g++
+
+export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}
+export OMP_PLACES=cores
+
+cd ../
+make -j1 clean
+rm -f w-stacking_fftw_acc-omp_acc-fft
+make -j1 w-stacking
+
+export typestring=omp_gpu_cufftmp
+export exe=w-stacking_fftw_acc-omp_acc-fft
+
+OUT_SHM=result_${SLURM_NTASKS}_${typestring}_${SLURM_CPUS_PER_TASK}
+OUT_SHM_RES=/leonardo_scratch/large/userexternal/glacopo0/hpc_imaging/scripts/Tests/times_${SLURM_NTASKS}_${typestring}_${SLURM_CPUS_PER_TASK}_large
+
+rm -f ${OUT_SHM} ${OUT_SHM_RES}
+
+export logdir=mpi_${SLURM_NTASKS}_${typestring}_${SLURM_CPUS_PER_TASK}
+echo "Creating $logdir"
+rm -fr $logdir
+mkdir $logdir
+
+for itest in {1..3}
+do
+    export logfile=test_${itest}_${logdir}.log
+    echo "time mpirun -np ${SLURM_NTASKS} --bind-to core --map-by ppr:${SLURM_NTASKS_PER_NODE}:node:pe=${SLURM_CPUS_PER_TASK} -x OMP_NUM_THREADS --mca btl self,vader /leonardo_scratch/large/userexternal/glacopo0/hpc_imaging/${exe} data/paramfile.txt" > $logfile
+    time mpirun -np ${SLURM_NTASKS} --bind-to core --map-by ppr:${SLURM_NTASKS_PER_NODE}:node:pe=${SLURM_CPUS_PER_TASK} -x OMP_NUM_THREADS --mca btl self,vader /leonardo_scratch/large/userexternal/glacopo0/hpc_imaging/${exe} data/paramfile.txt >> $logfile
+    mv $logfile $logdir
+    mv timings.dat ${logdir}/timings_${itest}.dat
+    cat ${logdir}/timings_${itest}.dat >> ${logdir}/timings_all.dat
+
+    Reduce_time=$( grep -w 'Reduce time :' $logdir/$logfile | gawk '{print $4}' )
+    FFTW_time=$( grep -w 'cufftMP time :' $logdir/$logfile | gawk '{print $4}' )
+    Composition_time=$( grep -w 'Array Composition time :' $logdir/$logfile | gawk '{print $5}' )
+    Writing_time=$( grep -w ' Image writing time :' $logdir/$logfile | gawk '{print $5}' )
+    Total_time=$( grep -w 'TOT time :' $logdir/$logfile | gawk '{print $4}' )
+
+    #Not relevant for the paper
+    Setup_time=$( grep -w 'Setup time:' $logdir/$logfile | gawk '{print $3}' )
+    Kernel_time=$( grep -w 'Kernel time :' $logdir/$logfile | gawk '{print $4}' )
+    Phase_time=$( grep -w 'Phase time :' $logdir/$logfile | gawk '{print $4}' )
+    ##########################
+
+
+    echo $itest $Reduce_time $FFTW_time $Composition_time $Writing_time $Total_time $Setup_time $Kernel_time $Phase_time >> ${OUT_SHM}
+done
+
+echo -e "\n\n" >> ${OUT_SHM}
+avg_red=$( awk '{sum+=$2} END { print sum/3 }' ${OUT_SHM} )
+avg_fftw=$( awk '{sum+=$3} END { print sum/3 }' ${OUT_SHM} )
+avg_comp=$( awk '{sum+=$4} END { print sum/3 }' ${OUT_SHM} )
+avg_write=$( awk '{sum+=$5} END { print sum/3 }' ${OUT_SHM} )
+avg_tot=$( awk '{sum+=$6} END { print sum/3 }' ${OUT_SHM} )
+
+std_red=$( awk '{if($2!=""){count++;sum+=$2};y+=$2^2} END{sq=sqrt(y/3-(sum/3)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} )
+std_fftw=$( awk '{if($3!=""){count++;sum+=$3};y+=$3^2} END{sq=sqrt(y/3-(sum/3)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} )
+std_comp=$( awk '{if($4!=""){count++;sum+=$4};y+=$4^2} END{sq=sqrt(y/3-(sum/3)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} )
+std_write=$( awk '{if($5!=""){count++;sum+=$5};y+=$5^2} END{sq=sqrt(y/3-(sum/3)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} )
+std_tot=$( awk '{if($6!=""){count++;sum+=$6};y+=$6^2} END{sq=sqrt(y/3-(sum/3)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} )
+
+
+#Not relevant for the paper
+avg_setup=$( awk '{sum+=$7} END { print sum/3 }' ${OUT_SHM} )
+avg_ker=$( awk '{sum+=$8} END { print sum/3 }' ${OUT_SHM} )
+avg_phase=$( awk '{sum+=$9} END { print sum/3 }' ${OUT_SHM} )
+
+std_setup=$( awk '{if($7!=""){count++;sum+=$7};y+=$7^2} END{sq=sqrt(y/3-(sum/3)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} )
+std_ker=$( awk '{if($8!=""){count++;sum+=$8};y+=$8^2} END{sq=sqrt(y/3-(sum/3)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} )
+std_phase=$( awk '{if($9!=""){count++;sum+=$9};y+=$9^2} END{sq=sqrt(y/3-(sum/3)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} )
+##########################
+
+
+echo "Averages and standard deviations over 3 shots" >> ${OUT_SHM_RES}
+echo -e "\n" >> ${OUT_SHM_RES}
+echo "${SLURM_NTASKS} MPI tasks; ${SLURM_CPUS_PER_TASK} OpenMP threads per task; ${SLURM_GPUS_PER_NODE} GPUs per node;" >> ${OUT_SHM_RES}
+echo -e "\n\n" >> ${OUT_SHM_RES}
+echo $avg_red $std_red $avg_fftw $std_fftw $avg_comp $std_comp $avg_write $std_write $avg_tot $std_tot >> ${OUT_SHM_RES}
+echo -e "\n\n" >> ${OUT_SHM_RES}
+echo $avg_setup $std_setup $avg_ker $std_ker $avg_phase $std_phase >> ${OUT_SHM_RES}
+rm -f ${OUT_SHM}
diff --git a/scripts/create_binMS.py b/scripts/create_binMS.py
new file mode 100644
index 0000000..f5c2937
--- /dev/null
+++ b/scripts/create_binMS.py
@@ -0,0 +1,218 @@
+USE_MPI = 0
+
+import numpy as np
+import casacore.tables as pt
+import time
+import sys
+import os
+
+#outpath = '/data/gridding/data/shortgauss_t201806301100_SBH255.binMS/'
+print(sys.argv[1])
+outpath = "/data/gridding/data/Lofarbig/"+sys.argv[1]+".binMS/"
+os.mkdir(outpath)
+
+
+ufile = 'ucoord.bin'
+vfile = 'vcoord.bin'
+wfile = 'wcoord.bin'
+weights = 'weights.bin'
+visrealfile = 'visibilities_real.bin'
+visimgfile = 'visibilities_img.bin'
+metafile = 'meta.txt'
+
+offset = 0.0
+
+if USE_MPI == 1:
+    from mpi4py import MPI
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+    size = comm.Get_size()
+    print(rank,size)
+else:
+    comm = 0
+    rank = 0
+    size = 1
+
+num_threads = 1
+
+# input MS
+readtime0 = time.time()
+#msfile = "/data/Lofar-data/results/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.ms"
+msfile = "/data/Lofar-Luca/results/"+sys.argv[1]+".ms/"
+ms = pt.table(msfile, readonly=True,
ack=False) + +if rank == 0: + print("Reading ", msfile) + print("Writing ", outpath) +# load data and metadata + with pt.table(msfile + '::SPECTRAL_WINDOW', ack=False) as freqtab: + freq = freqtab.getcol('REF_FREQUENCY')[0] / 1000000.0 + freqpersample = np.mean(freqtab.getcol('RESOLUTION')) + timepersample = ms.getcell('INTERVAL',0) + + print("Frequencies (MHz) : ",freq) + print("Time interval (sec) : ",timepersample) + + with pt.taql("SELECT ANTENNA1,ANTENNA2,sqrt(sumsqr(UVW)),GCOUNT() FROM $ms GROUPBY ANTENNA1,ANTENNA2") as BL: + ants1, ants2 = BL.getcol('ANTENNA1'), BL.getcol('ANTENNA2') + Ntime = BL.getcol('Col_4')[0] # number of timesteps + Nbaselines = len(ants1) + + print("Number of timesteps : ",Ntime) + print("Total obs time (hrs): ",timepersample*Ntime/3600) + print("Number of baselines : ",Nbaselines) + +#sp = pt.table(msfile+'::LOFAR_ANTENNA_FIELD', readonly=True, ack=False, memorytable=True).getcol('POSITION') + + ant1, ant2 = ms.getcol('ANTENNA1'), ms.getcol('ANTENNA2') + + number_of_measures = Ntime * Nbaselines + #nm_pe_aux = int(number_of_measures / size) + #remaining_aux = number_of_measures % size + nm_pe = np.array(0) + nm_pe = int(number_of_measures / size) + remaining = np.array(0) + remaining = number_of_measures % size + print(nm_pe,remaining) + +else: + nm_pe = None + remaining = None + +if USE_MPI == 1: + nm_pe = comm.bcast(nm_pe, root=0) + remaining = comm.bcast(remaining, root=0) + +# set the data domain for each MPI rank +startrow = rank*nm_pe + +if rank == size-1: + nm_pe = nm_pe+remaining +print(rank,nm_pe,remaining) + +nrow = nm_pe + +# read data +uvw = ms.getcol('UVW',startrow,nrow) +vis = ms.getcol('DATA',startrow,nrow) +weight = ms.getcol('WEIGHT_SPECTRUM',startrow,nrow) +print("Freqs per channel : ",vis.shape[1]) +print("Polarizations : ",vis.shape[2]) +print("Number of observations : ",uvw.shape[0]) +print("Data size (MB) : ",uvw.shape[0]*vis.shape[1]*vis.shape[2]*2*4/1024.0/1024.0) + +# set parameters +num_points = uvw.shape[0] +num_w_planes = 1 +grid_size = 100 # number of cells of the grid + +# serialize arrays +vis_ser_real = vis.real.flatten() +vis_ser_img = vis.imag.flatten() +print("data types: uvw = ",uvw.dtype," vis = ",vis_ser_real.dtype) +#vis_ser = np.zeros(2*vis_ser_real.size) +#for i in range(vis_ser_real.size): +# vis_ser[2*i]=vis_ser_real[i] +# vis_ser[2*i+1]=vis_ser_img[i] + +uu_ser = uvw[:,0].flatten() +vv_ser = uvw[:,1].flatten() +ww_ser = uvw[:,2].flatten() +weight_ser = weight.flatten() +grid = np.zeros(2*num_w_planes*grid_size*grid_size) # complex! +gridtot = np.zeros(2*num_w_planes*grid_size*grid_size) # complex! 
+peanokeys = np.empty(vis_ser_real.size,dtype=np.uint64)
+gsize = grid.size
+
+hist, bin_edges = np.histogram(ww_ser,num_w_planes)
+print(hist)
+
+print(vis_ser_real.dtype)
+
+# normalize uv
+minu = np.amin(uu_ser)
+maxu = np.amax(abs(uu_ser))
+minv = np.amin(vv_ser)
+maxv = np.amax(abs(vv_ser))
+minw = np.amin(ww_ser)
+maxw = np.amax(ww_ser)
+
+if USE_MPI == 1:
+    maxu_all = np.array(0,dtype=np.float64)
+    maxv_all = np.array(0,dtype=np.float64)
+    maxw_all = np.array(0,dtype=np.float64)
+    minu_all = np.array(0,dtype=np.float64)
+    minv_all = np.array(0,dtype=np.float64)
+    minw_all = np.array(0,dtype=np.float64)
+    comm.Allreduce(maxu, maxu_all, op=MPI.MAX)
+    comm.Allreduce(maxv, maxv_all, op=MPI.MAX)
+    comm.Allreduce(maxw, maxw_all, op=MPI.MAX)
+    comm.Allreduce(minu, minu_all, op=MPI.MIN)
+    comm.Allreduce(minv, minv_all, op=MPI.MIN)
+    comm.Allreduce(minw, minw_all, op=MPI.MIN)
+
+    ming = min(minu_all,minv_all)
+    maxg = max(maxu_all,maxv_all)
+    minw = minw_all
+    maxw = maxw_all
+    ming = ming-offset*ming
+    maxg = maxg+offset*maxg
+    minw = minw
+    maxw = maxw
+else:
+    ming = min(minu,minv)
+    maxg = max(maxu,maxv)
+    ming = ming-offset*ming
+    maxg = maxg+offset*maxg
+    minw = minw
+    maxw = maxw
+
+print(maxu,maxv,maxg)
+#uu_ser = (uu_ser-ming)/(maxg-ming)
+#vv_ser = (vv_ser-ming)/(maxg-ming)
+uu_ser = (uu_ser+maxg)/(2*maxg)
+vv_ser = (vv_ser+maxg)/(2*maxg)
+ww_ser = (ww_ser-minw)/(maxw-minw)
+#print(uu_ser.shape, vv_ser.dtype, ww_ser.dtype, vis_ser_real.shape, vis_ser_img.dtype, weight_ser.dtype, grid.dtype)
+print(np.amin(uu_ser),np.amax(uu_ser))
+print(np.amin(vv_ser),np.amax(vv_ser))
+print(np.amin(ww_ser),np.amax(ww_ser))
+
+# set normalized uvw - mesh conversion factors
+dx = 1.0/grid_size
+dw = 1.0/num_w_planes
+
+readtime1 = time.time()
+
+if rank == 0:
+    outfile = outpath+ufile
+    uu_ser.tofile(outfile,sep='')
+    outfile = outpath+vfile
+    vv_ser.tofile(outfile,sep='')
+    outfile = outpath+wfile
+    ww_ser.tofile(outfile,sep='')
+    outfile = outpath+weights
+    weight_ser.tofile(outfile,sep='')
+    outfile = outpath+weights
+    weight_ser.tofile(outfile,sep='')
+    outfile = outpath+visrealfile
+    vis_ser_real.tofile(outfile,sep='')
+    outfile = outpath+visimgfile
+    vis_ser_img.tofile(outfile,sep='')
+    outfile = outpath+metafile
+    f = open(outfile, 'w')
+    f.writelines(str(uu_ser.size)+"\n")
+    f.writelines(str(vis_ser_real.size)+"\n")
+    f.writelines(str(vis.shape[1])+"\n")
+    f.writelines(str(vis.shape[2])+"\n")
+    f.writelines(str(Ntime)+"\n")
+    f.writelines(str(timepersample)+"\n")
+    f.writelines(str(timepersample*Ntime/3600)+"\n")
+    f.writelines(str(Nbaselines)+"\n")
+    f.writelines(str(ming)+"\n")
+    f.writelines(str(maxg)+"\n")
+    f.writelines(str(minw)+"\n")
+    f.writelines(str(maxw)+"\n")
+    f.close()
+
+
--
GitLab
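A quick way to sanity-check a .binMS directory produced by scripts/create_binMS.py is to read the binary files back with NumPy and verify the normalized u, v, w ranges. The sketch below is not part of the patch; the float64/float32 dtypes are assumptions (casacore UVW columns are typically float64, DATA and WEIGHT_SPECTRUM typically complex64/float32) and should be checked against the arrays the script actually writes.

import os
import sys
import numpy as np

# Path to a .binMS directory written by scripts/create_binMS.py,
# e.g. /data/gridding/data/Lofarbig/<name>.binMS/
binms = sys.argv[1]

# meta.txt: first entry is the number of uvw samples, second the number
# of visibility values (see the f.writelines() calls in create_binMS.py).
with open(os.path.join(binms, "meta.txt")) as f:
    meta = [float(line) for line in f]
n_uvw = int(meta[0])
n_vis = int(meta[1])

# Assumed dtypes: UVW-derived arrays as float64, weights and visibility
# parts as float32. Adjust if the MS columns differ.
uu = np.fromfile(os.path.join(binms, "ucoord.bin"), dtype=np.float64, count=n_uvw)
vv = np.fromfile(os.path.join(binms, "vcoord.bin"), dtype=np.float64, count=n_uvw)
ww = np.fromfile(os.path.join(binms, "wcoord.bin"), dtype=np.float64, count=n_uvw)
weights = np.fromfile(os.path.join(binms, "weights.bin"), dtype=np.float32)
vis_re = np.fromfile(os.path.join(binms, "visibilities_real.bin"), dtype=np.float32, count=n_vis)
vis_im = np.fromfile(os.path.join(binms, "visibilities_img.bin"), dtype=np.float32, count=n_vis)

# u, v, w should fall in [0, 1] after the normalisation done in create_binMS.py.
print("u range:", uu.min(), uu.max())
print("v range:", vv.min(), vv.max())
print("w range:", ww.min(), ww.max())
print("weights:", weights.size, "visibilities:", vis_re.size, vis_im.size)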