diff --git a/Build/Makefile.pleiadi b/Build/Makefile.pleiadi
new file mode 100644
index 0000000000000000000000000000000000000000..2dc177e987050d757d293dc04a101f3d54b67ba9
--- /dev/null
+++ b/Build/Makefile.pleiadi
@@ -0,0 +1,26 @@
+
+CC       =  gcc
+CXX      =  g++
+
+MPICC    =  mpicc
+MPIC++   =  mpiCC
+
+
+#FFTW_INCL=  -I/homes/gheller/Pleiadi/fftw-3.3.10/local/include/
+#FFTW_LIB=  /homes/gheller/Pleiadi/fftw-3.3.10/local/lib/
+FFTW_INCL=  -I/homes/gheller/Pleiadi/fftw-3.3.10/local-mpich/include/
+FFTW_LIB=  /homes/gheller/Pleiadi/fftw-3.3.10/local-mpich/lib/
+
+FITSIO_INCL= -I/iranet/arcsoft/cfitsio/cfitsio-4.1.0/include/
+FITSIO_LIB= /iranet/arcsoft/cfitsio/cfitsio-4.1.0/lib/
+
+
+NVCC = nvcc
+NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11
+NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda
+
+OMP= -fopenmp
+
+CFLAGS +=  -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL) $(FITSIO_INCL)
+
+OPTIMIZE = $(OMP) -O3 -mtune=native
diff --git a/Makefile b/Makefile
index 3302f885a7fd95bf510a348ae97945a0838cbc00..9d6921cc0895d8139818e537a00c10cdf7d05f2e 100644
--- a/Makefile
+++ b/Makefile
@@ -33,13 +33,22 @@ OPT += -DNVIDIA
 # write the final image
 OPT += -DWRITE_IMAGE
 # perform w-stacking phase correction
+<<<<<<< HEAD
 OPT += -DPHASE_ON
 # GPU support for FFT using cuFFTMP
 OPT += -DCUDA_FFT
 # Support CFITSIO
+=======
+#OPT += -DPHASE_ON
+# Support CFITSIO !!! Remember to add the path to the CFITSIO library to LD_LIBRARY_PATH
+>>>>>>> main
 #OPT += -DFITSIO
 # Perform true parallel images writing
 #OPT += -DPARALLELIO
+# Normalize uvw in case it is not done in the binMS
+OPT += -DNORMALIZE_UVW
+# Gridding kernel: GAUSS, GAUSS_HI_PRECISION, KAISERBESSEL
+OPT += -DGAUSS_HI_PRECISION
 
 ifeq (FITSIO,$(findstring FITSIO,$(OPT)))
 	LIBS += -L$(FITSIO_LIB) -lcfitsio
diff --git a/scripts/copycolumn.py b/scripts/copycolumn.py
new file mode 100644
index 0000000000000000000000000000000000000000..f61d99945d0ee5ac97b4aaad24e2ed1587b8a4f9
--- /dev/null
+++ b/scripts/copycolumn.py
@@ -0,0 +1,25 @@
+import sys
+import pyrap
+from pyrap.tables import table, maketabdesc, makecoldesc
+from pyrap.tables import tablecolumn
+
+print(sys.argv[1])
+
+#inputMS = '/local/work/gheller/mock/test1hr_t201806301100_SBL150.MS'
+#inputMS = 'gauss_t201806301100_SBL180.MS'
+#inputMS = 'gauss_t201806301100_SBH300.MS'
+#inputMS = 'C_RADIO30MHz500_t201806301100_SBL180.MS'
+#inputMS = 'C_sky802_t201806301100_SBL180.MS'
+#inputMS = 'onehour_10asec_t201806301100_SBL180.MS'
+#oinputMS = 'sixhours_t201806301100_SBL180.MS'
+
+inputMS = sys.argv[1]
+t=table(inputMS,readonly=False)
+
+print("     ")
+print("Columns in the MS")
+print(t.colnames())
+print("     ")
+
+t.putcol('DATA', t.getcol('MODEL_DATA'))
+t.flush()
diff --git a/scripts/create_binMS.py b/scripts/create_binMS.py
index 1a44996a1ec48f00c172c3716c4f5e3c32ab5326..00e1a8f3b38f9a82c3ae20a58b86f005e1d02ee5 100644
--- a/scripts/create_binMS.py
+++ b/scripts/create_binMS.py
@@ -8,7 +8,7 @@ import os
 
 #outpath = '/data/gridding/data/shortgauss_t201806301100_SBH255.binMS/'
 print(sys.argv[1])
-outpath = "/data/gridding/data/Lofarbig/"+sys.argv[1]+".binMS/"
+outpath = "/homes/gheller/lofar2/gheller/binMS/"+sys.argv[1]+".binMS/"
 os.mkdir(outpath)
 
 
@@ -38,7 +38,7 @@ num_threads = 1
 # input MS
 readtime0 = time.time()
 #msfile = "/data/Lofar-data/results/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.ms"
-msfile = "/data/Lofar-Luca/results/"+sys.argv[1]+".ms/"
+msfile = "/homes/gheller/lofar2/gheller/MS/"+sys.argv[1]+".ms/"
 ms = pt.table(msfile, readonly=True, ack=False)
 
 if rank == 0:
diff --git a/scripts/create_lofar_obs.py b/scripts/create_lofar_obs.py
new file mode 100755
index 0000000000000000000000000000000000000000..3e04b5d1babb98544906c5976d87d038abe1c2fa
--- /dev/null
+++ b/scripts/create_lofar_obs.py
@@ -0,0 +1,113 @@
+import subprocess
+import sys
+import time
+
+# input data
+datadir = "cgheller@login.m100.cineca.it://m100_scratch/userexternal/cgheller/Lofar/data_model"
+cleandir = "cgheller@login.m100.cineca.it://m100_scratch/userexternal/cgheller/Lofar/data_wsclean"
+dirtydir = "cgheller@login.m100.cineca.it://m100_scratch/userexternal/cgheller/Lofar/data_wsdirty"
+filename_prefix = "RADIO30MHz"
+file_ext = ".fits"
+
+# measurement set
+workdir = "."
+ms_name = "hba-10hrs_t201806301100_SBH255.MS"    #"visibilities_aux.MS"
+ms_tar = workdir+"/"+ms_name+"-template.tar"
+
+# general parameters
+resolution = "2.0asec"
+size = "2000"
+
+# command for creating visibilities
+vis_command = "wsclean"
+
+# parameters for the noise
+noise_command = "../useful_scripts/noise.py"
+noise_factor = "0.0001"
+
+# command for imaging
+img_command = "wsclean"
+img_parallel = "4"
+img_iter = "10000"
+#img_options = "-weight briggs 0 -taper-gaussian 60 -apply-primary-beam -reorder -niter 10000 -mgain 0.8 -auto-threshold 5"
+#img_options = "-apply-primary-beam -reorder -niter 10000 -mgain 0.8 -auto-threshold 5"
+
+## create empty MS
+## LBA 30 MHz
+#/opt/losito/bin/synthms --name ms_name --tobs 1 --ra 1.0 --dec 1.57 --lofarversion 1 --station LBA -minfreq 30e6 --maxfreq 30e6
+## HBA 150 MHz
+#/opt/losito/bin/synthms --name dataset --tobs 1 --ra 1.0 --dec 1.57 --lofarversion 1 --station HBA --minfreq 150e6 --maxfreq 150e6
+#
+#tar cvf ms_tar ms_name
+
+tinverse = 0.0
+tcopy = 0.0
+tnoise = 0.0
+timaging = 0.0
+tget = 0.0
+tput = 0.0
+
+for id in range(1,2):
+    imgname = filename_prefix+str(id)
+    #fitsfilename = datadir+'/'+imgname+file_ext
+    fitsfilename = datadir+'/'+filename_prefix+file_ext
+    commandline = "scp "+datadir+"/"+imgname+file_ext+" ."
+    t0 = time.time()
+    error = subprocess.call(commandline,shell=True)
+    error = subprocess.run(["cp",imgname+file_ext,"test-model.fits"])
+    tget = tget+(time.time()-t0)
+
+    # untar the template MS
+    error = subprocess.run(["tar","xvf",ms_tar])
+
+    # create the visibilities
+    t0 = time.time()
+    error = subprocess.run([vis_command,"-j",img_parallel,"-scale",resolution,"--predict","-use-idg","-grid-with-beam","-name", "test", ms_name])
+    tinverse = tinverse+(time.time()-t0)
+
+    # Copy the MODEL_DATA column int the DATA column
+    t0 = time.time()
+    error = subprocess.run(["python","../useful_scripts/copycolumn.py",ms_name])
+    tcopy = tcopy+(time.time()-t0)
+
+    # Add noise
+    t0 = time.time()
+    error = subprocess.run([noise_command,"--msin",ms_name,"--factor",noise_factor])
+    tnoise = tnoise+(time.time()-t0)
+
+    # Imaging
+    t0 = time.time()
+    error = subprocess.run([img_command,"-j",img_parallel,"-apply-primary-beam","-reorder","-niter",img_iter,\
+                            "-mgain", "0.8", "-auto-threshold", "5", "-size",size,size,"-scale",resolution,\
+                            "-name",imgname,ms_name])
+    timaging = timaging+(time.time()-t0)
+
+    # remove at the end the measurement set
+    error = subprocess.run(["rm","-fr",workdir+"/"+ms_name])
+    error = subprocess.run(["rm","test-model.fits"])
+    error = subprocess.run(["mkdir",imgname])
+    #commandline = "rm *beam*.fits"
+    #error = subprocess.Popen(commandline, shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
+    t0 = time.time()
+    commandline = "scp "+imgname+"-dirty.fits"+" "+dirtydir
+    error = subprocess.call(commandline, shell=True)
+    commandline = "scp "+imgname+"-image-pb.fits"+" "+cleandir
+    error = subprocess.call(commandline, shell=True)
+    tput = tput+(time.time()-t0)
+    commandline = "mv *.fits "+imgname
+    error = subprocess.call(commandline, shell=True)
+##    error = subprocess.run(["tar","cvf",imgname+".tar",imgname])
+    error = subprocess.run(["rm","-fr",imgname])
+    #commandline = "tar cvf "+imgname+".tar *.fits"
+    #error = subprocess.Popen(commandline, shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
+    #commandline = "rm *.fits"
+    #error = subprocess.Popen(commandline, shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
+
+
+print("Summary of timings: ")
+print("Get data (sec)     = ",tget)
+print("Put data (sec)     = ",tput)
+print("Inverse time (sec) = ",tinverse)
+print("Copy time (sec)    = ",tcopy)
+print("Noise time (sec)   = ",tnoise)
+print("Imaging time (sec) = ",timaging)
diff --git a/scripts/createmulti_binMS.py b/scripts/createmulti_binMS.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6492950318293df0b6cf2d7d59884305a015e94
--- /dev/null
+++ b/scripts/createmulti_binMS.py
@@ -0,0 +1,191 @@
+USE_MPI = 0
+
+import numpy as np
+import casacore.tables as pt
+import time
+import sys
+import os
+
+startfreq = 0
+endfreq = 1
+stepfreq = 5
+
+#outpath = '/data/gridding/data/shortgauss_t201806301100_SBH255.binMS/'
+print(sys.argv[1])
+outpath = "/homes/gheller/lofar2/gheller/binMS/"+sys.argv[1]+str(startfreq)+"-"+str(endfreq)+"-"+str(stepfreq)+".binMS/"
+os.mkdir(outpath)
+
+
+ufile = 'ucoord.bin'
+vfile = 'vcoord.bin'
+wfile = 'wcoord.bin'
+weights = 'weights.bin'
+visrealfile = 'visibilities_real.bin'
+visimgfile = 'visibilities_img.bin'
+metafile = 'meta.txt'
+
+offset = 0.0
+
+if USE_MPI == 1:
+   from mpi4py import MPI
+   comm = MPI.COMM_WORLD
+   rank = comm.Get_rank()
+   size = comm.Get_size()
+   print(rank,size)
+else:
+   comm = 0
+   rank = 0
+   size = 1
+
+num_threads = 1
+
+# input MS
+readtime0 = time.time()
+
+num_points_tot = 0
+num_vis_tot = 0
+for ifreq in range(startfreq,endfreq,stepfreq):
+    freqlabel = format(ifreq,'04d')
+    msfile = "/homes/gheller/lofar2/gheller/MS/"+sys.argv[1]+freqlabel+".ms/"
+    ms = pt.table(msfile, readonly=True, ack=False)
+
+    if rank == 0:
+     print("Reading ", msfile)
+     print("Writing ", outpath)
+# load data and metadata
+     with pt.table(msfile + '::SPECTRAL_WINDOW', ack=False) as freqtab:
+        freq = freqtab.getcol('REF_FREQUENCY')[0] / 1000000.0
+        freqpersample = np.mean(freqtab.getcol('RESOLUTION'))
+        timepersample = ms.getcell('INTERVAL',0)
+
+     print("Frequencies (MHz)   : ",freq)
+     print("Time interval (sec) : ",timepersample)
+
+     with pt.taql("SELECT ANTENNA1,ANTENNA2,sqrt(sumsqr(UVW)),GCOUNT() FROM $ms GROUPBY ANTENNA1,ANTENNA2") as BL:
+        ants1, ants2 = BL.getcol('ANTENNA1'), BL.getcol('ANTENNA2')
+        Ntime = BL.getcol('Col_4')[0] # number of timesteps
+        Nbaselines = len(ants1)
+
+     print("Number of timesteps : ",Ntime)
+     print("Total obs time (hrs): ",timepersample*Ntime/3600)
+     print("Number of baselines : ",Nbaselines)
+
+#sp = pt.table(msfile+'::LOFAR_ANTENNA_FIELD', readonly=True, ack=False, memorytable=True).getcol('POSITION')
+
+     ant1, ant2 = ms.getcol('ANTENNA1'), ms.getcol('ANTENNA2')
+
+     number_of_measures = Ntime * Nbaselines
+     #nm_pe_aux = int(number_of_measures / size)
+     #remaining_aux = number_of_measures % size
+     nm_pe = np.array(0) 
+     nm_pe = int(number_of_measures / size)
+     remaining = np.array(0) 
+     remaining = number_of_measures % size
+     print(nm_pe,remaining)
+
+    else:
+     nm_pe = None
+     remaining = None
+
+    if USE_MPI == 1:
+     nm_pe = comm.bcast(nm_pe, root=0)
+     remaining = comm.bcast(remaining, root=0)
+
+# set the data domain for each MPI rank
+    startrow = rank*nm_pe
+
+    if rank == size-1:
+     nm_pe = nm_pe+remaining
+    print(rank,nm_pe,remaining)
+
+    nrow = nm_pe
+
+# read data
+    uvw = ms.getcol('UVW',startrow,nrow)
+    vis = ms.getcol('DATA',startrow,nrow)
+    weight = ms.getcol('WEIGHT',startrow,nrow)
+    print("Freqs per channel   : ",vis.shape[1])
+    print("Polarizations       : ",vis.shape[2])
+    print("Number of observations : ",uvw.shape[0])
+    print("Data size (MB)      : ",uvw.shape[0]*vis.shape[1]*vis.shape[2]*2*4/1024.0/1024.0)
+    ms.close()
+
+# set parameters
+    num_points = uvw.shape[0]
+    num_points_tot = num_points_tot + num_points
+    num_vis = num_points * vis.shape[1] * vis.shape[2]
+    num_vis_tot = num_vis_tot + num_vis
+    num_w_planes = 1 
+
+# serialize arrays
+    vis_ser_real = vis.real.flatten()
+    vis_ser_img = vis.imag.flatten()
+    print("data types: uvw = ",uvw.dtype," vis = ",vis_ser_real.dtype)
+    #vis_ser = np.zeros(2*vis_ser_real.size)
+    #for i in range(vis_ser_real.size):
+    #    vis_ser[2*i]=vis_ser_real[i]
+    #    vis_ser[2*i+1]=vis_ser_img[i]
+
+    uu_ser = uvw[:,0].flatten()
+    vv_ser = uvw[:,1].flatten()
+    ww_ser = uvw[:,2].flatten()
+    weight_ser = weight.flatten()
+
+    print(np.amin(uu_ser),np.amax(uu_ser))
+    print(np.amin(vv_ser),np.amax(vv_ser))
+    print(np.amin(ww_ser),np.amax(ww_ser))
+
+    readtime1 = time.time()
+
+    outfile = outpath+ufile
+    newFile = open(outfile, "a+b")
+    binary_format = bytearray(uu_ser)
+    newFile.write(binary_format)
+    newFile.close()
+    outfile = outpath+vfile
+    newFile = open(outfile, "a+b")
+    binary_format = bytearray(vv_ser)
+    newFile.write(binary_format)
+    newFile.close()
+    outfile = outpath+wfile
+    newFile = open(outfile, "a+b")
+    binary_format = bytearray(ww_ser)
+    newFile.write(binary_format)
+    newFile.close()
+    outfile = outpath+weights
+    newFile = open(outfile, "a+b")
+    binary_format = bytearray(weight_ser)
+    newFile.write(binary_format)
+    newFile.close()
+    outfile = outpath+visrealfile
+    newFile = open(outfile, "a+b")
+    binary_format = bytearray(vis_ser_real)
+    newFile.write(binary_format)
+    newFile.close()
+    outfile = outpath+visimgfile
+    newFile = open(outfile, "a+b")
+    binary_format = bytearray(vis_ser_img)
+    newFile.write(binary_format)
+    newFile.close()
+
+ming = 0.0
+maxg = 1.0
+minw = 0.0
+maxw = 1.0
+outfile = outpath+metafile
+f = open(outfile, 'w')
+f.writelines(str(num_points_tot)+"\n")
+f.writelines(str(num_vis_tot)+"\n")
+f.writelines(str(vis.shape[1])+"\n")
+f.writelines(str(vis.shape[2])+"\n")
+f.writelines(str(Ntime)+"\n")
+f.writelines(str(timepersample)+"\n")
+f.writelines(str(timepersample*Ntime/3600)+"\n")
+f.writelines(str(Nbaselines)+"\n")
+f.writelines(str(ming)+"\n")
+f.writelines(str(maxg)+"\n")
+f.writelines(str(minw)+"\n")
+f.writelines(str(maxw)+"\n")
+f.close()
+
+
diff --git a/scripts/gridding_driver.py b/scripts/gridding_driver.py
new file mode 100755
index 0000000000000000000000000000000000000000..d0e5b4c827a409239d3b606fe79cc66d4283414b
--- /dev/null
+++ b/scripts/gridding_driver.py
@@ -0,0 +1,230 @@
+USE_MPI = 1
+
+import numpy as np
+import casacore.tables as pt
+import time
+import sys
+if USE_MPI == 1:
+   from mpi4py import MPI
+   comm = MPI.COMM_WORLD
+   rank = comm.Get_rank()
+   size = comm.Get_size()
+   print(rank,size)
+else:
+   comm = 0
+   rank = 0
+   size = 1
+
+num_threads = 1
+if len(sys.argv) > 1:
+    num_threads = int(sys.argv[1])
+print("Run with N. threads = ",num_threads)
+
+# set-up the C-python environment
+import ctypes
+so_file = "src/wprojection.so"
+c_func = ctypes.cdll.LoadLibrary(so_file)
+
+# input MS
+readtime0 = time.time()
+msfile = "./data/gauss2_t201806301100_SBL180.MS"
+ms = pt.table(msfile, readonly=True, ack=False)
+
+if rank == 0:
+# load data and metadata
+ with pt.table(msfile + '::SPECTRAL_WINDOW', ack=False) as freqtab:
+    freq = freqtab.getcol('REF_FREQUENCY')[0] / 1000000.0
+    freqpersample = np.mean(freqtab.getcol('RESOLUTION'))
+    timepersample = ms.getcell('INTERVAL',0)
+
+ print("Frequencies (MHz)   : ",freq)
+ print("Time interval (sec) : ",timepersample)
+
+ with pt.taql("SELECT ANTENNA1,ANTENNA2,sqrt(sumsqr(UVW)),GCOUNT() FROM $ms GROUPBY ANTENNA1,ANTENNA2") as BL:
+    ants1, ants2 = BL.getcol('ANTENNA1'), BL.getcol('ANTENNA2')
+    Ntime = BL.getcol('Col_4')[0] # number of timesteps
+    Nbaselines = len(ants1)
+
+ print("Number of timesteps : ",Ntime)
+ print("Total obs time (hrs): ",timepersample*Ntime/3600)
+ print("Number of baselines : ",Nbaselines)
+
+#sp = pt.table(msfile+'::LOFAR_ANTENNA_FIELD', readonly=True, ack=False, memorytable=True).getcol('POSITION')
+
+ ant1, ant2 = ms.getcol('ANTENNA1'), ms.getcol('ANTENNA2')
+
+ number_of_measures = Ntime * Nbaselines
+ #nm_pe_aux = int(number_of_measures / size)
+ #remaining_aux = number_of_measures % size
+ nm_pe = np.array(0) 
+ nm_pe = int(number_of_measures / size)
+ remaining = np.array(0) 
+ remaining = number_of_measures % size
+ print(nm_pe,remaining)
+
+else:
+ nm_pe = None
+ remaining = None
+
+nm_pe = comm.bcast(nm_pe, root=0)
+remaining = comm.bcast(remaining, root=0)
+
+# set the data domain for each MPI rank
+startrow = rank*nm_pe
+
+if rank == size-1:
+   nm_pe = nm_pe+remaining
+print(rank,nm_pe,remaining)
+
+nrow = nm_pe
+
+# read data
+uvw = ms.getcol('UVW',startrow,nrow)
+vis = ms.getcol('DATA',startrow,nrow)
+weight = ms.getcol('WEIGHT',startrow,nrow)
+print("Freqs per channel   : ",vis.shape[1])
+print("Polarizations       : ",vis.shape[2])
+print("Number of observations : ",uvw.shape[0])
+print("Data size (MB)      : ",uvw.shape[0]*vis.shape[1]*vis.shape[2]*2*4/1024.0/1024.0)
+
+# set parameters
+num_points = uvw.shape[0]
+num_w_planes = 1 
+grid_size = 100    # number of cells of the grid
+
+# serialize arrays
+vis_ser_real = vis.real.flatten()
+vis_ser_img = vis.imag.flatten()
+uu_ser = uvw[:,0].flatten()
+vv_ser = uvw[:,1].flatten()
+ww_ser = uvw[:,2].flatten()
+weight_ser = weight.flatten()
+grid = np.zeros(2*num_w_planes*grid_size*grid_size)  # complex!
+gridtot = np.zeros(2*num_w_planes*grid_size*grid_size)  # complex!
+peanokeys = np.empty(vis_ser_real.size,dtype=np.uint64)
+gsize = grid.size
+
+hist, bin_edges = np.histogram(ww_ser,num_w_planes)
+print(hist)
+
+print(vis_ser_real.dtype)
+
+# normalize uv
+minu = np.amin(uu_ser)
+maxu = np.amax(uu_ser)
+minv = np.amin(vv_ser)
+maxv = np.amax(vv_ser)
+minw = np.amin(ww_ser)
+maxw = np.amax(ww_ser)
+
+if USE_MPI == 1:
+   maxu_all = np.array(0,dtype=np.float)
+   maxv_all = np.array(0,dtype=np.float)
+   maxw_all = np.array(0,dtype=np.float)
+   minu_all = np.array(0,dtype=np.float)
+   minv_all = np.array(0,dtype=np.float)
+   minw_all = np.array(0,dtype=np.float)
+   comm.Allreduce(maxu, maxu_all, op=MPI.MAX)
+   comm.Allreduce(maxv, maxv_all, op=MPI.MAX)
+   comm.Allreduce(maxw, maxw_all, op=MPI.MAX)
+   comm.Allreduce(minu, minu_all, op=MPI.MIN)
+   comm.Allreduce(minv, minv_all, op=MPI.MIN)
+   comm.Allreduce(minw, minw_all, op=MPI.MIN)
+
+   ming = min(minu_all,minv_all)
+   maxg = max(maxu_all,maxv_all)
+   minw = minw_all
+   maxw = maxw_all
+
+uu_ser = (uu_ser-ming)/(maxg-ming)
+vv_ser = (vv_ser-ming)/(maxg-ming)
+ww_ser = (ww_ser-minw)/(maxw-minw)
+print(uu_ser.shape, vv_ser.dtype, ww_ser.dtype, vis_ser_real.shape, vis_ser_img.dtype, weight_ser.dtype, grid.dtype)
+
+# set normalized uvw - mesh conversion factors
+dx = 1.0/grid_size
+dw = 1.0/num_w_planes
+
+readtime1 = time.time()
+# calculate peano keys
+t0 = time.time()
+bits = 4
+psize = 2**bits
+for ip in range(nm_pe):
+    xp = int(uu_ser[ip] * psize)
+    yp = int(vv_ser[ip] * psize)
+    #zp = int(ww_ser[ip] * psize)
+    zp = int(0.0 * psize)
+    peanokeys[ip] = c_func.peano_hilbert_key(ctypes.c_int(xp), ctypes.c_int(yp), ctypes.c_int(zp), ctypes.c_int(bits))
+
+# set convolutional kernel parameters full size 
+w_support = 7
+
+# process data
+c_func.wstack(ctypes.c_int(num_w_planes),
+              ctypes.c_long(num_points),
+              ctypes.c_long(vis.shape[1]),
+              ctypes.c_long(vis.shape[2]),
+              ctypes.c_void_p(uu_ser.ctypes.data),
+              ctypes.c_void_p(vv_ser.ctypes.data),
+              ctypes.c_void_p(ww_ser.ctypes.data),
+              ctypes.c_void_p(vis_ser_real.ctypes.data),
+              ctypes.c_void_p(vis_ser_img.ctypes.data),
+              ctypes.c_void_p(weight_ser.ctypes.data),
+              ctypes.c_double(dx),
+              ctypes.c_double(dw),
+              ctypes.c_int(w_support),
+              ctypes.c_int(grid_size),
+              ctypes.c_void_p(grid.ctypes.data),
+              ctypes.c_int(num_threads))
+tprocess = time.time() - t0
+
+# reduce results
+print("====================================================================")
+if USE_MPI == 1:
+   nbuffer = 500
+
+   comm.Reduce(grid,gridtot,op=MPI.SUM, root=0)
+else:
+   gridtot = grid
+print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
+
+###nnn = 45
+###mmm = c_func.test(ctypes.c_int(nnn))
+###print("---------> ",nnn,mmm)
+
+#hist, bin_edges = np.histogram(ww_ser,10)
+#print(hist)
+
+
+if rank == 0:
+ outfile = "grid"+str(rank)+".txt"
+ f = open(outfile, 'w')
+
+# write results
+ for iw in range(num_w_planes):
+     for iv in range(grid_size):
+         for iu in range(grid_size):
+            index = 2*(iu + iv*grid_size + iw*grid_size*grid_size)
+
+            v_norm = np.sqrt(gridtot[index]*gridtot[index]+gridtot[index+1]*gridtot[index+1])
+            f.writelines(str(iu)+" "+str(iv)+" "+str(iw)+" "+str(gridtot[index])+" "+str(gridtot[index+1])+" "+str(v_norm)+"\n")
+ f.close()
+
+#outfile = "data"+str(rank)+".txt"
+#f = open(outfile, 'w')
+#
+#for i in range(nm_pe):
+#    #f.writelines(str(uvw_aux[i,0])+" "+str(uvw_aux[i,1])+" "+str(uvw_aux[i,2])+" "+str(peanokeys[i])+"\n")
+#    f.writelines(str(uu_ser[i])+" "+str(vv_ser[i])+" "+str(ww_ser[i])+" "+str(peanokeys[i])+"\n")
+#
+#f.close()
+
+# close MS
+ms.close()
+
+# reporting timings
+readtime = readtime1-readtime0
+if rank == 0:
+   print("Read time:       ",readtime)
+   print("Process time:    ",tprocess)
diff --git a/scripts/noise.py b/scripts/noise.py
new file mode 100755
index 0000000000000000000000000000000000000000..dd9366d487765b0d9fdf0eb1e77102611db88639
--- /dev/null
+++ b/scripts/noise.py
@@ -0,0 +1,157 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+Noise operation for losito: adds Gaussian noise to a data column
+"""
+import os, argparse
+import pkg_resources
+import numpy as np
+from scipy.interpolate import interp1d
+from losito.lib_io import progress, logger
+from losito.lib_observation import MS
+
+logger.debug('Loading NOISE module.')
+
+
+def _run_parser(obs, parser, step):
+    column = parser.getstr(step, 'outputColumn', 'DATA')
+    factor = parser.getfloat(step, 'factor', 1.0)
+    parser.checkSpelling( step, ['outputColumn','factor'])
+    return run(obs, column, factor)
+
+def SEFD(ms, station1, station2, freq):
+    '''
+    Return the system equivalent flux density (SEFD) for all rows and a
+    single fequency channel.
+    The values for the SEFD were derived from van Haarlem
+    et al. (2013).
+
+    Parameters
+    ----------
+    ms : MS-object
+    station1 : (n,) ndarray, dtype = int
+        ANTENNA1 indices.
+    station2 : (n,) ndarray, dtpe = int
+        ANTENNA2 indices.
+    freq : float
+        Channel frequency in Hz.
+    Returns
+    -------
+    SEFD : (n,) ndarray
+        SEFD in Jansky.
+    '''
+
+    def interp_sefd(freq, antennamode):
+        # Load antennatype datapoints and interpolate, then reevaluate at freq.
+        sefd_pth = pkg_resources.resource_filename('losito', 'data/noise/SEFD_{}.csv'.format(antennamode))
+        points = np.loadtxt(sefd_pth, dtype=float, delimiter=',')
+        # Lin. extrapolation, so edge band noise is not very accurate.
+        fun = interp1d(points[:, 0], points[:, 1], fill_value='extrapolate',
+                        kind='linear')
+        return fun(freq)
+
+    if ms.antennatype in ['LBA_INNER', 'LBA_OUTER', 'LBA_ALL']:
+        SEFD = interp_sefd(freq, ms.antennatype)
+        return np.repeat(SEFD, len(station1))  # SEFD same for all BL
+    elif 'HBA' in ms.antennatype:
+        # For HBA, the SEFD differs between core and remote stations
+        names = np.array([_n[0:2] for _n in ms.stations])
+        CSids = ms.stationids[np.where(names =='CS')]
+        lim = np.max(CSids)  # this id separates the core/remote stations
+
+        # The SEFD for 1 BL is the sqrt of the products of the SEFD per station
+        SEFD_cs = interp_sefd(freq, 'HBA_CS')
+        SEFD_rs = interp_sefd(freq, 'HBA_RS')
+        SEFD_s1 = np.where(station1 <= lim, SEFD_cs, SEFD_rs)
+        SEFD_s2 = np.where(station2 <= lim, SEFD_cs, SEFD_rs)
+        return np.sqrt(SEFD_s1 * SEFD_s2)
+    else:
+        logger.error('Stationtype "{}" unknown.'.format(ms.stationtype))
+        return 1
+
+def add_noise_to_ms(ms, error, ntot, column='DATA', factor=1.0):
+    # TODO: ensure eta = 1 is appropriate
+    tab = ms.table(readonly=False)
+    eta = 0.95  # system efficiency. Roughly 1.0
+
+    chan_width = ms.channelwidth
+    freq = ms.get_frequencies()
+    ant1 = tab.getcol('ANTENNA1')
+    ant2 = tab.getcol('ANTENNA2')
+    exposure = ms.timepersample
+
+    # std = eta * SEFD(ms, ant1, ant2, freq) #TODO
+    # Iterate over frequency channels to save memory.
+    for i, nu in enumerate(freq):
+        # find correct standard deviation from SEFD
+        std = factor * eta * SEFD(ms, ant1, ant2, nu)
+        std /= np.sqrt(2 * exposure * chan_width[i])
+        # draw complex valued samples of shape (row, corr_pol)
+        noise = np.random.normal(loc=0, scale=std, size=[4, *np.shape(std)]).T
+        noise = noise + 1.j * np.random.normal(loc=0, scale=std, size=[4, *np.shape(std)]).T
+        noise = noise[:, np.newaxis, :]
+        noisereal = noise.real
+        noiseflat = noisereal.flatten()
+        ntot = ntot + noiseflat.size
+        noiseflat = noiseflat*noiseflat
+        error = error + noiseflat.sum()
+        noisereal = noise.imag
+        noiseflat = noisereal.flatten()
+        noiseflat = noiseflat*noiseflat
+        error = error + noiseflat.sum()
+
+        prediction = tab.getcolslice(column, blc=[i, -1], trc=[i, -1])
+        tab.putcolslice(column, prediction + noise, blc=[i, -1], trc=[i, -1])
+    tab.close()
+
+    return error,ntot
+
+def run(obs, column='DATA', factor=1.0):
+    """
+    Adds Gaussian noise to a data column. Scale of the noise, frequency-
+    and station-dependency are calculated according to 'Synthesis Imaging
+    in Radio Astronomy II' (1999) by Taylor et al., page 175.
+
+    Parameters
+    ----------
+    obs : Observation object
+        Input obs object.
+    column : str, optional. Default = DATA
+        Name of column to which noise is added
+    factor : float, optional. Default = 1.0
+        Scaling factor to change the noise level.
+    """
+    s = obs.scheduler
+    if s.qsub: # add noise in parallel on multiple nodes
+        for i, ms in enumerate(obs):
+            progress(i, len(obs), status='Estimating noise')  # progress bar
+            thisfile = os.path.abspath(__file__)
+            cmd = 'python {} --msin={} --start={} --end={} --column={} --factor={}'.format(thisfile,
+                                 ms.ms_filename, ms.starttime, ms.endtime, column, factor)
+            s.add(cmd, commandType='python', log='losito_add_noise', processors=1)
+        s.run(check=True)
+        return 0
+    else: # add noise linear on one cpu
+        results = []
+        error = 0
+        ntot = 0
+        for i, ms in enumerate(obs):
+            progress(i, len(obs), status='Estimating noise')  # progress bar
+            error, ntot = results.append(add_noise_to_ms(ms, error, ntot, column, factor))
+        return sum(results)
+
+if __name__ == '__main__':
+    # This file can also be executed directly for a single MS.
+    parser = argparse.ArgumentParser(description='Executable of the LoSiTo-noise operation')
+    parser.add_argument('--msin', help='MS file prefix', type=str)
+    parser.add_argument('--starttime', help='MJDs starttime', type=float)
+    parser.add_argument('--endtime', help='MJDs endtime', type=float)
+    parser.add_argument('--column', help='Column', default='DATA', type=str)
+    parser.add_argument('--factor', help='Factor', default=1.0, type=float)
+    # Parse parset
+    args = parser.parse_args()
+    ms = MS(args.msin, args.starttime, args.endtime)
+    error = 0
+    ntot = 0 
+    error, ntot = add_noise_to_ms(ms, error, ntot, args.column, args.factor)
+    print("STANDARD DEVIATION = ",np.sqrt(error/ntot), " Jy/beam")
diff --git a/w-stacking-fftw.c b/w-stacking-fftw.c
index 18969cfc71f967b42895fff6bd2224a7f83c987d..70024c21be588c92273cdc3d47c17f689db920e3 100644
--- a/w-stacking-fftw.c
+++ b/w-stacking-fftw.c
@@ -131,6 +131,7 @@ int main(int argc, char * argv[])
 
 
 	// Initialize FITS image parameters
+<<<<<<< HEAD
 
 #ifdef FITSIO
 	fitsfile *fptreal;
@@ -138,6 +139,15 @@ int main(int argc, char * argv[])
 	int status;
 	char testfitsreal[FILENAMELENGTH] = "parallel_np8_real.fits";
 	char testfitsimag[FILENAMELENGTH] = "parallel_np8_img.fits";
+=======
+	#ifdef FITSIO
+	fitsfile *fptreal;
+	fitsfile *fptrimg;
+	int status;
+	char testfitsreal[FILENAMELENGTH] = "parallel_np2_real.fits";
+	char testfitsimag[FILENAMELENGTH] = "parallel_np2_img.fits";
+        #endif
+>>>>>>> main
 
 	long naxis = 2;
 	long naxes[2] = { grid_size_x, grid_size_y };
@@ -231,10 +241,7 @@ if(rank == 0){
 
         // INPUT FILES (only the first ndatasets entries are used)
 	int ndatasets = 1;
-        strcpy(datapath_multi[0],"/m100_scratch/userexternal/ederubei/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/");
-        //strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/newgauss4_t201806301100_SBL180.binMS/");
-        //strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/");
-        //strcpy(datapath_multi[1],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_134MHz.pre-cal.binMS/");
+        strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/SKA3/ZW2_IFRQ_0-1-5.binMS/");
 
 	strcpy(datapath,datapath_multi[0]);
 	// Read metadata
@@ -259,8 +266,8 @@ if(rank == 0){
 	// WATCH THIS!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 	int nsub = 1000;
 	//int nsub = 10;
-	printf("Subtracting last %d measurements\n",nsub);
-  Nmeasures = Nmeasures-nsub;
+	if(rank == 0)printf("Subtracting last %d measurements\n",nsub);
+        Nmeasures = Nmeasures-nsub;
 	Nvis = Nmeasures*freq_per_chan*polarisations;
 
   // calculate the coordinates of the center
@@ -317,7 +324,7 @@ if(rank == 0){
 	strcat(filename,vfile);
 	//printf("Reading %s\n",filename);
 
-  pFile = fopen (filename,"rb");
+        pFile = fopen (filename,"rb");
 	fseek (pFile,startrow*sizeof(double),SEEK_SET);
 	fread(vv,Nmeasures*sizeof(double),1,pFile);
 	fclose(pFile);
@@ -331,6 +338,61 @@ if(rank == 0){
 	fread(ww,Nmeasures*sizeof(double),1,pFile);
 	fclose(pFile);
 
+#ifdef NORMALIZE_UVW
+	double minu = 1e20;
+	double minv = 1e20;
+	double minw = 1e20;
+	double maxu = -1e20;
+	double maxv = -1e20;
+	double maxw = -1e20;
+	
+        for (long inorm=0; inorm<Nmeasures; inorm++)
+	    {
+               minu = MIN(minu,uu[inorm]);
+               minv = MIN(minv,vv[inorm]);
+               minw = MIN(minw,ww[inorm]);
+	       maxu = MAX(maxu,uu[inorm]);
+	       maxv = MAX(maxv,vv[inorm]);
+	       maxw = MAX(maxw,ww[inorm]);
+            }
+	double minu_all;
+	double minv_all;
+	double minw_all;
+	double maxu_all;
+	double maxv_all;
+	double maxw_all;
+	#ifdef USE_MPI
+	MPI_Allreduce(&minu,&minu_all,1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+	MPI_Allreduce(&minv,&minv_all,1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+	MPI_Allreduce(&minw,&minw_all,1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+	MPI_Allreduce(&maxu,&maxu_all,1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+	MPI_Allreduce(&maxv,&maxv_all,1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+	MPI_Allreduce(&maxw,&maxw_all,1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+        #else
+	minu_all = minu;
+	minv_all = minv;
+	minw_all = minw;
+	maxu_all = maxu;
+	maxv_all = maxv;
+	maxw_all = maxw;
+        #endif
+	double offset = 0.001;
+	double ming = MAX(abs(minu_all),abs(minv_all));
+	double maxg = MAX(abs(maxu_all),abs(maxv_all));
+	maxg = MAX(maxg,ming);
+	minw = minw_all;
+	maxw = maxw_all;
+        maxg = maxg+offset*maxg;
+        for (long inorm=0; inorm<Nmeasures; inorm++)
+	    {
+	       uu[inorm] = (uu[inorm]+maxg)/(2.0*maxg);
+	       vv[inorm] = (vv[inorm]+maxg)/(2.0*maxg);
+	       ww[inorm] = (ww[inorm]-minw)/(maxw-minw);
+            }
+
+#endif
+
+
 #ifdef USE_MPI
 	MPI_Barrier(MPI_COMM_WORLD);
 #endif
@@ -442,7 +504,7 @@ if(rank == 0){
 	for (int ifiles=0; ifiles<ndatasets; ifiles++)
 	{
 	strcpy(filename,datapath_multi[ifiles]);
-        printf("Processing %s, %d of %d\n",filename,ifiles+1,ndatasets);
+        if(rank == 0)printf("Processing %s, %d of %d\n",filename,ifiles+1,ndatasets);
 
         // Read metadata
         strcpy(filename,datapath);
@@ -466,7 +528,7 @@ if(rank == 0){
         resolution = 1.0/MAX(abs(uvmin),abs(uvmax));
         // calculate the resolution in arcsec
         double resolution_asec = (3600.0*180.0)/MAX(abs(uvmin),abs(uvmax))/PI;
-        printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec);
+        if(rank == 0)printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec);
 
         strcpy(filename,datapath);
         strcat(filename,weightsfile);
@@ -502,7 +564,7 @@ if(rank == 0){
   float * visreals;
   float * visimgs;
   float * weightss;
-	long isector;
+  long isector;
 
   for (long isector_count=0; isector_count<nsectors; isector_count++)
       {
@@ -524,15 +586,14 @@ if(rank == 0){
 
 	  // select data for this sector
         long icount = 0;
-	      long ip = 0;
-	      long inu = 0;
-//CLAAAA
-	       struct sectorlist * current;
-	       current = sectorhead[isector];
+	long ip = 0;
+	long inu = 0;
+	struct sectorlist * current;
+	current = sectorhead[isector];
 
 
 
-	       while (current->index != -1)
+	while (current->index != -1)
           {
              long ilocal = current->index;
       	     //double vvh = vv[ilocal];
@@ -604,8 +665,8 @@ if(rank == 0){
                dx,
                dw,
                w_support,
-	             xaxis,
-	             yaxis,
+	       xaxis,
+	       yaxis,
                gridss,
                num_threads);
 
diff --git a/w-stacking.cu b/w-stacking.cu
index 4fe36254789fff354e28fb887a5d4bb6c3b37704..cbad05c30e47fdb45ff8c86e424f2f00967c5522 100644
--- a/w-stacking.cu
+++ b/w-stacking.cu
@@ -14,12 +14,71 @@ double __device__
 #else
 double
 #endif
+// Gaussian Kernel
 gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
 {
      double conv_weight;
      conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
      return conv_weight;
 }
+
+void makeGaussKernel(double * kernel,
+		     int KernelLen,
+		     int increaseprecision,
+		     double std22)
+{
+
+  double norm = std22/PI;
+  int n = increaseprecision*KernelLen, mid = n / 2;
+  for (int i = 0; i != mid + 1; i++) {
+      double term = (double)i/(double)increaseprecision;
+      kernel[mid + i] = sqrt(norm) * exp(-(term*term)*std22);
+  }
+
+  for (int i = 0; i != mid; i++) kernel[i] = kernel[n - 1 - i];
+//  for (int i = 0; i < n; i++) printf("%f\n",kernel[i]);
+
+}
+
+// Kaiser-Bessel Kernel: it is adapted from WSClean
+double bessel0(double x, double precision) {
+  // Calculate I_0 = SUM of m 0 -> inf [ (x/2)^(2m) ]
+  // This is the unnormalized bessel function of order 0.
+  double d = 0.0, ds = 1.0, sum = 1.0;
+  do {
+    d += 2.0;
+    ds *= x * x / (d * d);
+    sum += ds;
+  } while (ds > sum * precision);
+  return sum;
+}
+void makeKaiserBesselKernel(double * kernel,
+		            int KernelLen,
+			    int increaseprecision,
+                            double alpha,
+                            double overSamplingFactor,
+                            int withSinc) {
+  int n = increaseprecision*KernelLen, mid = n / 2;
+  double * sincKernel = malloc((mid + 1)*sizeof(*sincKernel));
+  const double filterRatio = 1.0 / overSamplingFactor;
+  sincKernel[0] = filterRatio;
+  for (int i = 1; i != mid + 1; i++) {
+    double x = i;
+    sincKernel[i] =
+        withSinc ? (sin(PI * filterRatio * x) / (PI * x)) : filterRatio;
+  }
+  const double normFactor = overSamplingFactor / bessel0(alpha, 1e-8);
+  for (int i = 0; i != mid + 1; i++) {
+    double term = (double)i / mid;
+    kernel[mid + i] = sincKernel[i] *
+                bessel0(alpha * sqrt(1.0 - (term * term)), 1e-8) *
+                normFactor;
+  }
+  for (int i = 0; i != mid; i++) kernel[i] = kernel[n - 1 - i];
+  //for (int i = 0; i < n; i++) printf("%f\n",kernel[i]);
+}
+
+
 #ifdef ACCOMP
 #pragma omp end declare target
 #endif
@@ -146,10 +205,22 @@ void wstack(
     // initialize the convolution kernel
     // gaussian:
     int KernelLen = (w_support-1)/2;
+    int increaseprecision = 5; // this number must be odd: increaseprecison*w_support must be odd (w_support must be odd)
     double std = 1.0;
     double std22 = 1.0/(2.0*std*std);
     double norm = std22/PI;
-
+    double * convkernel = malloc(increaseprecision*w_support*sizeof(*convkernel));
+    double overSamplingFactor = 1.0;
+    int withSinc = 0;
+    double alpha = 8.6;
+    #ifdef GAUSS
+    makeGaussKernel(convkernel,w_support,increaseprecision,std22);
+    #endif
+    #ifdef KAISERBESSEL
+    makeKaiserBesselKernel(convkernel, w_support, increaseprecision, alpha, overSamplingFactor, withSinc);
+    #endif
+
+    
     // Loop over visibilities.
 // Switch between CUDA and GPU versions
 #ifdef __CUDACC__
@@ -266,15 +337,29 @@ void wstack(
         for (k = kmin; k <= kmax; k++)
         {
 
-            double v_dist = (double)k+0.5 - pos_v;
+            //double v_dist = (double)k+0.5 - pos_v;
+            double v_dist = (double)k - pos_v;
 
             for (j = jmin; j <= jmax; j++)
             {
-                double u_dist = (double)j+0.5 - pos_u;
+                //double u_dist = (double)j+0.5 - pos_u;
+                double u_dist = (double)j - pos_u;
 		long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
+                int jKer = (int)(increaseprecision * (fabs(u_dist+(double)KernelLen)));
+                int kKer = (int)(increaseprecision * (fabs(v_dist+(double)KernelLen)));
 
-
+                #ifdef GAUSS_HI_PRECISION 
 		double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
+                #endif
+                #ifdef GAUSS
+		double conv_weight = convkernel[jKer]*convkernel[kKer];
+		//if(jKer < 0 || jKer >= 35 || kKer < 0 || kKer >= 35)
+		//	printf("%f %d %f %d\n",fabs(u_dist+(double)KernelLen),jKer,fabs(v_dist+(double)KernelLen),kKer);
+		//printf("%d %d %d %d %f %f %f %f %f\n",jKer, j, kKer, k, pos_u, pos_v, u_dist,v_dist,conv_weight);
+                #endif
+                #ifdef KAISERBESSEL
+		double conv_weight = convkernel[jKer]*convkernel[kKer];
+	        #endif	
 		// Loops over frequencies and polarizations
 		double add_term_real = 0.0;
 		double add_term_img = 0.0;