diff --git a/Build/Makefile.M100 b/Build/Makefile.M100
index 1fa008e8cf29ab9f112d3ad399a6fe4dd5faa901..eaf92e21428d06d38ce46146f1cee1677a972341 100644
--- a/Build/Makefile.M100
+++ b/Build/Makefile.M100
@@ -1,3 +1,4 @@
+
 CC       =  gcc
 CXX      =  g++
 
@@ -5,8 +6,11 @@ MPICC    =  mpicc
 MPIC++   =  mpiCC
 
 
-#FFTW_INCL=  -I/home/taffoni/sw/include
-#FFTW_LIB=  -L/home/taffoni/sw/lib
+FFTW_INCL=  -I/cineca/prod/opt/libraries/fftw/3.3.8/spectrum_mpi--10.4.0--binary/include/
+FFTW_LIB=  /cineca/prod/opt/libraries/fftw/3.3.8/spectrum_mpi--10.4.0--binary/lib
+
+FITSIO_INCL= -I/m100_work/IscrC_CD-DLS/cfitsio-3.49
+FITSIO_LIB= /m100_work/IscrC_CD-DLS/cfitsio-3.49
 
 
 NVCC = nvcc
@@ -15,6 +19,6 @@ NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda
 
 OMP= -fopenmp
 
-CFLAGS +=  -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL)
+CFLAGS +=  -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL) $(FITSIO_INCL)
 
 OPTIMIZE = $(OMP) -O3 -mtune=native
diff --git a/Makefile b/Makefile
index 23d161d0f6b432626329c9ffa52f94d2e31fab2a..f632e6ed188e180fe79f0ccdb42d037fe29fb751 100644
--- a/Makefile
+++ b/Makefile
@@ -27,14 +27,21 @@ endif
 
 #OPT += -DNVIDIA
 # perform one-side communication (suggested) instead of reduce (only if MPI is active)
-OPT += -DONE_SIDE
+#OPT += -DONE_SIDE
 # write the full 3D cube of gridded visibilities and its FFT transform
 #OPT += -DWRITE_DATA
 # write the final image
 OPT += -DWRITE_IMAGE
 # perform w-stacking phase correction
 OPT += -DPHASE_ON
+# Support CFITSIO
+OPT += -DFITSIO
+# Perform true parallel images writing
+#OPT += -DPARALLELIO
 
+ifeq (FITSIO,$(findstring FITSIO,$(OPT)))
+	LIBS += -L$(FITSIO_LIB) -lcfitsio
+endif
 
 DEPS = w-stacking.h w-stacking-fftw.c w-stacking.cu phase_correction.cu
 COBJ = w-stacking.o w-stacking-fftw.o phase_correction.o
diff --git a/scripts/plotgrid_fits.py b/scripts/plotgrid_fits.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b4583ea6d8c48febb643e62dcbcce294a1b0f46
--- /dev/null
+++ b/scripts/plotgrid_fits.py
@@ -0,0 +1,32 @@
+#!/usr/bin/python3
+import numpy as np
+import matplotlib.pyplot as plt
+from astropy.io import fits
+real_fits = "/home/emanuele/hpc_imaging/test_fits_real.fits"
+imag_fits = "/home/emanuele/hpc_imaging/test_fits_img.fits"
+nplanes = 1
+
+
+with fits.open(real_fits) as hdu_real:
+	img_hdu_real = np.array(hdu_real[0].data)
+
+with fits.open(imag_fits) as hdu_imag:
+	img_hdu_imag = np.array(hdu_imag[0].data)
+
+
+xaxis = int(np.sqrt(img_hdu_real.size))
+yaxes = xaxis
+residual = np.vectorize(complex)(img_hdu_real, img_hdu_imag)
+
+cumul2d = residual.reshape((xaxis,yaxes,nplanes), order='F')
+
+for i in range(nplanes):
+    gridded = np.squeeze(cumul2d[:,:,i])
+    ax = plt.subplot()
+    img = ax.imshow(np.abs(np.fft.fftshift(gridded)), aspect='auto', interpolation='none', origin='lower')
+    ax.set_xlabel('cell')
+    ax.set_ylabel('cell')
+    cbar = plt.colorbar(img)
+    cbar.set_label('norm(FFT)',size=18)
+    figname='fits_image_' + str(i) + '.png'
+    plt.savefig(figname)
diff --git a/w-stacking-fftw.c b/w-stacking-fftw.c
index 5469014518e9e03a3d65ee9461bccb9c58cebfc1..55244baa73b6378fde3c111568963353bc33ee24 100644
--- a/w-stacking-fftw.c
+++ b/w-stacking-fftw.c
@@ -1,6 +1,9 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#ifdef FITSIO
+#include "fitsio.h"
+#endif
 #ifdef USE_MPI
 #include <mpi.h>
 #ifdef USE_FFTW
@@ -44,6 +47,7 @@ int main(int argc, char * argv[])
 	int rank;
 	int size;
 
+
         // Define main filenames
 	FILE * pFile;
 	FILE * pFile1;
@@ -51,7 +55,7 @@ int main(int argc, char * argv[])
 	FILE * pFileimg;
 	// Global filename to be composed
 	char filename[1000];
-	
+
         // MS paths
         char datapath[900];
         char datapath_multi[NFILES][900];
@@ -123,6 +127,19 @@ int main(int argc, char * argv[])
 	// Half support size
 	double w_supporth = (double)((w_support-1)/2)*dx;
 
+
+	// Initialize FITS image parameters
+	fitsfile *fptreal;
+	fitsfile *fptrimg;
+	int status;
+	char testfitsreal[FILENAMELENGTH] = "parallel_np2_real.fits";
+	char testfitsimag[FILENAMELENGTH] = "parallel_np2_img.fits";
+
+	long naxis = 2;
+	long naxes[2] = { grid_size_x, grid_size_y };
+
+
+
 	// Internal profiling parameters 
 	clock_t start, end, start0, startk, endk;
 	double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, phase_time;
@@ -195,15 +212,14 @@ if(rank == 0){
 	// LOCAL grid size
 	xaxis = local_grid_size_x;
 	yaxis = local_grid_size_y;
-
 	clock_gettime(CLOCK_MONOTONIC, &begin);
 	start = clock();
 
         // INPUT FILES (only the first ndatasets entries are used)
 	int ndatasets = 1;
-        //strcpy(datapath_multi[0],"data/newgauss2noconj_t201806301100_SBL180.binMS/");
+        strcpy(datapath_multi[0],"/m100_scratch/userexternal/ederubei/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/");
         //strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/newgauss4_t201806301100_SBL180.binMS/");
-        strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/");
+        //strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/");
         //strcpy(datapath_multi[1],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_134MHz.pre-cal.binMS/");
 
 	strcpy(datapath,datapath_multi[0]);
@@ -917,8 +933,8 @@ if(rank == 0){
         clock_gettime(CLOCK_MONOTONIC, &begin);
         start = clock();
 	if(rank == 0)printf("PHASE CORRECTION\n");
-        double* image_real = (double*) calloc(xaxis*yaxis,sizeof(double));	
-        double* image_imag = (double*) calloc(xaxis*yaxis,sizeof(double));	
+        double* image_real = (double*) calloc(xaxis*yaxis,sizeof(double));
+        double* image_imag = (double*) calloc(xaxis*yaxis,sizeof(double));
 
         phase_correction(gridss,image_real,image_imag,xaxis,yaxis,num_w_planes,grid_size_x,grid_size_y,resolution,wmin,wmax,num_threads);
 
@@ -931,23 +947,106 @@ if(rank == 0){
 
         if(rank == 0)
         {
-            pFilereal = fopen (fftfile2,"wb");
-            pFileimg = fopen (fftfile3,"wb");
-            fclose(pFilereal);
-            fclose(pFileimg);
+            #ifdef FITSIO
+            printf("REMOVING RESIDUAL FITS FILE\n");
+            remove(testfitsreal);
+            remove(testfitsimag);
+
+
+	    printf("FITS CREATION\n");
+            status = 0;
+
+            fits_create_file(&fptrimg, testfitsimag, &status);
+            fits_create_img(fptrimg, DOUBLE_IMG, naxis, naxes, &status);
+            fits_close_file(fptrimg, &status);
+
+	    status = 0;
+
+            fits_create_file(&fptreal, testfitsreal, &status);
+	    fits_create_img(fptreal, DOUBLE_IMG, naxis, naxes, &status);
+	    fits_close_file(fptreal, &status);
+	    #endif
+
+            pFilereal = fopen (fftfile2, "wb");
+	    pFileimg = fopen (fftfile3, "wb");
+	    fclose(pFilereal);
+	    fclose(pFileimg);
         }
 	#ifdef USE_MPI
 	MPI_Barrier(MPI_COMM_WORLD);
         #endif
         if(rank == 0)printf("WRITING IMAGE\n");
+        long * fpixel = (long *) malloc(sizeof(long)*naxis);
+        long * lpixel = (long *) malloc(sizeof(long)*naxis);
+
+        #ifdef USE_MPI
+        MPI_Barrier(MPI_COMM_WORLD);
+        #endif
+	#ifdef PARALLELIO
+        #ifdef FITSIO
+
+        fpixel[0] = 1;
+        fpixel[1] = rank*yaxis+1;
+        lpixel[0] = xaxis;
+        lpixel[1] = (rank+1)*yaxis;
+
+        status = 0;
+        fits_open_image(&fptreal, testfitsreal, READWRITE, &status);
+        fits_write_subset(fptreal, TDOUBLE, fpixel, lpixel, image_real, &status);
+        fits_close_file(fptreal, &status);
+
+        status = 0;
+        fits_open_image(&fptrimg, testfitsimag, READWRITE, &status);
+        fits_write_subset(fptrimg, TDOUBLE, fpixel, lpixel, image_imag, &status);
+        fits_close_file(fptrimg, &status);
+
+        #endif //FITSIO
+
+        pFilereal = fopen (fftfile2,"ab");
+        pFileimg = fopen (fftfile3,"ab");
+
+	long global_index = rank*(xaxis*yaxis)*sizeof(double);
+
+        fseek(pFilereal, global_index, SEEK_SET);
+        fwrite(image_real, xaxis*yaxis, sizeof(double), pFilereal);
+        fseek(pFileimg, global_index, SEEK_SET);
+        fwrite(image_imag, xaxis*yaxis, sizeof(double), pFileimg);
+
+        fclose(pFilereal);
+        fclose(pFileimg);
+         
+	#ifdef USE_MPI
+	MPI_Barrier(MPI_COMM_WORLD);
+        #endif
+        #else
 	for (int isector=0; isector<size; isector++)
 	{
 	    #ifdef USE_MPI
 	    MPI_Barrier(MPI_COMM_WORLD);
             #endif
 	    if(isector == rank)
-            {
+	    {
+               #ifdef FITSIO
+
 	       printf("%d writing\n",isector);
+               
+               fpixel[0] = 1;
+               fpixel[1] = isector*yaxis+1;
+               lpixel[0] = xaxis;
+               lpixel[1] = (isector+1)*yaxis;
+               
+               status = 0;
+	       fits_open_image(&fptreal, testfitsreal, READWRITE, &status);
+               fits_write_subset(fptreal, TDOUBLE, fpixel, lpixel, image_real, &status);
+	       fits_close_file(fptreal, &status);
+
+               status = 0;
+               fits_open_image(&fptrimg, testfitsimag, READWRITE, &status);
+               fits_write_subset(fptrimg, TDOUBLE, fpixel, lpixel, image_imag, &status);
+               fits_close_file(fptrimg, &status);
+
+               #endif //FITSIO
+
                pFilereal = fopen (fftfile2,"ab");
                pFileimg = fopen (fftfile3,"ab");
 
@@ -962,10 +1061,12 @@ if(rank == 0){
                fclose(pFileimg);
 	    }
 	}
+	#endif //PARALLELIO
 	#ifdef USE_MPI
 	MPI_Barrier(MPI_COMM_WORLD);
         #endif
 
+
 #endif //WRITE_IMAGE