/* ///////////////////////////////////////////////////////////////////// */
/* Authors:  A. Mignone (mignone@to.infn.it)                             */
/*           V. Cesare  (valentina.cesare@inaf.it)                       */
/*           D. Goz     (david.goz@inaf.it)                              */
/*                                                                       */
/* Date   : June 2024                                                    */
/*                                                                       */
/* ///////////////////////////////////////////////////////////////////// */

#include "allvars.h"
#include "tools.h"
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Global grid size along X and Y. Both are assigned once in main() from the  */
/* <grid_size> command-line argument (NY_GLOB is set equal to NX_GLOB).       */
/* Coordinate arrays are allocated with N*_GLOB + 2*NGHOST entries and the    */
/* output file holds NX_GLOB x NY_GLOB values, i.e. the count excludes the    */
/* boundary/ghost layers.                                                     */
static int NX_GLOB, NY_GLOB;

/* Index ranges of the sub-grid owned by one MPI process.                  */
/* All [start, end] pairs are inclusive: main() computes                   */
/* dim = global_end - global_start + 1 and sets local_start = 1,           */
/* local_end = dim, so local index 0 and dim + 1 address the ghost layers. */
typedef struct MyGrid
{
  int local_start[NDIM];  /* Local start index in each dimension  */   
  int local_end[NDIM];    /* Local end index in each dimension    */ 
  int global_start[NDIM]; /* Global start index in each dimension */
  int global_end[NDIM];   /* Global end index in each dimension   */
  int dim[NDIM];          /* Local domain size (no ghosts)        */
} myDomain;


/* Per-process description of its place in the 2D Cartesian topology.        */
/* The neighbour ranks are filled by MPI_Cart_shift() in main():             */
/* bottom/top along direction X, left/right along direction Y.               */
/* A missing neighbour (domain edge) is reported as MPI_PROC_NULL, which is  */
/* what BoundaryConditions() tests for.                                      */
typedef struct Task_2D_Cartesian
{
  int rank;              /* Local process rank                            */
  int nranks;            /* Communicator size                             */
  int coords[NDIM];      /* Cartesian topology coordinate                 */
  myDomain domain;       /* MyGrid structure (defined above)              */
  int nbrtop;            /* Top neighbor process in cartesian topology    */
  int nbrbottom;         /* Bottom neighbor process in cartesian topology */
  int nbrleft;           /* Left neighbor process in cartesian topology   */
  int nbrright;          /* Right neighbor process in cartesian topology  */
  MPI_Comm comm2d;       /* Cartesian communicator                        */
} Task;

/* function prototypes */

/* Sets the ghost cells of the local grid (1st argument) on every side that  */
/* has no neighbour process, using the global coordinate arrays x and y      */
/* (2nd and 3rd arguments).                                                  */
void BoundaryConditions(MyData **const restrict,
			MyData  *const restrict,
			MyData  *const restrict,
			Task    *const restrict);

/* One Jacobi sweep over rows [jbeg, jend] and columns [ibeg, iend]          */
/* (the four int arguments, in that order): writes the new grid (1st         */
/* argument) from the old one (2nd argument) and ADDS the weighted absolute  */
/* change to the value pointed to by the last argument.                      */
void JacobiAlgorithm(MyData **const restrict, MyData **const restrict, const MyData *restrict,
		     const int, const int, const int, const int, MyData *const restrict);

/* Halo exchange of Phi0 with the four neighbours, overlapped with the       */
/* Jacobi update of Phi; the local error is returned through `error`.        */
void Jacobi_Communication(MyData      **const restrict Phi,
			  MyData      **const restrict Phi0,
			  MyData   *const restrict error,
			  const MyData *      restrict delta,
			  Task         *const restrict ThisTask);

/* void WriteSolution(MyData **const phi, const int nx, const int ny); */

/* Collective MPI-IO dump of the local sub-domains (ghost cells excluded)    */
/* into a single binary file.                                                */
void WriteSolutionParallel(MyData **const phi, Task *const ThisTask);

int main(int argc, char **argv)
{
  int rank, Nranks;
  const MPI_Comm comm = MPI_COMM_WORLD;
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &Nranks);

  if ((argc <= 2) && (!rank))
    {
      printf("\n\t Usage: <executable> <grid_size> <cartesian communicator size_X>\n\n");
      MPI_Abort(comm, EXIT_FAILURE);
      exit(EXIT_FAILURE);
    }

  /* global X and Y grid size (square matrix supposed) */
  NX_GLOB = (int) strtol(argv[1], NULL, 10);
  NY_GLOB = NX_GLOB;

  /********************************** Cartesin topology *******************************************************/
  const int cartesian_grid_x = (int)strtol(argv[2], NULL, 10);
  const int cartesian_grid_y = ((Nranks % cartesian_grid_x == 0) ? (Nranks / cartesian_grid_x) : -1);
  if (cartesian_grid_y == -1)
    {
      if (!rank)
        {
          printf("\n\t Nranks mod cartesian_grid_x != 0 ... aborting ...\n");
          MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
          exit(EXIT_FAILURE);
        }
    }
  
  const int dims[NDIM] = {cartesian_grid_x, cartesian_grid_y};
  const int periods[NDIM] = {0 ,0}; /* not periodic */
  const int reorder = 0;            /* setting the logical value reorder to TRUE allows MPI    */
                                    /* to reorder the rank of the processes figuring out the   */
                                    /* neighbors in the actual hardware for better performance */

  /* make a new communicator to which topology information has been attached */
  Task ThisTask;
  ThisTask.comm2d = MPI_COMM_NULL;
  MPI_Cart_create(MPI_COMM_WORLD, NDIM, dims, periods, reorder, &ThisTask.comm2d);
  if (ThisTask.comm2d == MPI_COMM_NULL)
    {
      printf("\n\t Process %d is not part of the new communicator \n", rank);
      fflush(stdout);
      MPI_Abort(comm, EXIT_FAILURE);
      exit(EXIT_FAILURE);
    }

  /* get the comm size */
  MPI_Comm_size(ThisTask.comm2d, &ThisTask.nranks);
  
  /* determine the process coords in cartesian topology given rank in group */
  MPI_Cart_coords(ThisTask.comm2d, rank, NDIM, ThisTask.coords);

  /* determines process rank in communicator given Cartesian location */
  MPI_Cart_rank(ThisTask.comm2d, ThisTask.coords, &ThisTask.rank);
  
  /* get bottom and top neighbors (X direction) */
  MPI_Cart_shift(ThisTask.comm2d, X, 1, &ThisTask.nbrbottom, &ThisTask.nbrtop);

  /* get left and right neighbors (Y direction) */
  MPI_Cart_shift(ThisTask.comm2d, Y, 1, &ThisTask.nbrleft, &ThisTask.nbrright);
  
  /************************************************************************************************************/
  /* 2D MPI-cartesian decomposition:
     the grids are distributed across the MPI processes. */

  /* get the reminder, i.e. take into account uneven
     decomposition of points among the processes    */
  const int rem[NDIM] = {(NX_GLOB + 2) % dims[X],
                         (NY_GLOB + 2) % dims[Y]};

  /* get the amount of data for each MPI process:
     - chunk is supposed to be >= 1 */
  const int chunk[NDIM] = {(NX_GLOB + 2 - rem[X]) / dims[X],
                           (NY_GLOB + 2 - rem[Y]) / dims[Y]};
  if ((chunk[X] < 1) || (chunk[Y] < 1))
    {
      printf("\n\t chunk[X] < 1 || chunk[Y] < 1 ... aborting ...[rank: %d]\n", rank);
      MPI_Comm_free(&ThisTask.comm2d);
      MPI_Abort(comm, EXIT_FAILURE);
      exit(EXIT_FAILURE);
    }
  
  /* get the subgrid dimension along X and Y directions: */
  int incr[NDIM], offset[NDIM];
  for (int dim=0 ; dim<NDIM ; dim++)
    {
      if (ThisTask.coords[dim] < rem[dim])
	{
	  incr[dim] = chunk[dim] + 1;
	  offset[dim] = 0;
	}
      else
	{
	  incr[dim] = chunk[dim];
	  offset[dim] = rem[dim];
	}
    }

  /* subdomain managed by the task */
  for (int dim=0 ; dim<NDIM ; dim++)
    {
      ThisTask.domain.global_start[dim] = ((ThisTask.coords[dim] * incr[dim]) + offset[dim]);
      ThisTask.domain.global_end[dim]   = (ThisTask.domain.global_start[dim] + incr[dim]) - 1;

      /* boundaries */
      ThisTask.domain.global_start[dim] = ((ThisTask.domain.global_start[dim] == 0)         ? NGHOST  : ThisTask.domain.global_start[dim]);
      ThisTask.domain.global_end[dim]   = ((ThisTask.domain.global_end[dim] == NX_GLOB + 1) ? NX_GLOB : ThisTask.domain.global_end[dim]);

      ThisTask.domain.dim[X] = (ThisTask.domain.global_end[X] - ThisTask.domain.global_start[X] + 1);
      ThisTask.domain.dim[Y] = (ThisTask.domain.global_end[Y] - ThisTask.domain.global_start[Y] + 1);

      /* local index */
      ThisTask.domain.local_start[X] = ThisTask.domain.local_start[Y] = 1;
      ThisTask.domain.local_end[X]   = ThisTask.domain.dim[X];
      ThisTask.domain.local_end[Y]   = ThisTask.domain.dim[Y];
    }

#if defined(DEBUG)
  for (int task=0 ; task<ThisTask.nranks ; task++)
    {
      if (ThisTask.rank == task)
	{
	  printf("\n\t rank: %d", task);
	  printf("\n\t\t coords = [%d, %d]", ThisTask.coords[X], ThisTask.coords[Y]);
	  printf("\n\t\t domain.global_start[X] = %d - domain.global_end[X] = %d", ThisTask.domain.global_start[X], ThisTask.domain.global_end[X]);
	  printf("\n\t\t domain.global_start[Y] = %d - domain.global_end[Y] = %d", ThisTask.domain.global_start[Y], ThisTask.domain.global_end[Y]);
	  printf("\n\t\t domain.local_start[X]  = %d - domain.local_end[X]  = %d", ThisTask.domain.local_start[X],  ThisTask.domain.local_end[X]);
	  printf("\n\t\t domain.local_start[Y]  = %d - domain.local_end[Y]  = %d", ThisTask.domain.local_start[Y],  ThisTask.domain.local_end[Y]);
	  printf("\n\t\t domain.dim[X]          = %d - domain.dim[Y] = %d", ThisTask.domain.dim[X], ThisTask.domain.dim[Y]);
	  printf("\n\t\t nbrtop  = %d - nbrbottom = %d",  ThisTask.nbrtop,  ThisTask.nbrbottom);
	  printf("\n\t\t nbrleft = %d - nbrright  = %d\n", ThisTask.nbrleft, ThisTask.nbrright);
	  fflush(stdout);
	}
      MPI_Barrier(ThisTask.comm2d);
    }
#endif /* DEBUG */
  
  /******************************************************************************************************/
  
  const MyData xbeg = 0.0;
  const MyData xend = 1.0;
  const MyData ybeg = 0.0;
  const MyData yend = 1.0;

  const MyData delta[NDIM] = {(xend - xbeg)/(NX_GLOB + 1),
			      (yend - ybeg)/(NY_GLOB + 1)};

  /* --------------------------------------------------------
     1. Set grid indices
     -------------------------------------------------------- */
    
  const int ibeg = NGHOST;
  const int jbeg = NGHOST;
  
  /* --------------------------------------------------------
     2. Generate grids, allocate memory
        distributed across MPI processes
     -------------------------------------------------------- */

  /* memory allocation */
  MyData *xg = (MyData *) malloc((NX_GLOB + 2*NGHOST) * sizeof(MyData));
  MyData *yg = (MyData *) malloc((NY_GLOB + 2*NGHOST) * sizeof(MyData));
  assert((xg != NULL) && (yg != NULL));

  /* initial conditions */
  for (int i=0 ; i<(NX_GLOB + 2*NGHOST) ; i++) xg[i] = xbeg + (i - ibeg + 1) * delta[X];
  for (int j=0 ; j<(NY_GLOB + 2*NGHOST) ; j++) yg[j] = ybeg + (j - jbeg + 1) * delta[Y];

  /* grids memory allocation distributed across MPI processes */
  MyData **phi  = Allocate_2DdblArray(ThisTask.domain.dim[X] + 2, ThisTask.domain.dim[Y] + 2);
  MyData **phi0 = Allocate_2DdblArray(ThisTask.domain.dim[X] + 2, ThisTask.domain.dim[Y] + 2);

  /* --------------------------------------------------------
     3. Set boundary conditions
     -------------------------------------------------------- */
  BoundaryConditions(phi0, xg, yg, &ThisTask);
  BoundaryConditions(phi,  xg, yg, &ThisTask);
  free(yg);
  free(xg);
  
  /* --------------------------------------------------------
     4. Main iteration cycle
     -------------------------------------------------------- */

  const double time_start = MPI_Wtime();
  
  /* iterations */
  int k = 0;
  while (1)
    {
      /* -- 4a. Jacobi algorithm overlapping computation and communication */
      MyData err;
      Jacobi_Communication(phi, phi0, &err, delta, &ThisTask);

      /* -- 4b. Get the total error                                                                 */
      /*        combines values from all processes and distributes the result back to all processes */
      MyData toterr;
      MPI_Allreduce(&err, &toterr, 1, MPI_MyDatatype, MPI_SUM, ThisTask.comm2d);
      
      if (ThisTask.rank == MASTERTASK)
	printf("\n\t Iteration = %d - err = %lg\n",k, toterr);

      /* increase the counter of loop iterations */
      k++;
      
      /* check convergence */
      if (toterr <= TOL)
	{
	  break;
	}

      /* swap the pointers */
      MyData **tmp = phi;
      phi = phi0;
      phi0 = tmp;
    }

  /* ranks write the solution in parallel */
  WriteSolutionParallel(phi, &ThisTask);

  if (ThisTask.rank == MASTERTASK)
    {      
      printf("\n\t NX_GLOB x NY_GLOB = %d x %d\n", NX_GLOB, NY_GLOB);
      printf("\n\t Time = %lf [s]\n\n", MPI_Wtime() - time_start);
    }

  // free memory
  if (phi0)
    {
      free(phi0[0]);
      free(phi0);
    }

  if (phi)
    {
      free(phi[0]);
      free(phi);
    }

  MPI_Comm_free(&ThisTask.comm2d);
  
  MPI_Finalize();
  
  return 0;
}

/* ********************************************************************* */
void BoundaryConditions(MyData **const restrict phi,
			MyData  *const restrict x,
			MyData  *const restrict y,
			Task    *const restrict ThisTask)
/*
 * Fill the ghost cells of the local grid `phi` on every side where this
 * task has no neighbour (neighbour rank == MPI_PROC_NULL):
 *   left   column 0              : 1 - y
 *   right  column local_end[Y]+1 : y * y
 *   bottom row 0                 : 1 - x
 *   top    row local_end[X]+1    : x
 * `x` and `y` are the global coordinate arrays; they are addressed with the
 * global index of each local row/column (global_start + local index - 1).
 * Corner ghost cells are left untouched.
 */
{
  /* local (inclusive) row and column ranges of the owned cells */
  const int row_first = ThisTask->domain.local_start[X];
  const int row_last  = ThisTask->domain.local_end[X];
  const int col_first = ThisTask->domain.local_start[Y];
  const int col_last  = ThisTask->domain.local_end[Y];

  /* local index + shift == global index */
  const int row_shift = ThisTask->domain.global_start[X] - 1;
  const int col_shift = ThisTask->domain.global_start[Y] - 1;

  const int on_left_edge   = (ThisTask->nbrleft   == MPI_PROC_NULL);
  const int on_right_edge  = (ThisTask->nbrright  == MPI_PROC_NULL);
  const int on_bottom_edge = (ThisTask->nbrbottom == MPI_PROC_NULL);
  const int on_top_edge    = (ThisTask->nbrtop    == MPI_PROC_NULL);

  /* left and right ghost columns */
  if (on_left_edge || on_right_edge)
    {
      for (int row = row_first ; row <= row_last ; row++)
	{
	  const int grow = row_shift + row;

	  if (on_left_edge)
	    phi[row][0] = (1.0 - y[grow]);

	  if (on_right_edge)
	    phi[row][col_last + 1] = (y[grow] * y[grow]);
	}
    }

  /* bottom and top ghost rows */
  if (on_bottom_edge || on_top_edge)
    {
      for (int col = col_first ; col <= col_last ; col++)
	{
	  const int gcol = col_shift + col;

	  if (on_bottom_edge)
	    phi[0][col] = (1.0 - x[gcol]);

	  if (on_top_edge)
	    phi[row_last + 1][col] = x[gcol];
	}
    }

#if defined(DEBUG)

  /* the master dumps the global coordinate arrays ... */
  if (ThisTask->rank == MASTERTASK)
    {
      const int npoints = NX_GLOB+2*NGHOST;

      printf("\n");
      for (int n=0 ; n<npoints ; n++)
	printf("\n\t x[%d] = %8.2f", n, x[n]);

      printf("\n");
      for (int n=0 ; n<npoints ; n++)
	printf("\n\t y[%d] = %8.2f", n, y[n]);

      printf("\n");
      fflush(stdout);
    }

  MPI_Barrier(ThisTask->comm2d);

  /* ... then every task shows its local grid, one task at a time */
  for (int turn=0 ; turn<ThisTask->nranks ; turn++)
    {
      if (turn == ThisTask->rank)
	{
	  char label[128];
	  sprintf(label, "Task %d - Phi", turn);

	  Show_2DdblArray(phi, ThisTask->domain.dim[X] + 2, ThisTask->domain.dim[Y] + 2, label);
	}

      MPI_Barrier(ThisTask->comm2d);
    }

#endif /* DEBUG */

  return;
}

/* ********************************************************************* */

/* One Jacobi sweep on the rectangle rows [jbeg, jend] x columns [ibeg, iend] */
/* (both ranges inclusive).                                                   */
/*   Phi   : output grid, each cell becomes the average of the four           */
/*           neighbours taken from Phi0                                       */
/*   Phi0  : input grid (previous iterate), read only                         */
/*   delta : grid spacings, delta[X] * delta[Y] weights the error             */
/*   error : running sum; the weighted |Phi - Phi0| of every updated cell is  */
/*           ADDED to the value already stored there                          */
void JacobiAlgorithm(MyData **const restrict Phi,
		     MyData **const restrict Phi0,
		     const MyData  *restrict delta,
                     const int               jbeg,
		     const int               jend,
		     const int               ibeg,
		     const int               iend,
		     MyData *const restrict  error)
{
  /* nothing to update: leave *error as it is */
  if ((jbeg > jend) || (ibeg > iend))
    return;

  /* keep the running sum in a local, seeded with the caller's value */
  MyData sum = *error;

  for (int row = jbeg ; row <= jend ; row++)
    {
      MyData       *const out    = Phi[row];
      const MyData *const centre = Phi0[row];
      const MyData *const prev   = Phi0[row - 1];
      const MyData *const next   = Phi0[row + 1];

      for (int col = ibeg ; col <= iend ; col++)
	{
	  out[col] = 0.25 * (centre[col - 1] + centre[col + 1] +
			     prev[col] + next[col]);

	  sum += (delta[X] * delta[Y] * fabs(out[col] - centre[col]));
	}
    }

  *error = sum;

  return;
}

/* Weighted L1 distance between two grids on the rectangle                   */
/* rows [jbeg, jend] x columns [ibeg, iend] (both ranges inclusive):         */
/*   *error = sum over the cells of delta[X] * delta[Y] * |Phi - Phi0|       */
/* Any previous content of *error is overwritten; neither grid is modified.  */
void get_error(MyData **const restrict Phi,
	       MyData **const restrict Phi0,
	       const MyData  *restrict delta,
	       const int               jbeg,
	       const int               jend,
	       const int               ibeg,
	       const int               iend,
	       MyData *const restrict  error)
{
  MyData sum = 0.0;

  for (int row = jbeg ; row <= jend ; row++)
    for (int col = ibeg ; col <= iend ; col++)
      sum += (delta[X] * delta[Y] * fabs(Phi[row][col] - Phi0[row][col]));

  *error = sum;

  return;
}

/* One Jacobi iteration with the halo exchange overlapped with computation.   */
/*   Phi      : output grid (new iterate)                                     */
/*   Phi0     : input grid (previous iterate); its ghost rows/columns are     */
/*              refreshed with the neighbours' border cells                   */
/*   error    : on return, the local weighted L1 change between Phi and Phi0  */
/*              over the owned cells (overwritten, not accumulated)           */
/*   delta    : grid spacings                                                 */
/*   ThisTask : local domain ranges, neighbour ranks and communicator         */
/* Steps: start the non-blocking exchange, update the cells that do not need  */
/* ghost data, wait for the exchange, update the border cells, compute the    */
/* local error.                                                               */
void Jacobi_Communication(MyData      **const restrict Phi,
			  MyData      **const restrict Phi0,
			  MyData       *const restrict error,
			  const MyData *      restrict delta,
			  Task         *const restrict ThisTask)
{
  /* inclusive ranges of the owned rows (direction X) and columns (direction Y) */
  const int row_beg = ThisTask->domain.local_start[X];
  const int row_end = ThisTask->domain.local_end[X];
  const int col_beg = ThisTask->domain.local_start[Y];
  const int col_end = ThisTask->domain.local_end[Y];

  /* custom datatype: one element per owned row, separated by the length of a */
  /* full local row (dim[Y] + 2)                                              */
  /* NOTE(review): this stride assumes Allocate_2DdblArray() stores the rows  */
  /* back to back in one contiguous buffer - confirm in tools.h               */
  MPI_Datatype column;
  MPI_Type_vector(ThisTask->domain.dim[X], 1, ThisTask->domain.dim[Y] + 2, MPI_MyDatatype, &column);
  /* commits the datatype */
  MPI_Type_commit(&column);

  const int data_row_size = ThisTask->domain.dim[Y];

  /* First task: issue the communication.                                     */
  /* Exchanges with MPI_PROC_NULL (domain edges) are no-ops, so the physical  */
  /* boundary values stored in the ghost cells are preserved.                 */

  /* MPI_Isendrecv() is part of the standard since MPI 4.0 */
#if MPI_VERSION >= 4

  enum { NREQUESTS = 4 };
  MPI_Request request[NREQUESTS];

  /* last owned row -> top neighbour ; bottom ghost row <- bottom neighbour */
  MPI_Isendrecv(&Phi0[row_end    ][col_beg    ], data_row_size, MPI_MyDatatype, ThisTask->nbrtop,    0,
		&Phi0[row_beg - 1][col_beg    ], data_row_size, MPI_MyDatatype, ThisTask->nbrbottom, 0,
		ThisTask->comm2d, &request[0]);

  /* first owned row -> bottom neighbour ; top ghost row <- top neighbour */
  MPI_Isendrecv(&Phi0[row_beg    ][col_beg    ], data_row_size, MPI_MyDatatype, ThisTask->nbrbottom, 1,
		&Phi0[row_end + 1][col_beg    ], data_row_size, MPI_MyDatatype, ThisTask->nbrtop,    1,
		ThisTask->comm2d, &request[1]);

  /* last owned column -> right neighbour ; left ghost column <- left neighbour */
  MPI_Isendrecv(&Phi0[row_beg    ][col_end    ], 1,             column,         ThisTask->nbrright,  2,
		&Phi0[row_beg    ][col_beg - 1], 1,             column,         ThisTask->nbrleft,   2,
		ThisTask->comm2d, &request[2]);

  /* first owned column -> left neighbour ; right ghost column <- right neighbour */
  MPI_Isendrecv(&Phi0[row_beg    ][col_beg    ], 1,             column,         ThisTask->nbrleft,   3,
		&Phi0[row_beg    ][col_end + 1], 1,             column,         ThisTask->nbrright,  3,
		ThisTask->comm2d, &request[3]);

#else

  enum { NREQUESTS = 8 };
  MPI_Request request[NREQUESTS];

  /* receives into the four ghost layers (the tag identifies the direction) */
  MPI_Irecv(&Phi0[row_beg - 1][col_beg    ], data_row_size, MPI_MyDatatype, ThisTask->nbrbottom, 0, ThisTask->comm2d, &request[0]);
  MPI_Irecv(&Phi0[row_end + 1][col_beg    ], data_row_size, MPI_MyDatatype, ThisTask->nbrtop   , 1, ThisTask->comm2d, &request[1]);
  MPI_Irecv(&Phi0[row_beg    ][col_beg - 1], 1,             column,         ThisTask->nbrleft,   2, ThisTask->comm2d, &request[2]);
  MPI_Irecv(&Phi0[row_beg    ][col_end + 1], 1,             column,         ThisTask->nbrright,  3, ThisTask->comm2d, &request[3]);

  /* matching sends of the four border layers */
  MPI_Isend(&Phi0[row_end    ][col_beg    ], data_row_size, MPI_MyDatatype, ThisTask->nbrtop,    0, ThisTask->comm2d, &request[4]);
  MPI_Isend(&Phi0[row_beg    ][col_beg    ], data_row_size, MPI_MyDatatype, ThisTask->nbrbottom, 1, ThisTask->comm2d, &request[5]);
  MPI_Isend(&Phi0[row_beg    ][col_end    ], 1,             column,         ThisTask->nbrright,  2, ThisTask->comm2d, &request[6]);
  MPI_Isend(&Phi0[row_beg    ][col_beg    ], 1,             column,         ThisTask->nbrleft,   3, ThisTask->comm2d, &request[7]);

#endif

  /**************************************** computation ****************************************/
  /* perform the computation with the local data, (i.e. ghost cells are not required) */
  /* so overlapping computation and communication */

  *error = 0.0;
  JacobiAlgorithm(Phi, Phi0, delta,
		  row_beg + 1, row_end - 1,
		  col_beg + 1, col_end - 1,
		  error);

  /*********************************************************************************************/

  /* wait the data on the boundaries */
  MPI_Waitall(NREQUESTS, request, MPI_STATUSES_IGNORE);

  /* border cells: the two border rows are swept over their full width, the  */
  /* two border columns without their end points, so that the corner cells   */
  /* are not computed a second time                                          */

  /* nbrbottom */
  JacobiAlgorithm(Phi, Phi0, delta,
		  row_beg, row_beg,
		  col_beg, col_end,
		  error);

  /* nbrtop */
  JacobiAlgorithm(Phi, Phi0, delta,
		  row_end, row_end,
		  col_beg, col_end,
		  error);

  /* nbrleft */
  JacobiAlgorithm(Phi, Phi0, delta,
		  row_beg + 1, row_end - 1,
		  col_beg,     col_beg,
		  error);

  /* nbrright */
  JacobiAlgorithm(Phi, Phi0, delta,
		  row_beg + 1, row_end - 1,
		  col_end,     col_end,
		  error);

  /* The value accumulated above is discarded: a sub-domain that is one cell */
  /* thick is swept by two of the border passes (double counting), and the   */
  /* summation order would depend on how the sweep is split. get_error()     */
  /* overwrites *error with a single row-major pass over the owned cells.    */
  get_error(Phi, Phi0, delta,
	    row_beg, row_end,
	    col_beg, col_end,
	    error);

  MPI_Type_free(&column);

  return;
}

void WriteSolutionParallel(MyData **const phi, Task *const ThisTask)
{
  /* 1. Every process sets up the subarray                     */
  /*    (i.e. its own domain whithout ghost cells)             */
  /*    We will use this as arguments to MPI_File_write(),     */
  /*    which allows the program to write the file in parallel */
  MPI_Datatype subDomain;
  /* (local) sub-domain size */
  const int lsize[NDIM]  = {ThisTask->domain.dim[X], ThisTask->domain.dim[Y]};
  /* entire domain (i.e. with ghost cells) */
  const int domain[NDIM] = {lsize[X] + 2 * NGHOST, lsize[Y] + 2 * NGHOST};
  /* (local) starting coordinates of the subDomain in each dimension */
  const int lstart[NDIM] = {ThisTask->domain.local_start[X], ThisTask->domain.local_start[Y]};
  MPI_Type_create_subarray(NDIM, domain, lsize, lstart, MPI_ORDER_C, MPI_MyDatatype, &subDomain);
  MPI_Type_commit(&subDomain);

  /* 2. The file view must be set by creating a second subarray datatype, */
  /*    defining the process' view on the file.                           */  
  MPI_Datatype ProcessDomain;
  /* (global) domain size */
  const int gsize[NDIM] = {NX_GLOB, NY_GLOB};
  /* (global) starting coordinates of the subDomain in each dimension */
  const int gstart[NDIM] = {ThisTask->domain.global_start[X] - 1, ThisTask->domain.global_start[Y] - 1};
  MPI_Type_create_subarray(NDIM, gsize, lsize, gstart, MPI_ORDER_C, MPI_MyDatatype, &ProcessDomain);
  MPI_Type_commit(&ProcessDomain);

  /* 3.a Open file for writing */
  static int nfile = 0;
  char fname[128];
  sprintf(fname,"jacobi_2D_mpi_comp_comm_io_%02d.bin", nfile);
  /* delete the file if already exists */
  /* MPI_File_delete(fname, MPI_INFO_NULL); */
  MPI_File fh;
  MPI_File_open(ThisTask->comm2d, fname,
		MPI_MODE_CREATE | MPI_MODE_WRONLY,
		MPI_INFO_NULL, &fh);

  
  /* 3.b Set the process’s view of the data in the file */
  MPI_File_set_view(fh, 0, MPI_MyDatatype, ProcessDomain, "native", MPI_INFO_NULL);

  /* 3.c Writes a file starting at the locations specified by individual file pointers */
  /*      (blocking collective)                                                        */
  MPI_File_write_all(fh, phi[0], 1, subDomain, MPI_STATUS_IGNORE);

  /* 3.d Close a file (collective). */
  MPI_File_close(&fh);
  
  /* free the MPI_Datatype */
  MPI_Type_free(&subDomain);
  MPI_Type_free(&ProcessDomain);
  
  return;
}
