/* ///////////////////////////////////////////////////////////////////// */
/* Authors:  A. Mignone (mignone@to.infn.it)                             */
/*           V. Cesare  (valentina.cesare@inaf.it)                       */
/*           D. Goz     (david.goz@inaf.it)                              */
/*                                                                       */
/* Date   : June 2024                                                    */
/*                                                                       */
/* ///////////////////////////////////////////////////////////////////// */

#include "allvars.h"
#include "tools.h"
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

/* #define DEBUG */

/* Range of grid rows handled by one MPI process.
   Both ends are inclusive: the rows are start, start + 1, ..., end. */
typedef struct MyRows {
  int start; /* index of the first row of the slab */
  int end;   /* index of the last row of the slab  */
} myDomain;

/* function prototypes */
void BoundaryConditions(MyData **const restrict,
			MyData  *const restrict,
			MyData  *const restrict,
			const int,
			const int);

void JacobiAlgorithm(MyData **const restrict, MyData **const restrict, MyData *const restrict,
		     const MyData *const restrict, const int, const int, const int, const int);

void get_domains(MyData  **const buffer,
		 myDomain *const domain,
		 const int       grid_dim,
		 const int       rank,
		 const int       nranks,
		 const MPI_Comm  comm);

void mpi_exchange_1d(MyData **const buffer,
		     const int      n,
		     const int      nbrtop,
		     const int      nbrbottom,
		     const int      start,
		     const int      end,
		     const MPI_Comm comm1d);

void WriteSolution(MyData **const phi, const int nx, const int ny);

void copy_grids(MyData **const restrict A,
                MyData **const restrict B,
                const int               xbeg,
                const int               xend,
                const int               ybeg,
		const int               yend);

int main(int argc, char **argv)
{
  int rank, Nranks;
  const MPI_Comm comm = MPI_COMM_WORLD;
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &Nranks);

  if ((argc <= 1) && !rank)
    {
      printf("\n\t Usage: <executable> <grid_size> \n\n");
      MPI_Abort(comm, EXIT_FAILURE);
      exit(EXIT_FAILURE);
    }

  /* global X and Y grid size (square matrix supposed) */
  const int NX_GLOB = (int) strtol(argv[1], NULL, 10);
  const int NY_GLOB = NX_GLOB;

  /******************************************************************************************************/
  /* 1D MPI-decomposition:
     the physical domain is sliced into slabs along the vertical direction,
     while the grids are replicated across the MPI processes.
     This approach is not the most efficient in terms of memory usage,
     because the arrays are replicated across MPI process instead of to be distributed */

  /* get the reminder, i.e. take into account uneven
     decomposition of points among the processes    */
  const int rem = (NX_GLOB + 2) % Nranks;

  /* get the amount of data for each MPI process:
     - chunk is supposed to be >= 1 */
  const int chunk = (NX_GLOB + 2 - rem) / Nranks;
  if (chunk < 1)
    {
      printf("\n\t chunk < 1 ... aborting ...[rank: %d]\n", rank);
      MPI_Abort(comm, EXIT_FAILURE);
      exit(EXIT_FAILURE);
    }
  
  /* get the slab dimension along vertical direction: */
  int incr, offset;
  if (rank < rem)
    {
      incr = chunk + 1;
      offset = 0;
    }
  else
    {
      incr = chunk;
      offset = rem;
    }

  myDomain domain;
  domain.start = ((rank * incr) + offset);
  domain.end   = (domain.start + incr) - 1;
  /* boundaries */
  domain.start = ((domain.start == 0)         ? NGHOST  : domain.start);
  domain.end   = ((domain.end == NX_GLOB + 1) ? NX_GLOB : domain.end);

  /* rank of the top process */
  const int nbrtop    = (((rank + 1) >= Nranks) ? MPI_PROC_NULL : (rank + 1));
  /* rank of the bottom process */
  const int nbrbottom = (((rank - 1) < 0)       ? MPI_PROC_NULL : (rank - 1));

  /******************************************************************************************************/
  
  const MyData xbeg = 0.0;
  const MyData xend = 1.0;
  const MyData ybeg = 0.0;
  const MyData yend = 1.0;

  const MyData delta[NDIM] = {(xend - xbeg)/(NX_GLOB + 1),
			      (yend - ybeg)/(NY_GLOB + 1)};

  /* --------------------------------------------------------
     1. Set grid indices
     -------------------------------------------------------- */
    
  const int ibeg   = NGHOST;
  const int iend   = ibeg + NX_GLOB - 1;
  const int nx     = iend - ibeg + 1;
  const int nx_tot = nx + 2 * NGHOST;
    
  const int jbeg   = NGHOST;
  const int jend   = jbeg + NY_GLOB - 1;
  const int ny     = jend - jbeg + 1;
  const int ny_tot = ny + 2 * NGHOST;

  if (rank == MASTER)
    {
      printf("\n\t Grid indices:");
      printf("\n\t\t ibeg, iend = %d, %d; nx_tot = %d"    ,ibeg, iend, nx_tot);
      printf("\n\t\t jbeg, jend = %d, %d; ny_tot = %d\n\n",jbeg, jend, ny_tot);
    }

#if defined(DEBUG)
  for (int task=0 ; task<Nranks ; task++)
    {
      MPI_Barrier(comm);
      
      if (task == rank)
	{
	  printf("\n\t rank: %d", rank);
	  printf("\n\t\t domain.start: %d - domain.end: %d", domain.start, domain.end);
	  printf("\n\t\t nbrtop: %d - nbrbottom: %d \n", nbrtop, nbrbottom);
	  fflush(stdout);
	}
    }

  MPI_Barrier(comm);
#endif /* DEBUG */
  
  /* --------------------------------------------------------
     2. Generate grid, allocate memory
        Not optimized because the grids are (unnecessarily)
	replicated across MPI processes
     -------------------------------------------------------- */

  /* memory allocation */
  MyData *xg = (MyData *) malloc((NX_GLOB + 2*NGHOST) * sizeof(MyData));
  MyData *yg = (MyData *) malloc((NY_GLOB + 2*NGHOST) * sizeof(MyData));
  assert((xg != NULL) && (yg != NULL));

  /* initial conditions */
  for (int i=0 ; i<(NX_GLOB + 2*NGHOST) ; i++) xg[i] = xbeg + (i - ibeg + 1) * delta[X];
  for (int j=0 ; j<(NY_GLOB + 2*NGHOST) ; j++) yg[j] = ybeg + (j - jbeg + 1) * delta[Y];
  MyData *x = xg; /* Global and local grids are the same  */
  MyData *y = yg; /* for serial version of the code       */

  /* grids memory allocation */
  MyData **phi  = Allocate_2DdblArray(ny_tot, nx_tot);
  MyData **phi0 = Allocate_2DdblArray(ny_tot, nx_tot);
    
  /* --------------------------------------------------------
     3. Initialize solution array to 0
     -------------------------------------------------------- */
    
  for (int j=jbeg ; j<=jend ; j++)
    for (int i=ibeg ; i<=iend ; i++)
      {
	phi0[j][i] = 0.0;
	phi[j][i]  = 0.0;
      }
  
  /* --------------------------------------------------------
     4. Main iteration cycle
     -------------------------------------------------------- */

  const double time_start = MPI_Wtime();
  
  /* -- 4a. Set boundary conditions first -- */  
  BoundaryConditions(phi0, x, y, nx, ny);
  BoundaryConditions(phi, x, y, nx, ny);

  MyData err = 1.0;
  /* iterations */
  int k = 0;
  while (1)
    {      
      /* -- 4c. Jacobi's method and residual (interior points) -- */
      /*       core algorithm                                     */

      err = 0.0;
      JacobiAlgorithm(phi, phi0, &err, delta,
		      ibeg, iend, domain.start, domain.end);

      if (!rank)
	printf("\n\t Iteration = %d - err = %lg\n",k, err);

      /* increase the counter of loop iterations */
      k++;

      /* get the total error */
      MyData toterr;
      /* combines values from all processes and distributes the result back to all processes */
      MPI_Allreduce(&err, &toterr, 1, MPI_MyDatatype, MPI_SUM, comm);
      
      /* check convergence */
      if (toterr <= TOL)
	{
	  /* master task gathers all the domains */
	  get_domains(phi, &domain, NY_GLOB, rank, Nranks, comm);

	  break;
	}

      /* -- 4b. MPI communications */
      mpi_exchange_1d(phi, NX_GLOB,
		      nbrtop, nbrbottom,
		      domain.start, domain.end, comm);

      /* swap the pointers */
      MyData **tmp = phi;
      phi = phi0;
      phi0 = tmp;
    }

  /* master rank writes the solution */
  if (!rank)
    {
      WriteSolution(phi, nx, ny);
      
      printf("\n\t NX_GLOB x NY_GLOB = %d x %d\n", NX_GLOB, NY_GLOB);
      printf("\n\t Time = %lf [s]\n\n", MPI_Wtime() - time_start);
    }

  // free memory
  if (phi0)
    {
      free(phi0[0]);
      free(phi0);
    }

  if (phi)
    {
      free(phi[0]);
      free(phi);
    }

  if (yg)
    free(yg);
  
  if (xg)
    free(xg);

  MPI_Finalize();
  
  return 0;
}

/* ********************************************************************* */
void BoundaryConditions(MyData **const restrict phi,
			MyData  *const restrict x,
			MyData  *const restrict y,
                        const int               nx,
			const int               ny)
/*
 * Set the boundary values on the frame surrounding the interior points.
 *
 * phi  : grid, indexed as phi[row j][column i]; only the four edges
 *        adjacent to the interior are written (corners are not touched)
 * x, y : coordinates of the columns and of the rows, indexed like phi
 * nx   : number of interior columns
 * ny   : number of interior rows
 *
 * Boundary values:
 *   left   column: 1 - y      right column: y * y
 *   bottom row   : 1 - x      top   row   : x
 *********************************************************************** */
{
  const int ibeg = NGHOST;
  const int iend = ibeg + nx - 1;

  const int jbeg = NGHOST;
  const int jend = jbeg + ny - 1;

  /* -- Left -- */
  const int ileft = ibeg - 1;
  for (int j=jbeg ; j<=jend ; j++)
    phi[j][ileft] = (1.0 - y[j]);

  /* -- Right --
     the column index is derived from iend (the original code used jend,
     which is correct only when nx == ny) */
  const int iright = iend + 1;
  for (int j=jbeg ; j<=jend ; j++)
    phi[j][iright] = (y[j] * y[j]);

  /* -- Bottom -- */
  const int jbottom = jbeg - 1;
  for (int i=ibeg ; i<=iend ; i++)
    phi[jbottom][i] = (1.0 - x[i]);

  /* -- Top -- */
  const int jtop = jend + 1;
  for (int i=ibeg ; i<=iend ; i++)
    phi[jtop][i] = x[i];

  return;
}

/* ********************************************************************* */

/*
 * One Jacobi sweep over the rectangle [jbeg, jend] x [ibeg, iend].
 *
 * Phi   : output grid; every point of the rectangle is set to the average
 *         of its four neighbours (left, right, below, above) in Phi0
 * Phi0  : input grid (previous iterate), read only
 * error : on return, the sum over the rectangle of
 *         delta[X] * delta[Y] * |Phi - Phi0|
 * delta : grid spacings, indexed by X and Y
 */
void JacobiAlgorithm(MyData      **const restrict Phi,
		     MyData      **const restrict Phi0,
		     MyData       *const restrict error,
		     const MyData *const restrict delta,
		     const int                    ibeg,
		     const int                    iend,
		     const int                    jbeg,
		     const int                    jend)
{
  /* weight of a single grid point in the error sum */
  const MyData cell_area = delta[X] * delta[Y];

  MyData accumulated = 0.0;

  for (int row = jbeg; row <= jend; row++)
    {
      for (int col = ibeg; col <= iend; col++)
        {
          const MyData *const here  = Phi0[row];
          const MyData *const below = Phi0[row - 1];
          const MyData *const above = Phi0[row + 1];

          /* same summation order as left + right + below + above */
          const MyData updated = 0.25 * (here[col - 1] + here[col + 1] +
                                         below[col] + above[col]);
          Phi[row][col] = updated;

          accumulated += cell_area * fabs(updated - here[col]);
        } /* loop over columns */
    } /* loop over rows */

  *error = accumulated;
}

/* ********************************************************************* */
/*
 * Exchange of the rows at the edges of the local slab, performed with
 * blocking point-to-point calls. Called by every MPI rank.
 *
 * buffer    : grid, indexed as buffer[row][column]
 * n         : number of values transferred per row, starting at column 1
 * nbrtop    : rank of the process with rank + 1
 * nbrbottom : rank of the process with rank - 1
 * start,end : first and last row owned by the calling rank
 * comm1d    : communicator used for the exchange
 *
 * A missing neighbour can be passed as MPI_PROC_NULL (main does so for
 * the first and the last rank).
 */
void mpi_exchange_1d(MyData **const buffer,
		     const int      n,
		     const int      nbrtop,
		     const int      nbrbottom,
		     const int      start,
		     const int      end,
		     const MPI_Comm comm1d)
{
  /* one tag for each direction of travel */
  enum { TAG_TO_TOP = 0, TAG_TO_BOTTOM = 1 };

  /* rows involved in the exchange (first ghost column skipped) */
  MyData *const last_owned   = &buffer[end][1];
  MyData *const ghost_below  = &buffer[start - 1][1];
  MyData *const first_owned  = &buffer[start][1];
  MyData *const ghost_above  = &buffer[end + 1][1];

  /* Stage 1: the last owned row goes to the top neighbour, while the
     row just below the slab is filled by the bottom neighbour */
  MPI_Send(last_owned, n, MPI_MyDatatype, nbrtop, TAG_TO_TOP, comm1d);
  MPI_Recv(ghost_below, n, MPI_MyDatatype,
           nbrbottom, TAG_TO_TOP, comm1d, MPI_STATUS_IGNORE);

  /* Stage 2: the first owned row goes to the bottom neighbour, while the
     row just above the slab is filled by the top neighbour */
  MPI_Send(first_owned, n, MPI_MyDatatype, nbrbottom, TAG_TO_BOTTOM, comm1d);
  MPI_Recv(ghost_above, n, MPI_MyDatatype,
           nbrtop, TAG_TO_BOTTOM, comm1d, MPI_STATUS_IGNORE);
}

/* ********************************************************************* */

/* Release the scratch array of get_domains (NULL is fine) and abort the run. */
static void get_domains_abort(myDomain *const boundaries, const MPI_Comm comm)
{
  free(boundaries);
  MPI_Abort(comm, EXIT_FAILURE);
  exit(EXIT_FAILURE);
}

/*
 * The master process gathers the slabs computed by all the other processes.
 *
 * buffer   : grid, indexed as buffer[row][column]; on the master the rows
 *            owned by the other ranks are overwritten with the received data
 * domain   : rows [start, end] owned by the calling rank
 * grid_dim : number of interior points per row; each transferred row holds
 *            grid_dim + 2 values
 * rank     : rank of the calling process
 * nranks   : number of processes in comm
 * comm     : communicator
 *
 * Whole rows are transferred with a single message per rank.
 * NOTE(review): this relies on consecutive rows of buffer being contiguous
 * in memory and on a row being grid_dim + 2 values wide (i.e. one ghost
 * column per side) - confirm against Allocate_2DdblArray and NGHOST.
 *
 * Failures are detected through the return codes of the MPI calls: the
 * MPI_ERROR field of a status is not reliably set by MPI_Recv, so testing
 * it (as the previous version did) may read an indeterminate value.
 * Any failure aborts the whole run.
 */
void get_domains(MyData  **const buffer,
		 myDomain *const domain,
		 const int       grid_dim,
		 const int       rank,
		 const int       nranks,
		 const MPI_Comm  comm)
{
  const int domain_bytes = (int) sizeof(myDomain);

  if (rank != MASTER)
    {
      /* 1. tell the master which rows are going to be sent */
      if (MPI_Send((void *)domain, domain_bytes, MPI_BYTE, MASTER, 0, comm) != MPI_SUCCESS)
        get_domains_abort(NULL, comm);

      /* 2. send the rows (ghost columns included) */
      const int nrows    = (domain->end - domain->start + 1);
      const int elements = (nrows * (grid_dim + 2));
      if (MPI_Send((void *)&buffer[domain->start][0], elements, MPI_MyDatatype,
                   MASTER, 0, comm) != MPI_SUCCESS)
        get_domains_abort(NULL, comm);

      return;
    }

  /************************ MASTER: get the domain boundaries from each process ************************/
  myDomain *const boundaries = (myDomain *)malloc((size_t)nranks * sizeof(*boundaries));
  if (boundaries == NULL)
    get_domains_abort(NULL, comm);

  boundaries[MASTER].start = domain->start;
  boundaries[MASTER].end   = domain->end;

  for (int task=0 ; task<nranks ; task++)
    {
      if (task != MASTER)
        {
          if (MPI_Recv((void *)&boundaries[task], domain_bytes, MPI_BYTE,
                       task, 0, comm, MPI_STATUS_IGNORE) != MPI_SUCCESS)
            get_domains_abort(boundaries, comm);
        }
#if defined(DEBUG)
      printf("\n\t Diplacements[%d].start = %d - Diplacements[%d].end = %d",
             task, boundaries[task].start, task, boundaries[task].end);
#endif /* DEBUG */
    } /* loop over nranks */

#if defined(DEBUG)
  printf("\n");
  fflush(stdout);
#endif /* DEBUG */

  /***************************** MASTER: get the domain from each process ******************************/
  /* Both messages of a rank carry the same tag: they are matched in the
     order in which they were sent, so the boundaries always come first. */
  for (int task=0 ; task<nranks ; task++)
    {
      if (task == MASTER)
        continue;

      /* number of grid points to receive (including ghost points) */
      const int nrows    = (boundaries[task].end - boundaries[task].start + 1);
      const int elements = (nrows * (grid_dim + 2));
      if (MPI_Recv((void *)&buffer[boundaries[task].start][0], elements, MPI_MyDatatype,
                   task, 0, comm, MPI_STATUS_IGNORE) != MPI_SUCCESS)
        get_domains_abort(boundaries, comm);
    } /* loop over nranks */

  free(boundaries);

  return;
}

/* ********************************************************************* */
void WriteSolution(MyData **const phi,
		   const int      nx,
		   const int      ny)
/*
 * Write the interior points of phi to a binary file.
 *
 * phi : grid, indexed as phi[row][column]
 * nx  : number of interior columns written per row
 * ny  : number of interior rows written
 *
 * The file is named jacobi2D_mpi_send_recv_blocking_NN.bin, where NN is a
 * counter incremented after every file that could be opened. The rows are
 * written one after the other, without the ghost points.
 * I/O failures are reported on stderr; the function then returns without
 * terminating the program.
 *********************************************************************** */
{
  const int ibeg = NGHOST;
  const int jbeg = NGHOST;
  const int jend = jbeg + ny - 1;

  static int nfile = 0;  /* File counter */

  /* The formatted name needs at least 39 bytes (38 characters plus the
     terminator), so the former 32-byte buffer overflowed. snprintf
     guarantees the bound whatever the value of the counter. */
  char fname[64];
  const int len = snprintf(fname, sizeof(fname),
                           "jacobi2D_mpi_send_recv_blocking_%02d.bin", nfile);
  if ((len < 0) || ((size_t)len >= sizeof(fname)))
    {
      fprintf(stderr, "> Cannot build the output file name\n");
      return;
    }

  printf ("> Writing %s\n",fname);
  FILE *fp = fopen(fname, "wb");
  if (fp == NULL)
    {
      fprintf(stderr, "> Cannot open %s for writing\n", fname);
      return;
    }

  /* a non-positive nx writes nothing instead of wrapping to a huge count */
  const size_t count = ((nx > 0) ? (size_t)nx : 0);

  /* discard boundaries */
  for (int j=jbeg ; j<=jend ; j++)
    {
      if (fwrite (phi[j] + ibeg, sizeof(MyData), count, fp) != count)
        {
          fprintf(stderr, "> Error while writing row %d of %s\n", j, fname);
          break;
        }
    }

  nfile++;

  /* fclose flushes the buffered data: a failure means an incomplete file */
  if (fclose(fp) != 0)
    fprintf(stderr, "> Error while closing %s\n", fname);
}
