From 2fb621e00f30da096cb7f32ba3012d911b36426f Mon Sep 17 00:00:00 2001
From: David Goz <david.goz@inaf.it>
Date: Thu, 20 Jun 2024 19:27:16 +0200
Subject: [PATCH] comp_comm debugging phase

---
 jacobi/mpi/comp_comm/Makefile                 |   3 +
 jacobi/mpi/comp_comm/include/allvars.h        |   2 +-
 jacobi/mpi/comp_comm/include/tools.h          |   2 +-
 jacobi/mpi/comp_comm/make.def                 |   2 +-
 .../comp_comm/src/jacobi_2D_mpi_comp_comm.c   | 384 ++++++++++--------
 jacobi/mpi/comp_comm/src/tools.c              |   2 +-
 6 files changed, 224 insertions(+), 171 deletions(-)

diff --git a/jacobi/mpi/comp_comm/Makefile b/jacobi/mpi/comp_comm/Makefile
index 02695b4..9a158f6 100644
--- a/jacobi/mpi/comp_comm/Makefile
+++ b/jacobi/mpi/comp_comm/Makefile
@@ -18,6 +18,7 @@ info:
 	@echo ' '
 	@echo '-----------------------------------------------------------------------------------------'
 	@echo '$$ make                     ---> compile the mpi application                             '
+	@echo '$$ make debug               ---> compile the mpi application for debugging               '
 	@echo '$$ make valgrind_memcheck   ---> run the mpi application using Valgrind under Memcheck   '
 	@echo '$$ make valgrind_callgrind  ---> run the mpi application using Valgrind under Callgrind  '
 	@echo '$$ make valgrind_cachegrind ---> run the mpi application using Valgrind under Cachegrind '
@@ -28,6 +29,8 @@ info:
 
 mpi: $(PROG)
 
+debug: $(PROG_DEBUG)
+
 valgrind_memcheck: $(PROG_MEMCHECK)
 	@echo 'oooOOO... valgrind_memcheck ...OOOooo'
 	mpirun -n 4 valgrind --tool=memcheck -s --default-suppressions=yes --log-file=valgrind_memcheck_log_%p.txt ./$< 9 2
diff --git a/jacobi/mpi/comp_comm/include/allvars.h b/jacobi/mpi/comp_comm/include/allvars.h
index 798e4e6..f6f93c1 100644
--- a/jacobi/mpi/comp_comm/include/allvars.h
+++ b/jacobi/mpi/comp_comm/include/allvars.h
@@ -8,7 +8,7 @@
 
 #include <mpi.h>
 
-#define MASTER 0
+#define MASTERTASK 0
 
 #if defined(SINGLE_PRECISION)
 typedef float MyData;
diff --git a/jacobi/mpi/comp_comm/include/tools.h b/jacobi/mpi/comp_comm/include/tools.h
index 82859ec..0167c89 100644
--- a/jacobi/mpi/comp_comm/include/tools.h
+++ b/jacobi/mpi/comp_comm/include/tools.h
@@ -10,7 +10,7 @@
 
 /* function prototypes */
 MyData **Allocate_2DdblArray(const int nx, const int ny);
-void Show_2DdblArray(const MyData **const A,
+void Show_2DdblArray(      MyData **const A,
                      const int            nx,
                      const int            ny,
                      const char *const    string);
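
Note on the prototype change above: in C, an argument of type MyData ** does not implicitly convert to a const MyData ** parameter (the conversion could be used to launder away const), so the stricter prototype forced an incompatible-pointer-type warning at every call site. A minimal sketch of the accepted form, with a stand-in show() function:

    /* Minimal sketch of why the const was dropped: a MyData ** argument
       does not convert to const MyData **, so the old prototype required
       a cast at every call site. */
    #include <stdio.h>

    typedef double MyData;

    static void show(MyData **const A, const int nx, const int ny)
    {
      for (int i = 0; i < nx; i++)
        for (int j = 0; j < ny; j++)
          printf("A[%d][%d] = %g\n", i, j, A[i][j]);
    }

    int main(void)
    {
      MyData row[2] = {1.0, 2.0};
      MyData *A[1]  = {row};
      show(A, 1, 2);  /* fine with MyData **const; the old const MyData **
                         prototype would have needed an explicit cast */
      return 0;
    }
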
diff --git a/jacobi/mpi/comp_comm/make.def b/jacobi/mpi/comp_comm/make.def
index 082a947..bacfcdd 100644
--- a/jacobi/mpi/comp_comm/make.def
+++ b/jacobi/mpi/comp_comm/make.def
@@ -21,7 +21,7 @@ $(PROG): $(DEPENDENCIES)
 	@echo ' '
 
 $(PROG_DEBUG): $(DEPENDENCIES)
-	$(CC) $(CFLAGS) -Og -ggdb3 -fno-omit-frame-pointer -I./include I$(MPI_INC) $(SOURCES) -o $@ -L$(MPI_LIB) $(LIBS)
+	$(CC) $(CFLAGS) -Og -ggdb3 -fno-omit-frame-pointer -I./include -I$(MPI_INC) $(SOURCES) -o $@ -L$(MPI_LIB) $(LIBS)
 	@echo ' '
 	@echo 'Program' $(PROG_DEBUG) 'compiled for' $(SYSTYPE) 'machine'
 	@echo ' '
diff --git a/jacobi/mpi/comp_comm/src/jacobi_2D_mpi_comp_comm.c b/jacobi/mpi/comp_comm/src/jacobi_2D_mpi_comp_comm.c
index 234d06a..bbd6ffb 100644
--- a/jacobi/mpi/comp_comm/src/jacobi_2D_mpi_comp_comm.c
+++ b/jacobi/mpi/comp_comm/src/jacobi_2D_mpi_comp_comm.c
@@ -14,12 +14,17 @@
 #include <math.h>
 #include <string.h>
 
-/* #define DEBUG */
+#define DEBUG
+
+static int NX_GLOB, NY_GLOB;
+MyData **global_phi;
 
 typedef struct MyGrid
 {
-  int start[NDIM];
-  int end[NDIM];
+  int local_start[NDIM];
+  int local_end[NDIM];
+  int global_start[NDIM];
+  int global_end[NDIM];
   int dim[NDIM];
 } myDomain;
 
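The struct now carries both coordinate systems: global_start/global_end locate the subdomain inside the NX_GLOB x NY_GLOB interior grid (used when gathering), while local_start/local_end index the task's own padded allocation, whose interior always starts at 1 because index 0 is a ghost cell. A small sketch of the implied mapping (the sample extents are hypothetical):

    /* Sketch of the index conventions implied by myDomain: the local
       interior runs 1..dim[d], ghost cells sit at 0 and dim[d]+1, and
       the matching global index is global_start[d] + (local - 1). */
    #include <stdio.h>

    enum {X, Y, NDIM};

    typedef struct MyGrid {
      int local_start[NDIM], local_end[NDIM];
      int global_start[NDIM], global_end[NDIM];
      int dim[NDIM];
    } myDomain;

    static int local_to_global(const myDomain *d, const int dir, const int local)
    {
      return d->global_start[dir] + (local - d->local_start[dir]);
    }

    int main(void)
    {
      /* e.g. a task owning global rows 5..8: dim = 4, local rows 1..4 */
      myDomain d = {.local_start  = {1, 1}, .local_end  = {4, 4},
                    .global_start = {5, 5}, .global_end = {8, 8},
                    .dim = {4, 4}};
      printf("local 1 -> global %d\n", local_to_global(&d, X, 1)); /* 5 */
      printf("local 4 -> global %d\n", local_to_global(&d, X, 4)); /* 8 */
      return 0;
    }
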
@@ -34,14 +39,14 @@ typedef struct Task_2D_Cartesian
   int nbrbottom;
   int nbrleft;
   int nbrright;
+  MPI_Comm comm2d;
 } Task;
 
 /* function prototypes */
 void BoundaryConditions(MyData **const restrict,
 			MyData  *const restrict,
 			MyData  *const restrict,
-			const int,
-			const int);
+			Task    *const restrict);
 
 void JacobiAlgorithm(MyData **const restrict, MyData **const restrict, const MyData *restrict,
 		     const int, const int, const int, const int, MyData *const restrict);
@@ -50,13 +55,10 @@ void Jacobi_Communication(MyData      **const restrict Phi,
 			  MyData      **const restrict Phi0,
 			  MyData       *const restrict error,
 			  const MyData *      restrict delta,
-			  const int                    grid_y,
-			  Task         *const restrict ThisTask,
-			  MPI_Comm     *const restrict comm);
+			  Task         *const restrict ThisTask);
 
 void get_domains(MyData  **const buffer,
-		 Task     *const ThisTask,
-		 const MPI_Comm  comm);
+		 Task     *const ThisTask);
 
 void WriteSolution(MyData **const phi, const int nx, const int ny);
 
@@ -76,8 +78,8 @@ int main(int argc, char **argv)
     }
 
   /* global X and Y grid size (square matrix assumed) */
-  const int NX_GLOB = (int) strtol(argv[1], NULL, 10);
-  const int NY_GLOB = NX_GLOB;
+  NX_GLOB = (int) strtol(argv[1], NULL, 10);
+  NY_GLOB = NX_GLOB;
 
   /********************************** Cartesian topology ******************************************************/
   const int cartesian_grid_x = (int)strtol(argv[2], NULL, 10);
@@ -99,9 +101,10 @@ int main(int argc, char **argv)
                                     /* neighbors in the actual hardware for better performance */
 
   /* make a new communicator to which topology information has been attached */
-  MPI_Comm comm2d = MPI_COMM_NULL;
-  MPI_Cart_create(MPI_COMM_WORLD, NDIM, dims, periods, reorder, &comm2d);
-  if (comm2d == MPI_COMM_NULL)
+  Task ThisTask;
+  ThisTask.comm2d = MPI_COMM_NULL;
+  MPI_Cart_create(MPI_COMM_WORLD, NDIM, dims, periods, reorder, &ThisTask.comm2d);
+  if (ThisTask.comm2d == MPI_COMM_NULL)
     {
       printf("\n\t Process %d is not part of the new communicator \n", rank);
       fflush(stdout);
@@ -109,22 +112,22 @@ int main(int argc, char **argv)
       exit(EXIT_FAILURE);
     }
 
-  Task ThisTask;
+
 
   /* get the comm size */
-  MPI_Comm_size(comm2d, &ThisTask.nranks);
+  MPI_Comm_size(ThisTask.comm2d, &ThisTask.nranks);
   
   /* determine the process coords in cartesian topology given rank in group */
-  MPI_Cart_coords(comm2d, rank, NDIM, ThisTask.coords);
+  MPI_Cart_coords(ThisTask.comm2d, rank, NDIM, ThisTask.coords);
 
   /* determines process rank in communicator given Cartesian location */
-  MPI_Cart_rank(comm2d, ThisTask.coords, &ThisTask.rank);
+  MPI_Cart_rank(ThisTask.comm2d, ThisTask.coords, &ThisTask.rank);
   
   /* get bottom and top neighbors (X direction) */
-  MPI_Cart_shift(comm2d, X, 1, &ThisTask.nbrbottom, &ThisTask.nbrtop);
+  MPI_Cart_shift(ThisTask.comm2d, X, 1, &ThisTask.nbrbottom, &ThisTask.nbrtop);
 
   /* get left and right neighbors (Y direction) */
-  MPI_Cart_shift(comm2d, Y, 1, &ThisTask.nbrleft, &ThisTask.nbrright);
+  MPI_Cart_shift(ThisTask.comm2d, Y, 1, &ThisTask.nbrleft, &ThisTask.nbrright);
   
   /************************************************************************************************************/
   /* 2D MPI-cartesian decomposition:
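
For reference, the calls now threaded through ThisTask follow the standard Cartesian-topology sequence: create the communicator, recover this task's coordinates and rank in it, then one MPI_Cart_shift per direction to obtain the four neighbours (MPI_PROC_NULL on physical boundaries, since periods is false). A standalone sketch of that sequence, assuming a fixed 2x2 grid of 4 ranks rather than the argv-driven dims used here:

    /* Standalone sketch of the topology setup above; run with exactly
       4 ranks (dims 2x2 is illustrative, the patch derives dims from argv). */
    #include <mpi.h>
    #include <stdio.h>

    int main(int argc, char **argv)
    {
      MPI_Init(&argc, &argv);

      int dims[2] = {2, 2}, periods[2] = {0, 0}, coords[2];
      MPI_Comm comm2d = MPI_COMM_NULL;
      MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 1, &comm2d);
      if (comm2d == MPI_COMM_NULL)     /* rank not in the new topology */
        {
          MPI_Finalize();
          return 0;
        }

      int rank, nbrbottom, nbrtop, nbrleft, nbrright;
      MPI_Comm_rank(comm2d, &rank);
      MPI_Cart_coords(comm2d, rank, 2, coords);
      MPI_Cart_shift(comm2d, 0, 1, &nbrbottom, &nbrtop);   /* X */
      MPI_Cart_shift(comm2d, 1, 1, &nbrleft,   &nbrright); /* Y */

      printf("rank %d (%d,%d): bottom=%d top=%d left=%d right=%d\n",
             rank, coords[0], coords[1], nbrbottom, nbrtop, nbrleft, nbrright);

      MPI_Comm_free(&comm2d);
      MPI_Finalize();
      return 0;
    }
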
@@ -144,7 +147,7 @@ int main(int argc, char **argv)
   if ((chunk[X] < 1) || (chunk[Y] < 1))
     {
       printf("\n\t chunk[X] < 1 || chunk[Y] < 1 ... aborting ...[rank: %d]\n", rank);
-      MPI_Comm_free(&comm2d);
+      MPI_Comm_free(&ThisTask.comm2d);
       MPI_Abort(comm, EXIT_FAILURE);
       exit(EXIT_FAILURE);
     }
@@ -168,15 +171,20 @@ int main(int argc, char **argv)
   /* subdomain managed by the task */
   for (int dim=0 ; dim<NDIM ; dim++)
     {
-      ThisTask.domain.start[dim] = ((ThisTask.coords[dim] * incr[dim]) + offset[dim]);
-      ThisTask.domain.end[dim]   = (ThisTask.domain.start[dim] + incr[dim]) - 1;
+      ThisTask.domain.global_start[dim] = ((ThisTask.coords[dim] * incr[dim]) + offset[dim]);
+      ThisTask.domain.global_end[dim]   = (ThisTask.domain.global_start[dim] + incr[dim]) - 1;
 
       /* boundaries */
-      ThisTask.domain.start[dim] = ((ThisTask.domain.start[dim] == 0)         ? NGHOST  : ThisTask.domain.start[dim]);
-      ThisTask.domain.end[dim]   = ((ThisTask.domain.end[dim] == NX_GLOB + 1) ? NX_GLOB : ThisTask.domain.end[dim]);
+      ThisTask.domain.global_start[dim] = ((ThisTask.domain.global_start[dim] == 0)         ? NGHOST  : ThisTask.domain.global_start[dim]);
+      ThisTask.domain.global_end[dim]   = ((ThisTask.domain.global_end[dim] == NX_GLOB + 1) ? NX_GLOB : ThisTask.domain.global_end[dim]);
+
+      ThisTask.domain.dim[dim] = (ThisTask.domain.global_end[dim] -
+                                  ThisTask.domain.global_start[dim] + 1);
 
-      ThisTask.domain.dim[X] = (ThisTask.domain.end[X] - ThisTask.domain.start[X]);
-      ThisTask.domain.dim[Y] = (ThisTask.domain.end[Y] - ThisTask.domain.start[Y]);
+      /* local index: interior runs 1..dim[d]; */
+      /* ghost cells sit at 0 and dim[d]+1     */
+      ThisTask.domain.local_start[dim] = 1;
+      ThisTask.domain.local_end[dim]   = ThisTask.domain.dim[dim];
     }
 
 #if defined(DEBUG)
@@ -186,13 +194,16 @@ int main(int argc, char **argv)
 	{
 	  printf("\n\t rank: %d", task);
 	  printf("\n\t\t coords = [%d, %d]", ThisTask.coords[X], ThisTask.coords[Y]);
-	  printf("\n\t\t domain.start[X] = %d - domain.end[X] = %d", ThisTask.domain.start[X], ThisTask.domain.end[X]);
-	  printf("\n\t\t domain.start[Y] = %d - domain.end[Y] = %d", ThisTask.domain.start[Y], ThisTask.domain.end[Y]);
+	  printf("\n\t\t domain.global_start[X] = %d - domain.global_end[X] = %d", ThisTask.domain.global_start[X], ThisTask.domain.global_end[X]);
+	  printf("\n\t\t domain.global_start[Y] = %d - domain.global_end[Y] = %d", ThisTask.domain.global_start[Y], ThisTask.domain.global_end[Y]);
+	  printf("\n\t\t domain.local_start[X]  = %d - domain.local_end[X]  = %d", ThisTask.domain.local_start[X],  ThisTask.domain.local_end[X]);
+	  printf("\n\t\t domain.local_start[Y]  = %d - domain.local_end[Y]  = %d", ThisTask.domain.local_start[Y],  ThisTask.domain.local_end[Y]);
+	  printf("\n\t\t domain.dim[X]          = %d - domain.dim[Y] = %d", ThisTask.domain.dim[X], ThisTask.domain.dim[Y]);
 	  printf("\n\t\t nbrtop  = %d - nbrbottom = %d",  ThisTask.nbrtop,  ThisTask.nbrbottom);
 	  printf("\n\t\t nbrleft = %d - nbrright  = %d\n", ThisTask.nbrleft, ThisTask.nbrright);
 	  fflush(stdout);
 	}
-      MPI_Barrier(comm2d);
+      MPI_Barrier(ThisTask.comm2d);
     }
 #endif /* DEBUG */
   
@@ -242,22 +253,16 @@ int main(int argc, char **argv)
   for (int j=0 ; j<(NY_GLOB + 2*NGHOST) ; j++) yg[j] = ybeg + (j - jbeg + 1) * delta[Y];
 
   /* grids memory allocation distributed across MPI processes */
-  MyData **phi  = Allocate_2DdblArray(ThisTask.domain.dim[Y] + 2, ThisTask.domain.dim[X] + 2);
-  MyData **phi0 = Allocate_2DdblArray(ThisTask.domain.dim[Y] + 2, ThisTask.domain.dim[X] + 2);
+  MyData **phi  = Allocate_2DdblArray(ThisTask.domain.dim[X] + 2, ThisTask.domain.dim[Y] + 2);
+  MyData **phi0 = Allocate_2DdblArray(ThisTask.domain.dim[X] + 2, ThisTask.domain.dim[Y] + 2);
 
   /* --------------------------------------------------------
-     3. Set boundary conditions   
+     3. Set boundary conditions
      -------------------------------------------------------- */
-
-  BoundaryConditions(phi0, xg, yg, nx, ny);
-  BoundaryConditions(phi,  xg, yg, nx, ny);
-
-
-
-
-
-
-  
+  BoundaryConditions(phi0, xg, yg, &ThisTask);
+  BoundaryConditions(phi,  xg, yg, &ThisTask);
+  free(yg);
+  free(xg);
   
   /* --------------------------------------------------------
      4. Main iteration cycle
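
The argument swap in the two Allocate_2DdblArray calls above is a real fix: the grids are addressed as phi[row][col] with rows along X and columns along Y, so the allocation must be dim[X]+2 rows by dim[Y]+2 columns (the +2 is the one-cell ghost frame). A sketch of a contiguous allocator of that shape, assuming the row-major single-block layout the column datatype later in this patch relies on (alloc_2d is a hypothetical stand-in, not the project's Allocate_2DdblArray):

    /* Sketch of a contiguous (nrows x ncols) allocation, phi[row][col],
       matching the row-major layout the column datatype assumes. */
    #include <assert.h>
    #include <stdlib.h>

    typedef double MyData;

    MyData **alloc_2d(const int nrows, const int ncols)
    {
      MyData **A = malloc(nrows * sizeof(MyData *));
      assert(A != NULL);
      A[0] = calloc((size_t)nrows * ncols, sizeof(MyData)); /* one block */
      assert(A[0] != NULL);
      for (int r = 1; r < nrows; r++)
        A[r] = A[r - 1] + ncols;            /* row pointers into the block */
      return A;
    }

    /* usage: MyData **phi = alloc_2d(dimX + 2, dimY + 2);
              ... free(phi[0]); free(phi); */
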
@@ -265,35 +270,30 @@ int main(int argc, char **argv)
 
   const double time_start = MPI_Wtime();
   
-  /* -- 4a. Set boundary conditions first -- */  
-
-  MyData err = 1.0;
   /* iterations */
   int k = 0;
   while (1)
     {
-      /* -- 4c. Jacobi's method and residual (interior points) -- */
-      /*       core algorithm                                     */
-
-      err = 0.0;
-      Jacobi_Communication(phi, phi0, &err, delta, NY_GLOB, &ThisTask, &comm2d);
+      /* -- 4a. Jacobi algorithm overlapping computation and communication */
+      MyData err;
+      Jacobi_Communication(phi, phi0, &err, delta, &ThisTask);
 
-      if (!rank)
-	printf("\n\t Iteration = %d - err = %lg\n",k, err);
+      /* -- 4b. Get the total error                                                                 */
+      /*        combines values from all processes and distributes the result back to all processes */
+      MyData toterr;
+      MPI_Allreduce(&err, &toterr, 1, MPI_MyDatatype, MPI_SUM, ThisTask.comm2d);
+      
+      if (ThisTask.rank == MASTERTASK)
+	printf("\n\t Iteration = %d - err = %lg\n", k, toterr);
 
       /* increase the counter of loop iterations */
       k++;
-
-      /* get the total error */
-      MyData toterr;
-      /* combines values from all processes and distributes the result back to all processes */
-      MPI_Allreduce(&err, &toterr, 1, MPI_MyDatatype, MPI_SUM, comm2d);
       
       /* check convergence */
       if (toterr <= TOL)
 	{
 	  /* master task gathers all the domains */
-	  get_domains(phi, &ThisTask, comm2d);
+	  get_domains(phi, &ThisTask);
 
 	  break;
 	}
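
Reordering the loop body above makes the master print the global residual rather than its local one, and the convergence test stays collective: every rank computes the same toterr via MPI_Allreduce, so all ranks take the break together and none is left blocked in a later communication. A distilled sketch of the pattern (MPI_DOUBLE stands in for MPI_MyDatatype, local_sweep for the Jacobi kernel):

    /* Distilled sketch of the collective convergence test: every rank
       computes the same toterr, so every rank exits together. */
    #include <mpi.h>

    #define TOL 1.e-5

    static double local_sweep(void) { return 0.0; /* stand-in residual */ }

    void iterate(MPI_Comm comm2d)
    {
      while (1)
        {
          double err = local_sweep();    /* local residual only */
          double toterr;
          MPI_Allreduce(&err, &toterr, 1, MPI_DOUBLE, MPI_SUM, comm2d);
          if (toterr <= TOL)             /* same branch on every rank */
            break;
        }
    }
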
@@ -305,9 +305,9 @@ int main(int argc, char **argv)
     }
 
   /* master rank writes the solution */
-  if (!rank)
+  if (ThisTask.rank == MASTERTASK)
     {
-      WriteSolution(phi, nx, ny);
+      WriteSolution(global_phi, NX_GLOB, NY_GLOB);
       
       printf("\n\t NX_GLOB x NY_GLOB = %d x %d\n", NX_GLOB, NY_GLOB);
       printf("\n\t Time = %lf [s]\n\n", MPI_Wtime() - time_start);
@@ -326,13 +326,11 @@ int main(int argc, char **argv)
       free(phi);
     }
 
-  if (yg)
-    free(yg);
+  if (ThisTask.rank == MASTERTASK)
+    if (global_phi)
+      free(global_phi);
   
-  if (xg)
-    free(xg);
-
-  MPI_Comm_free(&comm2d);
+  MPI_Comm_free(&ThisTask.comm2d);
   
   MPI_Finalize();
   
@@ -343,71 +341,74 @@ int main(int argc, char **argv)
 void BoundaryConditions(MyData **const restrict phi,
 			MyData  *const restrict x,
 			MyData  *const restrict y,
-                        const int               nx,
-			const int               ny,
 			Task    *const restrict ThisTask)
-/**********************************************************************************/
+/*************************************************************************/
 {
   /* left */
   if (ThisTask->nbrleft == MPI_PROC_NULL) /* no left neighbor */
     {
-      for (int j=ThisTask->domain.start[X] ; j<ThisTask->domain.end[X] ; j++)
-	phi[j][0] = (1.0 - y[j]);
+      const int global_start_j = ThisTask->domain.global_start[X];
+      for (int j=ThisTask->domain.local_start[X] ; j<=ThisTask->domain.local_end[X] ; j++)
+	phi[j][0] = (1.0 - y[global_start_j + j - 1]);
     }
 
   /* right */
   if (ThisTask->nbrright == MPI_PROC_NULL) /* no right neighbor */
     {
-      for (int j=ThisTask->domain.start[X] ; j<ThisTask->domain.end[X] ; j++)
-	phi[j][ThisTask->domain.end[Y] + 1] = (y[j] * y[j]);
+      const int global_start_j = ThisTask->domain.global_start[X];
+      for (int j=ThisTask->domain.local_start[X] ; j<=ThisTask->domain.local_end[X] ; j++)
+	phi[j][ThisTask->domain.local_end[Y] + 1] = (y[global_start_j + j - 1] * y[global_start_j + j - 1]);
     }
-
+  
   /* bottom */
   if (ThisTask->nbrbottom == MPI_PROC_NULL) /* no bottom neighbor */
     {
-      for (int i=ThisTask->domain.start[Y] ; i<ThisTask->domain.end[Y] ; i++)
-	phi[0][i] = (1.0 - x[i]);
+      const int global_start_i = ThisTask->domain.global_start[Y];
+      for (int i=ThisTask->domain.local_start[Y] ; i<=ThisTask->domain.local_end[Y] ; i++)
+	phi[0][i] = (1.0 - x[global_start_i + i - 1]);
     }
   
   /* top */
-  if (ThisTask->nbrtop == MPI_PROC_NULL) /* no top neghbor */
+  if (ThisTask->nbrtop == MPI_PROC_NULL) /* no top neighbor */
     {
-      for (int i=ThisTask->domain.start[Y] ; i<ThisTask->domain.end[Y] ; i++)
-	phi[ThisTask->domain.end[X] + 1][i] = x[i];
+      const int global_start_i = ThisTask->domain.global_start[Y];
+      for (int i=ThisTask->domain.local_start[Y] ; i<=ThisTask->domain.local_end[Y] ; i++)
+	phi[ThisTask->domain.local_end[X] + 1][i] = x[global_start_i + i - 1];
     }
 
+#if defined(DEBUG)
 
-
-  return;
+  if (ThisTask->rank == MASTERTASK)
+    {
+      printf("\n");
+      for (int i=0 ; i<NX_GLOB+2*NGHOST ; i++)
+	printf("\n\t x[%d] = %8.2f", i, x[i]);
   
-  const int ibeg = NGHOST;
-  const int iend = ibeg + nx - 1;
-    
-  const int jbeg = NGHOST;
-  const int jend = jbeg + ny - 1;
+      printf("\n");
+      for (int i=0 ; i<NY_GLOB+2*NGHOST ; i++)
+	printf("\n\t y[%d] = %8.2f", i, y[i]);
 
-  int i,j;
-  
-  /* -- Left -- */
-  i = ibeg - 1;
-  for (int j=jbeg ; j<=jend ; j++)
-    phi[j][i] = (1.0 - y[j]);
-    
-  /* -- Right -- */
-  i = jend + 1;
-  for (int j=jbeg ; j<=jend ; j++)
-    phi[j][i] = (y[j] * y[j]);
-    
-  /* -- Bottom -- */    
-  j = jbeg - 1;
-  for (int i=ibeg ; i<=iend ; i++)
-    phi[j][i] = (1.0 - x[i]);
-    
-  /* -- Top -- */
-  j = jend + 1;
-  for (int i=ibeg ; i<=iend ; i++)
-    phi[j][i] = x[i];
+      printf("\n");
+      fflush(stdout);
+    }
+
+  MPI_Barrier(ThisTask->comm2d);
 
+  for (int task=0 ; task<ThisTask->nranks ; task++)
+    {
+      if (task == ThisTask->rank)
+	{
+	  char string[128];
+	  snprintf(string, sizeof(string), "Task %d - Phi", task);
+	  
+	  Show_2DdblArray(phi, ThisTask->domain.dim[X] + 2, ThisTask->domain.dim[Y] + 2, string);
+	}
+      
+      MPI_Barrier(ThisTask->comm2d);
+    }
+  
+#endif /* DEBUG */  
+  
   return;
 }
 
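The rewritten boundary loops run over local indices but sample the coordinate arrays at the matching global position, hence x[global_start + i - 1]: local interior index 1 maps to global index global_start, because x and y are the full global coordinate arrays replicated on every task. A tiny self-contained sketch of that translation (all sizes illustrative):

    /* Sketch of the left-boundary fill with the local->global index
       translation used above (sizes are hypothetical). */
    #include <stdio.h>

    int main(void)
    {
      double y[12];                       /* global coords: 10 + 2 ghosts */
      for (int j = 0; j < 12; j++) y[j] = j / 11.0;

      double phi[6][6] = {{0}};           /* 4x4 interior + ghost frame */
      const int local_start = 1, local_end = 4;
      const int global_start = 5;         /* task owns global rows 5..8 */

      /* phi[j][0] = 1 - y at the matching global row */
      for (int j = local_start; j <= local_end; j++)
        phi[j][0] = 1.0 - y[global_start + j - 1];

      for (int j = local_start; j <= local_end; j++)
        printf("phi[%d][0] = %g\n", j, phi[j][0]);
      return 0;
    }
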
@@ -436,22 +437,19 @@ void JacobiAlgorithm(MyData **const restrict Phi,
   return;
 }
 
-
 void Jacobi_Communication(MyData      **const restrict Phi,
 			  MyData      **const restrict Phi0,
 			  MyData       *const restrict error,
 			  const MyData *      restrict delta,
-			  const int                    grid_y,
-			  Task         *const restrict ThisTask,
-			  MPI_Comm     *const restrict comm)
+			  Task         *const restrict ThisTask)
 {
   /* custom datatype */
   MPI_Datatype column;
-  MPI_Type_vector((ThisTask->domain.end[X] - ThisTask->domain.start[X] + 1), 1, grid_y + 2, MPI_MyDatatype, &column);
+  MPI_Type_vector(ThisTask->domain.dim[X], 1, ThisTask->domain.dim[Y] + 2, MPI_MyDatatype, &column);
   /* commits the datatype */
   MPI_Type_commit(&column);
 
-  const int data_row_size = (ThisTask->domain.end[Y] - ThisTask->domain.start[Y] + 1);
+  const int data_row_size = ThisTask->domain.dim[Y];
   
   /* First task: issue the communication */
   MPI_Request request[8];
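
The rebuilt column datatype picks one element per row of the local array: dim[X] blocks of length 1 with a stride of dim[Y] + 2 elements, the padded local row length. Replacing the old global stride grid_y + 2 is the substantive fix, since each task's arrays are now sized locally. A sketch of the same construction on a small buffer (a self-send over MPI_COMM_SELF just to show the unpacking):

    /* Sketch of extracting a column from a row-major (nrows x ncols)
       block with MPI_Type_vector: nrows blocks of 1 element, stride ncols. */
    #include <mpi.h>
    #include <stdio.h>

    int main(int argc, char **argv)
    {
      MPI_Init(&argc, &argv);

      enum { nrows = 3, ncols = 4 };
      double buf[nrows][ncols], col[nrows];
      for (int r = 0; r < nrows; r++)
        for (int c = 0; c < ncols; c++)
          buf[r][c] = 10 * r + c;

      MPI_Datatype column;
      MPI_Type_vector(nrows, 1, ncols, MPI_DOUBLE, &column);
      MPI_Type_commit(&column);

      /* send column 2 to ourselves: one 'column' out, nrows doubles in */
      MPI_Sendrecv(&buf[0][2], 1, column, 0, 0,
                   col, nrows, MPI_DOUBLE, 0, 0,
                   MPI_COMM_SELF, MPI_STATUS_IGNORE);

      for (int r = 0; r < nrows; r++)
        printf("col[%d] = %g\n", r, col[r]);   /* 2, 12, 22 */

      MPI_Type_free(&column);
      MPI_Finalize();
      return 0;
    }
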
@@ -459,30 +457,67 @@ void Jacobi_Communication(MyData      **const restrict Phi,
   MyData **const restrict buffer = Phi0;
   
   /* 4 nonblocking MPI_Irecv */
-  MPI_Irecv(&buffer[ThisTask->domain.start[X] - 1][ThisTask->domain.start[Y]    ], data_row_size, MPI_MyDatatype, ThisTask->nbrbottom, 0, *comm, &request[0]);
-  MPI_Irecv(&buffer[ThisTask->domain.end[X]   + 1][ThisTask->domain.start[Y]    ], data_row_size, MPI_MyDatatype, ThisTask->nbrtop,    1, *comm, &request[1]);
-  MPI_Irecv(&buffer[ThisTask->domain.start[X]    ][ThisTask->domain.start[Y] - 1], 1,             column,         ThisTask->nbrleft,   2, *comm, &request[2]);
-  MPI_Irecv(&buffer[ThisTask->domain.start[X]    ][ThisTask->domain.end[Y]   + 1], 1,             column,         ThisTask->nbrright,  3, *comm, &request[3]);
+  MPI_Irecv(&buffer[ThisTask->domain.local_start[X] - 1][ThisTask->domain.local_start[Y]    ], data_row_size, MPI_MyDatatype, ThisTask->nbrbottom, 0, ThisTask->comm2d, &request[0]);
+  MPI_Irecv(&buffer[ThisTask->domain.local_end[X]   + 1][ThisTask->domain.local_start[Y]    ], data_row_size, MPI_MyDatatype, ThisTask->nbrtop,    1, ThisTask->comm2d, &request[1]);
+  MPI_Irecv(&buffer[ThisTask->domain.local_start[X]    ][ThisTask->domain.local_start[Y] - 1], 1,             column,         ThisTask->nbrleft,   2, ThisTask->comm2d, &request[2]);
+  MPI_Irecv(&buffer[ThisTask->domain.local_start[X]    ][ThisTask->domain.local_end[Y]   + 1], 1,             column,         ThisTask->nbrright,  3, ThisTask->comm2d, &request[3]);
 
   /* 4 nonblocking MPI_Isend */
-  MPI_Isend(&buffer[ThisTask->domain.end[X]      ][ThisTask->domain.start[Y]    ], data_row_size, MPI_MyDatatype, ThisTask->nbrtop,    0, *comm, &request[4]);
-  MPI_Isend(&buffer[ThisTask->domain.start[X]    ][ThisTask->domain.start[Y]    ], data_row_size, MPI_MyDatatype, ThisTask->nbrbottom, 1, *comm, &request[5]);
-  MPI_Isend(&buffer[ThisTask->domain.start[X]    ][ThisTask->domain.end[Y]      ], 1,             column,         ThisTask->nbrright,  2, *comm, &request[6]);
-  MPI_Isend(&buffer[ThisTask->domain.start[X]    ][ThisTask->domain.start[Y]    ], 1,             column,         ThisTask->nbrleft,   3, *comm, &request[7]);
+  MPI_Isend(&buffer[ThisTask->domain.local_end[X]  ][ThisTask->domain.local_start[Y]], data_row_size, MPI_MyDatatype, ThisTask->nbrtop,    0, ThisTask->comm2d, &request[4]);
+  MPI_Isend(&buffer[ThisTask->domain.local_start[X]][ThisTask->domain.local_start[Y]], data_row_size, MPI_MyDatatype, ThisTask->nbrbottom, 1, ThisTask->comm2d, &request[5]);
+  MPI_Isend(&buffer[ThisTask->domain.local_start[X]][ThisTask->domain.local_end[Y]  ], 1,             column,         ThisTask->nbrright,  2, ThisTask->comm2d, &request[6]);
+  MPI_Isend(&buffer[ThisTask->domain.local_start[X]][ThisTask->domain.local_start[Y]], 1,             column,         ThisTask->nbrleft,   3, ThisTask->comm2d, &request[7]);
 
   /**************************************** computation ****************************************/
   /* perform the computation with the local data, (i.e. ghost cells are not required) */
   /* so overlapping computation and communication */
 
-  const int jbeg = ThisTask->domain.start[X] + 1;
-  const int jend = ThisTask->domain.end[X]   - 1;
-  const int ibeg = ThisTask->domain.start[Y] + 1;
-  const int iend = ThisTask->domain.end[Y]   - 1;
+  const int jbeg = ThisTask->domain.local_start[X] + 1;
+  const int jend = ThisTask->domain.local_end[X]   - 1;
+  const int ibeg = ThisTask->domain.local_start[Y] + 1;
+  const int iend = ThisTask->domain.local_end[Y]   - 1;
+  
+#if defined(DEBUG)
+
+  MPI_Waitall(8, request, MPI_STATUSES_IGNORE);
 
   *error = 0.0;
-  JacobiAlgorithm(Phi, Phi0, delta, jbeg, jend, ibeg, iend, error);
+  /* JacobiAlgorithm(Phi, Phi0, delta, jbeg, jend, ibeg, iend, error); */
+  JacobiAlgorithm(Phi, Phi0, delta,
+		  ThisTask->domain.local_start[X]+1, ThisTask->domain.local_end[X]-1,
+		  ThisTask->domain.local_start[Y]  , ThisTask->domain.local_end[Y],
+		  error);
+
+  /* /\* data from nbrbottom have been received *\/ */
+  JacobiAlgorithm(Phi, Phi0, delta,
+		  ThisTask->domain.local_start[X], ThisTask->domain.local_start[X],
+		  ThisTask->domain.local_start[Y], ThisTask->domain.local_end[Y],
+		  error);
+
+  /* /\* data from nbrtop have been received *\/ */
+  JacobiAlgorithm(Phi, Phi0, delta,
+		  ThisTask->domain.local_end[X],   ThisTask->domain.local_end[X],
+		  ThisTask->domain.local_start[Y], ThisTask->domain.local_end[Y],
+		  error);
+
+  /* /\* data from nbrleft have been received *\/ */
+  /* JacobiAlgorithm(Phi, Phi0, delta, */
+  /* 		  ThisTask->domain.local_start[X], ThisTask->domain.local_end[X], */
+  /* 		  ThisTask->domain.local_start[Y], ThisTask->domain.local_start[Y], */
+  /* 		  error); */
+
+  /* /\* data from nbrright have been received *\/ */
+  /* JacobiAlgorithm(Phi, Phi0, delta, */
+  /* 		  ThisTask->domain.local_start[X], ThisTask->domain.local_end[X], */
+  /* 		  ThisTask->domain.local_end[Y],   ThisTask->domain.local_end[Y], */
+  /* 		  error); */
+
+#else
   
   /**********************************************************************************************/
+
+  *error = 0.0;
+  JacobiAlgorithm(Phi, Phi0, delta, jbeg, jend, ibeg, iend, error);
   
   /* waits for any specified MPI Request to complete */
   for (int req=0 ; req<8 ; req++)
@@ -497,28 +532,42 @@ void Jacobi_Communication(MyData      **const restrict Phi,
 	  /* communication with tag 0 completed */
 	case 0:
 	  /* data from nbrbottom have been received */
-	  JacobiAlgorithm(Phi, Phi0, delta, (jbeg - 1), (jbeg - 1), (ibeg - 1) , (iend + 1), error);
+	  JacobiAlgorithm(Phi, Phi0, delta,
+			  ThisTask->domain.local_start[X], ThisTask->domain.local_start[X],
+			  ThisTask->domain.local_start[Y], ThisTask->domain.local_end[Y],
+			  error);
 	  break;
 
 	  /* communication with tag 1 completed */
 	case 1:
 	  /* data from nbrtop have been received */
-	  JacobiAlgorithm(Phi, Phi0, delta, (jend + 1), (jend + 1), (ibeg - 1) , (iend + 1), error);
+	  JacobiAlgorithm(Phi, Phi0, delta,
+			  ThisTask->domain.local_end[X],   ThisTask->domain.local_end[X],
+			  ThisTask->domain.local_start[Y], ThisTask->domain.local_end[Y],
+			  error);
 	  break;
 
 	  /* communication with tag 2 completed */
 	case 2:
 	  /* data from nbrleft have been received */
-	  JacobiAlgorithm(Phi, Phi0, delta, (jbeg - 1), (jend + 1), (ibeg - 1), (ibeg - 1), error);
+	  JacobiAlgorithm(Phi, Phi0, delta,
+			  ThisTask->domain.local_start[X], ThisTask->domain.local_end[X],
+			  ThisTask->domain.local_start[Y], ThisTask->domain.local_start[Y],
+			  error);
 	  break;
 
 	  /* communication with tag 3 completed */
 	case 3:
 	  /* data from nbrright have been received */
-	  JacobiAlgorithm(Phi, Phi0, delta, (jbeg - 1), (jend + 1), (iend + 1), (iend + 1), error);
+	  JacobiAlgorithm(Phi, Phi0, delta,
+			  ThisTask->domain.local_start[X], ThisTask->domain.local_end[X],
+			  ThisTask->domain.local_end[Y],   ThisTask->domain.local_end[Y],
+			  error);
 	  break;
 	}
     }
+
+#endif /* DEBUG */
   
   MPI_Type_free(&column);
 
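Outside the DEBUG path, the loop above preserves the overlap: interior points are updated while the eight requests are in flight, and each completed receive, identified by status.MPI_TAG, unlocks one strip of boundary points. The DEBUG variant collapses this to a single MPI_Waitall so the strips can be replayed deterministically while the indexing is being debugged. A skeleton of the completion-driven pattern, assuming the elided wait loop uses MPI_Waitany (the wait call itself sits in unchanged context lines); note the guard on the request index, since the standard leaves the status of a completed send undefined:

    /* Skeleton of the completion-driven halo update (assumption: the
       unchanged wait loop uses MPI_Waitany; receives were posted in
       request[0..3] with tags 0..3, sends in request[4..7]). */
    #include <mpi.h>

    static void update_interior(void) { /* Jacobi on inner points */ }
    static void update_strip(const int tag) { (void)tag; /* one border strip */ }

    void overlap(MPI_Request request[8])
    {
      update_interior();                 /* work while messages fly */

      for (int req = 0; req < 8; req++)
        {
          int idx;
          MPI_Status status;
          MPI_Waitany(8, request, &idx, &status);

          /* only receives carry a meaningful tag; the status fields of
             a completed send are not defined by the standard */
          if (idx < 4)
            update_strip(status.MPI_TAG);  /* tags 0..3 as in the patch */
        }
    }
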
@@ -528,14 +577,13 @@ void Jacobi_Communication(MyData      **const restrict Phi,
 /* ********************************************************************* */
 
 void get_domains(MyData  **const buffer,
-		 Task     *const ThisTask,
-		 const MPI_Comm  comm)
+		 Task     *const ThisTask)
 {
   /* Master process gathers all the domains */
   
   /***************************** get the domain boundaries from each process *************************************/
   myDomain *boundaries = NULL;
-  if (ThisTask->rank == MASTER)
+  if (ThisTask->rank == MASTERTASK)
     {
       boundaries = (myDomain *)malloc(ThisTask->nranks * sizeof(*boundaries));
       assert(boundaries != NULL);
@@ -543,7 +591,7 @@ void get_domains(MyData  **const buffer,
 
   MPI_Gather(&ThisTask->domain, sizeof(myDomain), MPI_BYTE,
 	     boundaries,        sizeof(myDomain), MPI_BYTE,
-	     MASTER, comm);
+	     MASTERTASK, ThisTask->comm2d);
 
   /**************************************************************************************************/
   
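Gathering the myDomain structs as sizeof(myDomain) raw MPI_BYTEs is valid here because the struct is plain old data, every rank compiles it identically, and the job runs on homogeneous hardware; a committed derived datatype would be the portable alternative. A sketch of the byte-wise gather in isolation (Box is a hypothetical stand-in for myDomain):

    /* Sketch of gathering one POD struct per rank as raw bytes (fine on
       homogeneous systems; a derived datatype is the portable route). */
    #include <assert.h>
    #include <mpi.h>
    #include <stdlib.h>

    typedef struct { int lo[2], hi[2], dim[2]; } Box;

    int main(int argc, char **argv)
    {
      MPI_Init(&argc, &argv);

      int rank, nranks;
      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
      MPI_Comm_size(MPI_COMM_WORLD, &nranks);

      Box mine = {{rank, 0}, {rank, 0}, {1, 1}};  /* illustrative payload */

      Box *all = NULL;
      if (rank == 0)
        {
          all = malloc(nranks * sizeof(*all));
          assert(all != NULL);
        }

      MPI_Gather(&mine, sizeof(Box), MPI_BYTE,
                 all,   sizeof(Box), MPI_BYTE, 0, MPI_COMM_WORLD);

      if (rank == 0)
        free(all);

      MPI_Finalize();
      return 0;
    }
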
@@ -561,7 +609,7 @@ void get_domains(MyData  **const buffer,
   MyData *recvbuf = NULL;
   MyData *sendbuf = NULL;
   
-  if (ThisTask->rank == MASTER)
+  if (ThisTask->rank == MASTERTASK)
     {
       recvcounts = (int *)malloc(ThisTask->nranks * sizeof(*recvcounts));
       assert(recvcounts != NULL);
@@ -570,56 +618,58 @@ void get_domains(MyData  **const buffer,
       assert(displs != NULL);
 
       for (int task=0 ; task<ThisTask->nranks ; task++)
-	{
-	  const int nrows = (boundaries[task].end[X] - boundaries[task].start[X] + 1);
-	  const int ncols = (boundaries[task].end[Y] - boundaries[task].start[Y] + 1);
-	  recvcounts[task] = (nrows * ncols);
-	}
+	recvcounts[task] = (boundaries[task].dim[X] * boundaries[task].dim[Y]);
 
       displs[0] = 0;
       for (int task=1 ; task<ThisTask->nranks ; task++)
 	displs[task] = (recvcounts[task - 1] + displs[task - 1]);
 
+      /* check if the size is the entire domain */
+      assert((displs[ThisTask->nranks - 1] + recvcounts[ThisTask->nranks - 1]) == (NX_GLOB * NY_GLOB));
+      
       /* allocate the recvbuffer */
-      recvbuf = (MyData *)malloc((displs[ThisTask->nranks - 1] + recvcounts[ThisTask->nranks - 1]) * sizeof(*recvbuf));
+      recvbuf = (MyData *)malloc(NX_GLOB * NY_GLOB * sizeof(*recvbuf));
       assert(recvbuf != NULL);
-    } /* MASTER */
+    } /* MASTERTASK */
 
   /* every process sets up the sendbuf */
-  const int nrows = (ThisTask->domain.end[X] - ThisTask->domain.start[X] + 1);
-  const int ncols = (ThisTask->domain.end[Y] - ThisTask->domain.start[Y] + 1);
-  sendbuf = (MyData *)malloc(nrows * ncols * sizeof(*sendbuf));
+  sendbuf = (MyData *)malloc(ThisTask->domain.dim[X] * ThisTask->domain.dim[Y] * sizeof(*sendbuf));
   assert(sendbuf != NULL);
   /* store the actual grid owned by each process into sendbuf */
-  for (int row=0 ; row<nrows ; row++)
-    memcpy(&sendbuf[row * ncols], &buffer[ThisTask->domain.start[X] + row][ThisTask->domain.start[Y]], (ncols * sizeof(MyData)));
+  for (int row=0 ; row<ThisTask->domain.dim[X] ; row++)
+    memcpy(&sendbuf[row * ThisTask->domain.dim[Y]], &buffer[ThisTask->domain.local_start[X] + row][ThisTask->domain.local_start[Y]], (ThisTask->domain.dim[Y] * sizeof(MyData)));
   
   /* perform the MPI collective */
-  MPI_Gatherv(sendbuf, (nrows * ncols), MPI_MyDatatype,
+  MPI_Gatherv(sendbuf, (ThisTask->domain.dim[X] * ThisTask->domain.dim[Y]), MPI_MyDatatype,
 	      recvbuf, recvcounts, displs, MPI_MyDatatype,
-	      MASTER, comm);
+	      MASTERTASK, ThisTask->comm2d);
 
-  /* store the recvbuf into the actual grid owned by the MASTER task */
-  if (ThisTask->rank == MASTER)
+  free(sendbuf);
+  if (ThisTask->rank == MASTERTASK)
     {
-      for (int task=1 ; task<ThisTask->nranks ; task++)
-	{
-	  const int nrows = (boundaries[task].end[X] - boundaries[task].start[X] + 1);
-	  const int ncols = (boundaries[task].end[Y] - boundaries[task].start[Y] + 1);
-	  for (int row=0 ; row<nrows ; row++)
-	    memcpy(&buffer[boundaries[task].start[X] + row][boundaries[task].start[Y]],
-		   &recvbuf[displs[task] + (row * ncols)], (ncols * sizeof(MyData)));
-	}
+      free(recvcounts);
     }
 
-  free(sendbuf);
-  if (ThisTask->rank == MASTER)
+  /* store the recvbuf into the global_grid allocated by the MASTER task */
+  if (ThisTask->rank == MASTERTASK)
     {
-      free(recvbuf);
+      global_phi = Allocate_2DdblArray(NX_GLOB + 2, NY_GLOB + 2);
+      assert(global_phi != NULL);
+
+      for (int task=0 ; task<ThisTask->nranks ; task++)
+	{
+	  for (int row=0 ; row<boundaries[task].dim[X] ; row++)
+	    {
+	      memcpy(&global_phi[boundaries[task].global_start[X] + row][boundaries[task].global_start[Y]],
+		     &recvbuf[displs[task] + (row * boundaries[task].dim[Y])],
+		     (boundaries[task].dim[Y] * sizeof(MyData)));
+	    }
+	}
+
       free(displs);
-      free(recvcounts);
+      free(recvbuf);
       free(boundaries);
-    }
+    } /* MASTERTASK */
   
   return;
 }
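
The Gatherv bookkeeping above follows the usual recipe: recvcounts[task] is each subdomain's element count, displs is its exclusive prefix sum, and the master unpacks each task's rows into global_phi at that task's global_start offsets; the new assert checks that the pieces tile the NX_GLOB x NY_GLOB interior exactly. A self-contained sketch of the count/displacement computation (hypothetical 2x2 decomposition):

    /* Sketch of the Gatherv bookkeeping: counts per task, exclusive
       prefix sum for displacements, and the tiling check used above. */
    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
      /* hypothetical 2x2 decomposition of a 6x6 interior: 3x3 each */
      enum { nranks = 4, NX = 6, NY = 6 };
      const int dimX[nranks] = {3, 3, 3, 3};
      const int dimY[nranks] = {3, 3, 3, 3};

      int recvcounts[nranks], displs[nranks];
      for (int task = 0; task < nranks; task++)
        recvcounts[task] = dimX[task] * dimY[task];

      displs[0] = 0;
      for (int task = 1; task < nranks; task++)
        displs[task] = displs[task - 1] + recvcounts[task - 1];

      /* the subdomains must tile the whole interior */
      assert(displs[nranks - 1] + recvcounts[nranks - 1] == NX * NY);

      for (int task = 0; task < nranks; task++)
        printf("task %d: count = %d, displ = %d\n",
               task, recvcounts[task], displs[task]);
      return 0;
    }
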
@@ -631,7 +681,7 @@ void WriteSolution(MyData **const phi,
 /*
 *********************************************************************** */
 {
-  const int ibeg = NGHOST;    
+  const int ibeg = NGHOST;
   const int jbeg = NGHOST;
   const int jend = jbeg + ny - 1;
     
diff --git a/jacobi/mpi/comp_comm/src/tools.c b/jacobi/mpi/comp_comm/src/tools.c
index 547d9a9..0e282ee 100644
--- a/jacobi/mpi/comp_comm/src/tools.c
+++ b/jacobi/mpi/comp_comm/src/tools.c
@@ -29,7 +29,7 @@ MyData **Allocate_2DdblArray(const int nx, const int ny)
 }
 
 /* ********************************************************************* */
-void Show_2DdblArray(const MyData **const A,
+void Show_2DdblArray(      MyData **const A,
 		     const int            nx,
 		     const int            ny,
 		     const char *const    string)
-- 
GitLab