Commit 26284f2f authored by David Goz

mpi overlapping computation and communication added

parent 997e7058
# put here the files that should not be committed
*.txt
*~
*.bin
#######################################################################
# Author: David Goz (david.goz@inaf.it) #
# June 2024 #
#######################################################################
#
# To see all the compilation options
# $ make info
#######################################################################
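# Typical usage once compiled (the program expects <grid_size> and the
# Cartesian communicator size along X as command-line arguments), e.g.:
#   $ make
#   $ mpirun -n 4 ./jacobi_mpi_comp_comm_<hostname> 128 2
#######################################################################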
# make.def defines how the application is compiled
include make.def
include make_mpi_path
#######################################################################
.PHONY: info mpi valgrind_memcheck valgrind_callgrind valgrind_cachegrind clean
info:
@echo ' '
@echo '-----------------------------------------------------------------------------------------'
@echo '$$ make ---> compile the mpi application '
@echo '$$ make valgrind_memcheck ---> run the mpi application using Valgrind under Memcheck '
@echo '$$ make valgrind_callgrind ---> run the mpi application using Valgrind under Callgrind '
@echo '$$ make valgrind_cachegrind ---> run the mpi application using Valgrind under Cachegrind '
@echo '$$ make clean ---> clean up all '
@echo '$$ make info ---> get make info '
@echo '-----------------------------------------------------------------------------------------'
@echo ' '
mpi: $(PROG)
valgrind_memcheck: $(PROG_MEMCHECK)
@echo 'oooOOO... valgrind_memcheck ...OOOooo'
mpirun -n 4 valgrind --tool=memcheck -s --default-suppressions=yes --log-file=valgrind_memcheck_log_%p.txt ./$< 9 2
@echo 'oooOOO... valgrind_memcheck ...OOOooo'
valgrind_callgrind: $(PROG_CALLGRIND)
@echo 'oooOOO... valgrind_callgrind ...OOOooo'
mpirun -n 4 valgrind --tool=callgrind --dump-instr=yes --collect-jumps=yes --log-file=valgrind_callgrind_log_%p.txt ./$< 9 2
@echo ' '
@echo 'To generate a function-by-function summary from the profile data file:'
@echo '$$ callgrind_annotate --auto=yes callgrind.out.<pid> | less'
@echo '(kcachegrind is required in order to visualize the output using the GUI)'
valgrind_cachegrind: $(PROG_CACHEGRIND)
@echo 'oooOOO... valgrind_cachegrind ...OOOooo'
mpirun -n 4 valgrind --tool=cachegrind --log-file=valgrind_cachegrind_log_%p.txt ./$< 9 2
@echo '$$ cg_annotate --auto=yes cachegrind.out.<pid> | less'
@echo '(kcachegrind is required in order to visualize the output using the GUI)'
@echo 'oooOOO... valgrind_cachegrind ...OOOooo'
clean:
rm -f *~ .*~ ./src/*~ ./src/*# ./include/*~ ./include/*# *~
rm -f $(PROG) $(PROG_DEBUG) $(PROG_MEMCHECK) $(PROG_CALLGRIND) $(PROG_CACHEGRIND)
rm -f valgrind_*.txt
rm -f cachegrind.out.*
rm -f callgrind.*
rm -f *bin
#pragma once
#define NGHOST 1
#define NDIM 2 /* 2D */
#define X 0
#define Y 1
#define TOL 1.e-5
#include <mpi.h>
#define MASTER 0
#if defined(SINGLE_PRECISION)
typedef float MyData;
#define MPI_MyDatatype MPI_FLOAT
#else
/* double precision is assumed by default */
typedef double MyData;
#define MPI_MyDatatype MPI_DOUBLE
#endif /* SINGLE_PRECISION */
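/* note: single precision is enabled by compiling with -DSINGLE_PRECISION
   (e.g. by adding it to CFLAGS in make.def); double precision is the default */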
#pragma once
#include "allvars.h"
#include <assert.h>
#include <sys/time.h>
#include <time.h>
#include <sys/stat.h>
#include <stdio.h>
#include <stdlib.h>
/* function prototypes */
MyData **Allocate_2DdblArray(const int nx, const int ny);
void Show_2DdblArray(const MyData **const A,
const int nx,
const int ny,
const char *const string);
double seconds(void);
CC ?= gcc
CFLAGS ?= -Wall -Wextra -march=native
LIBS ?= -lm -lmpi
SYSTYPE = $(strip $(shell uname -n))
PROG ?= jacobi_mpi_comp_comm_$(SYSTYPE)
PROG_DEBUG = $(PROG)_DEBUG
PROG_MEMCHECK = $(PROG)_MEMCHECK
PROG_CALLGRIND = $(PROG)_CALLGRIND
PROG_CACHEGRIND = $(PROG)_CACHEGRIND
HEADERS = $(wildcard ./include/*.h)
SOURCES = $(wildcard ./src/*.c)
DEPENDENCIES = $(SOURCES) $(HEADERS) Makefile
$(PROG): $(DEPENDENCIES)
$(CC) $(CFLAGS) -O3 -I./include -I$(MPI_INC) $(SOURCES) -o $@ -L$(MPI_LIB) $(LIBS)
@echo ' '
@echo 'Program' $(PROG) 'compiled for' $(SYSTYPE) 'machine'
@echo ' '
$(PROG_DEBUG): $(DEPENDENCIES)
$(CC) $(CFLAGS) -Og -ggdb3 -fno-omit-frame-pointer -I./include -I$(MPI_INC) $(SOURCES) -o $@ -L$(MPI_LIB) $(LIBS)
@echo ' '
@echo 'Program' $(PROG_DEBUG) 'compiled for' $(SYSTYPE) 'machine'
@echo ' '
$(PROG_MEMCHECK): $(DEPENDENCIES)
$(CC) $(CFLAGS) -Og -I./include -I$(MPI_INC) $(SOURCES) -o $@ -L$(MPI_LIB) $(LIBS)
@echo ' '
@echo 'Program' $(PROG_MEMCHECK) 'compiled for' $(SYSTYPE) 'machine'
@echo ' '
$(PROG_CALLGRIND): $(DEPENDENCIES)
$(CC) $(CFLAGS) -g -O3 -I./include -I$(MPI_INC) $(SOURCES) -o $@ -L$(MPI_LIB) $(LIBS)
@echo ' '
@echo 'Program' $(PROG_CALLGRIND) 'compiled for' $(SYSTYPE) 'machine'
@echo ' '
$(PROG_CACHEGRIND): $(DEPENDENCIES)
$(CC) $(CFLAGS) -g -O3 -I./include -I$(MPI_INC) $(SOURCES) -o $@ -L$(MPI_LIB) $(LIBS)
@echo ' '
@echo 'Program' $(PROG_CACHEGRIND) 'compiled for' $(SYSTYPE) 'machine'
@echo ' '
# set the MPI install path
MPI_INSTALL_PATH = /home/gozzilla/software/openmpi/openmpi-5.0.3
MPI = $(MPI_INSTALL_PATH)
MPI_INC = $(MPI)/include
MPI_LIB = $(MPI)/lib
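# NOTE: adjust MPI_INSTALL_PATH to the OpenMPI (or other MPI) installation
# available on the target machine; the path above is machine-specific.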
/* ///////////////////////////////////////////////////////////////////// */
/* Authors: A. Mignone (mignone@to.infn.it) */
/* V. Cesare (valentina.cesare@inaf.it) */
/* D. Goz (david.goz@inaf.it) */
/* */
/* Date : June 2024 */
/* */
/* ///////////////////////////////////////////////////////////////////// */
#include "allvars.h"
#include "tools.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
/* #define DEBUG */
typedef struct MyGrid
{
int start[NDIM];
int end[NDIM];
} myDomain;
typedef struct Task_2D_Cartesian
{
int rank;
int nranks;
int coords[NDIM];
myDomain domain;
int nbrtop;
int nbrbottom;
int nbrleft;
int nbrright;
} Task;
/* function prototypes */
void BoundaryConditions(MyData **const restrict,
MyData *const restrict,
MyData *const restrict,
const int,
const int);
void JacobiAlgorithm(MyData **const restrict, MyData **const restrict, const MyData *restrict,
const int, const int, const int, const int, MyData *const restrict);
void Jacobi_Communication(MyData **const restrict Phi,
MyData **const restrict Phi0,
MyData *const restrict error,
const MyData * restrict delta,
const int grid_y,
Task *const restrict ThisTask,
MPI_Comm *const restrict comm);
void get_domains(MyData **const buffer,
Task *const ThisTask,
const MPI_Comm comm);
void WriteSolution(MyData **const phi, const int nx, const int ny);
int main(int argc, char **argv)
{
int rank, Nranks;
const MPI_Comm comm = MPI_COMM_WORLD;
MPI_Init(&argc, &argv);
MPI_Comm_rank(comm, &rank);
MPI_Comm_size(comm, &Nranks);
if (argc <= 2)
{
/* every rank must bail out; only the master prints the usage message */
if (!rank)
printf("\n\t Usage: <executable> <grid_size> <cartesian communicator size_X>\n\n");
MPI_Abort(comm, EXIT_FAILURE);
exit(EXIT_FAILURE);
}
/* global X and Y grid size (a square grid is assumed) */
const int NX_GLOB = (int) strtol(argv[1], NULL, 10);
const int NY_GLOB = NX_GLOB;
/********************************** Cartesian topology ******************************************************/
const int cartesian_grid_x = (int)strtol(argv[2], NULL, 10);
const int cartesian_grid_y = ((Nranks % cartesian_grid_x == 0) ? (Nranks / cartesian_grid_x) : -1);
if (cartesian_grid_y == -1)
{
if (!rank)
printf("\n\t Nranks mod cartesian_grid_x != 0 ... aborting ...\n");
/* abort from every rank, not only from the master */
MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
exit(EXIT_FAILURE);
}
const int dims[NDIM] = {cartesian_grid_x, cartesian_grid_y};
const int periods[NDIM] = {0, 0}; /* not periodic */
const int reorder = 0; /* setting reorder to a non-zero (true) value allows MPI to */
/* reorder the process ranks so that neighbors are mapped */
/* onto the actual hardware topology for better performance */
/* make a new communicator to which topology information has been attached */
MPI_Comm comm2d = MPI_COMM_NULL;
MPI_Cart_create(MPI_COMM_WORLD, NDIM, dims, periods, reorder, &comm2d);
if (comm2d == MPI_COMM_NULL)
{
printf("\n\t Process %d is not part of the new communicator \n", rank);
fflush(stdout);
MPI_Abort(comm, EXIT_FAILURE);
exit(EXIT_FAILURE);
}
Task ThisTask;
/* get the comm size */
MPI_Comm_size(comm2d, &ThisTask.nranks);
/* determine the process coords in cartesian topology given rank in group */
MPI_Cart_coords(comm2d, rank, NDIM, ThisTask.coords);
/* determines process rank in communicator given Cartesian location */
MPI_Cart_rank(comm2d, ThisTask.coords, &ThisTask.rank);
/* get bottom and top neighbors (X direction) */
MPI_Cart_shift(comm2d, X, 1, &ThisTask.nbrbottom, &ThisTask.nbrtop);
/* get left and right neighbors (Y direction) */
MPI_Cart_shift(comm2d, Y, 1, &ThisTask.nbrleft, &ThisTask.nbrright);
/************************************************************************************************************/
/* 2D MPI-Cartesian decomposition:
the grids are replicated across the MPI processes.
This approach is not the most efficient in terms of memory usage,
because the arrays are replicated on every MPI process instead of being distributed */
/* get the remainder, i.e. take into account an uneven
decomposition of points among the processes */
const int rem[NDIM] = {(NX_GLOB + 2) % dims[X],
(NY_GLOB + 2) % dims[Y]};
/* get the amount of data for each MPI process:
- chunk is supposed to be >= 1 */
const int chunk[NDIM] = {(NX_GLOB + 2 - rem[X]) / dims[X],
(NY_GLOB + 2 - rem[Y]) / dims[Y]};
if ((chunk[X] < 1) || (chunk[Y] < 1))
{
printf("\n\t chunk[X] < 1 || chunk[Y] < 1 ... aborting ...[rank: %d]\n", rank);
MPI_Comm_free(&comm2d);
MPI_Abort(comm, EXIT_FAILURE);
exit(EXIT_FAILURE);
}
/* get the subgrid dimension along X and Y directions: */
int incr[NDIM], offset[NDIM];
for (int dim=0 ; dim<NDIM ; dim++)
{
if (ThisTask.coords[dim] < rem[dim])
{
incr[dim] = chunk[dim] + 1;
offset[dim] = 0;
}
else
{
incr[dim] = chunk[dim];
offset[dim] = rem[dim];
}
}
/* subdomain managed by the task */
for (int dim=0 ; dim<NDIM ; dim++)
{
ThisTask.domain.start[dim] = ((ThisTask.coords[dim] * incr[dim]) + offset[dim]);
ThisTask.domain.end[dim] = (ThisTask.domain.start[dim] + incr[dim]) - 1;
/* boundaries */
ThisTask.domain.start[dim] = ((ThisTask.domain.start[dim] == 0) ? NGHOST : ThisTask.domain.start[dim]);
ThisTask.domain.end[dim] = ((ThisTask.domain.end[dim] == NX_GLOB + 1) ? NX_GLOB : ThisTask.domain.end[dim]);
}
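/* worked example (the Makefile's valgrind targets run "mpirun -n 4 ... 9 2"):
NX_GLOB = 9 and dims = {2, 2} give (NX_GLOB + 2) = 11 points per direction,
rem = 11 % 2 = 1 and chunk = (11 - 1) / 2 = 5;
coords 0: incr = 6, offset = 0 -> start = 0, end = 5  -> clamped to [1, 5]
coords 1: incr = 5, offset = 1 -> start = 6, end = 10 -> clamped to [6, 9]
so the interior points 1..NX_GLOB are covered exactly once */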
#if defined(DEBUG)
for (int task=0 ; task<ThisTask.nranks ; task++)
{
if (ThisTask.rank == task)
{
printf("\n\t rank: %d", task);
printf("\n\t\t coords = [%d, %d]", ThisTask.coords[X], ThisTask.coords[Y]);
printf("\n\t\t domain.start[X] = %d - domain.end[X] = %d", ThisTask.domain.start[X], ThisTask.domain.end[X]);
printf("\n\t\t domain.start[Y] = %d - domain.end[Y] = %d", ThisTask.domain.start[Y], ThisTask.domain.end[Y]);
printf("\n\t\t nbrtop = %d - nbrbottom = %d", ThisTask.nbrtop, ThisTask.nbrbottom);
printf("\n\t\t nbrleft = %d - nbrright = %d\n", ThisTask.nbrleft, ThisTask.nbrright);
fflush(stdout);
}
MPI_Barrier(comm2d);
}
#endif /* DEBUG */
/******************************************************************************************************/
const MyData xbeg = 0.0;
const MyData xend = 1.0;
const MyData ybeg = 0.0;
const MyData yend = 1.0;
const MyData delta[NDIM] = {(xend - xbeg)/(NX_GLOB + 1),
(yend - ybeg)/(NY_GLOB + 1)};
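/* delta is the uniform mesh spacing h = 1/(N_GLOB + 1) in each direction of the unit square [0,1] x [0,1] */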
/* --------------------------------------------------------
1. Set grid indices
-------------------------------------------------------- */
const int ibeg = NGHOST;
const int iend = ibeg + NX_GLOB - 1;
const int nx = iend - ibeg + 1;
const int nx_tot = nx + 2 * NGHOST;
const int jbeg = NGHOST;
const int jend = jbeg + NY_GLOB - 1;
const int ny = jend - jbeg + 1;
const int ny_tot = ny + 2 * NGHOST;
if (rank == MASTER)
{
printf("\n\t Grid indices:");
printf("\n\t\t ibeg, iend = %d, %d; nx_tot = %d" ,ibeg, iend, nx_tot);
printf("\n\t\t jbeg, jend = %d, %d; ny_tot = %d\n\n",jbeg, jend, ny_tot);
}
/* --------------------------------------------------------
2. Generate grid, allocate memory
Not optimized because the grids are (unnecessarily)
replicated across MPI processes
-------------------------------------------------------- */
/* memory allocation */
MyData *xg = (MyData *) malloc((NX_GLOB + 2*NGHOST) * sizeof(MyData));
MyData *yg = (MyData *) malloc((NY_GLOB + 2*NGHOST) * sizeof(MyData));
assert((xg != NULL) && (yg != NULL));
/* global grid coordinates */
for (int i=0 ; i<(NX_GLOB + 2*NGHOST) ; i++) xg[i] = xbeg + (i - ibeg + 1) * delta[X];
for (int j=0 ; j<(NY_GLOB + 2*NGHOST) ; j++) yg[j] = ybeg + (j - jbeg + 1) * delta[Y];
MyData *x = xg; /* global and local grids coincide because */
MyData *y = yg; /* the grid is replicated on every process  */
/* grids memory allocation */
MyData **phi = Allocate_2DdblArray(ny_tot, nx_tot);
MyData **phi0 = Allocate_2DdblArray(ny_tot, nx_tot);
/* --------------------------------------------------------
3. Initialize solution array to 0
-------------------------------------------------------- */
for (int j=jbeg ; j<=jend ; j++)
for (int i=ibeg ; i<=iend ; i++)
{
phi0[j][i] = 0.0;
phi[j][i] = 0.0;
}
/* --------------------------------------------------------
4. Main iteration cycle
-------------------------------------------------------- */
const double time_start = MPI_Wtime();
/* -- 4a. Set boundary conditions first -- */
BoundaryConditions(phi0, x, y, nx, ny);
BoundaryConditions(phi, x, y, nx, ny);
MyData err = 1.0;
/* iterations */
int k = 0;
while (1)
{
/* -- 4c. Jacobi's method and residual (interior points) -- */
/* core algorithm */
err = 0.0;
Jacobi_Communication(phi, phi0, &err, delta, NY_GLOB, &ThisTask, &comm2d);
if (!rank)
printf("\n\t Iteration = %d - err = %lg\n",k, err);
/* increase the counter of loop iterations */
k++;
/* get the total error */
MyData toterr;
/* combines values from all processes and distributes the result back to all processes */
MPI_Allreduce(&err, &toterr, 1, MPI_MyDatatype, MPI_SUM, comm2d);
/* check convergence */
if (toterr <= TOL)
{
/* master task gathers all the domains */
get_domains(phi, &ThisTask, comm2d);
break;
}
/* swap the pointers */
MyData **tmp = phi;
phi = phi0;
phi0 = tmp;
}
/* master rank writes the solution */
if (!rank)
{
WriteSolution(phi, nx, ny);
printf("\n\t NX_GLOB x NY_GLOB = %d x %d\n", NX_GLOB, NY_GLOB);
printf("\n\t Time = %lf [s]\n\n", MPI_Wtime() - time_start);
}
// free memory
if (phi0)
{
free(phi0[0]);
free(phi0);
}
if (phi)
{
free(phi[0]);
free(phi);
}
if (yg)
free(yg);
if (xg)
free(xg);
MPI_Comm_free(&comm2d);
MPI_Finalize();
return 0;
}
/* ********************************************************************* */
void BoundaryConditions(MyData **const restrict phi,
MyData *const restrict x,
MyData *const restrict y,
const int nx,
const int ny)
/*
*********************************************************************** */
{
const int ibeg = NGHOST;
const int iend = ibeg + nx - 1;
const int jbeg = NGHOST;
const int jend = jbeg + ny - 1;
int i,j;
/* -- Left -- */
i = ibeg - 1;
for (int j=jbeg ; j<=jend ; j++)
phi[j][i] = (1.0 - y[j]);
/* -- Right -- */
i = iend + 1;
for (int j=jbeg ; j<=jend ; j++)
phi[j][i] = (y[j] * y[j]);
/* -- Bottom -- */
j = jbeg - 1;
for (int i=ibeg ; i<=iend ; i++)
phi[j][i] = (1.0 - x[i]);
/* -- Top -- */
j = jend + 1;
for (int i=ibeg ; i<=iend ; i++)
phi[j][i] = x[i];
return;
}
/* ********************************************************************* */
void JacobiAlgorithm(MyData **const restrict Phi,
MyData **const restrict Phi0,
const MyData *restrict delta,
const int jbeg,
const int jend,
const int ibeg,
const int iend,
MyData *const restrict error)
{
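/* Jacobi update for the 2D Laplace equation: each point is replaced by the
average of its four neighbours taken from the previous iterate Phi0, and the
residual is accumulated as the delta-weighted L1 norm of the change between
the two iterates */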
for (int j=jbeg ; j<=jend ; j++)
{
for (int i=ibeg ; i<=iend ; i++)
{
Phi[j][i] = 0.25 * (Phi0[j][i-1] + Phi0[j][i+1] +
Phi0[j-1][i] + Phi0[j+1][i]);
*error += delta[X] * delta[Y] * fabs(Phi[j][i] - Phi0[j][i]);
} /* loop over columns */
} /* loop over rows */
return;
}
void Jacobi_Communication(MyData **const restrict Phi,
MyData **const restrict Phi0,
MyData *const restrict error,
const MyData * restrict delta,
const int grid_y,
Task *const restrict ThisTask,
MPI_Comm *const restrict comm)
{
/* custom datatype */
MPI_Datatype column;
MPI_Type_vector((ThisTask->domain.end[X] - ThisTask->domain.start[X] + 1), 1, grid_y + 2, MPI_MyDatatype, &column);
/* commits the datatype */
MPI_Type_commit(&column);
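/* "column" describes one element per local row with a stride of (grid_y + 2)
elements, i.e. the padded row length of the (square) global grid, so that a
whole left/right ghost column can be exchanged with a single message */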
const int data_row_size = (ThisTask->domain.end[Y] - ThisTask->domain.start[Y] + 1);
/* first step: post the halo-exchange communication */
MPI_Request request[8];
MyData **const restrict buffer = Phi0;
/* 4 nonblocking MPI_Irecv */
MPI_Irecv(&buffer[ThisTask->domain.start[X] - 1][ThisTask->domain.start[Y] ], data_row_size, MPI_MyDatatype, ThisTask->nbrbottom, 0, *comm, &request[0]);
MPI_Irecv(&buffer[ThisTask->domain.end[X] + 1][ThisTask->domain.start[Y] ], data_row_size, MPI_MyDatatype, ThisTask->nbrtop, 1, *comm, &request[1]);
MPI_Irecv(&buffer[ThisTask->domain.start[X] ][ThisTask->domain.start[Y] - 1], 1, column, ThisTask->nbrleft, 2, *comm, &request[2]);
MPI_Irecv(&buffer[ThisTask->domain.start[X] ][ThisTask->domain.end[Y] + 1], 1, column, ThisTask->nbrright, 3, *comm, &request[3]);
/* 4 nonblocking MPI_Isend */
MPI_Isend(&buffer[ThisTask->domain.end[X] ][ThisTask->domain.start[Y] ], data_row_size, MPI_MyDatatype, ThisTask->nbrtop, 0, *comm, &request[4]);
MPI_Isend(&buffer[ThisTask->domain.start[X] ][ThisTask->domain.start[Y] ], data_row_size, MPI_MyDatatype, ThisTask->nbrbottom, 1, *comm, &request[5]);
MPI_Isend(&buffer[ThisTask->domain.start[X] ][ThisTask->domain.end[Y] ], 1, column, ThisTask->nbrright, 2, *comm, &request[6]);
MPI_Isend(&buffer[ThisTask->domain.start[X] ][ThisTask->domain.start[Y] ], 1, column, ThisTask->nbrleft, 3, *comm, &request[7]);
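/* tags pair each MPI_Isend with the matching MPI_Irecv on the neighbour:
as seen by the receiver, tag 0 fills the bottom ghost row, tag 1 the top
ghost row, tag 2 the left ghost column and tag 3 the right ghost column */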
/**************************************** computation ****************************************/
/* update the interior of the local domain, which does not require ghost cells, */
/* thus overlapping computation and communication; the first/last local row and */
/* column touch ghost values and are updated later, as each receive completes   */
const int jbeg = ThisTask->domain.start[X] + 1;
const int jend = ThisTask->domain.end[X] - 1;
const int ibeg = ThisTask->domain.start[Y] + 1;
const int iend = ThisTask->domain.end[Y] - 1;
*error = 0.0;
JacobiAlgorithm(Phi, Phi0, delta, jbeg, jend, ibeg, iend, error);
/**********************************************************************************************/
/* wait for the four receives in whatever order they complete; the request
index identifies which ghost region is now valid (the status of a completed
send carries no meaningful tag, so it cannot be used to dispatch) */
for (int req=0 ; req<4 ; req++)
{
int idx;
MPI_Waitany(4, request, &idx, MPI_STATUS_IGNORE);
switch(idx)
{
/* receive with tag 0 completed */
case 0:
/* ghost row from nbrbottom has been received */
JacobiAlgorithm(Phi, Phi0, delta, (jbeg - 1), (jbeg - 1), (ibeg - 1), (iend + 1), error);
break;
/* receive with tag 1 completed */
case 1:
/* ghost row from nbrtop has been received */
JacobiAlgorithm(Phi, Phi0, delta, (jend + 1), (jend + 1), (ibeg - 1), (iend + 1), error);
break;
/* receive with tag 2 completed */
case 2:
/* ghost column from nbrleft has been received */
JacobiAlgorithm(Phi, Phi0, delta, (jbeg - 1), (jend + 1), (ibeg - 1), (ibeg - 1), error);
break;
/* receive with tag 3 completed */
case 3:
/* ghost column from nbrright has been received */
JacobiAlgorithm(Phi, Phi0, delta, (jbeg - 1), (jend + 1), (iend + 1), (iend + 1), error);
break;
}
}
/* complete the four sends before Phi0 is swapped and reused */
MPI_Waitall(4, &request[4], MPI_STATUSES_IGNORE);
MPI_Type_free(&column);
return;
}
/* ********************************************************************* */
void get_domains(MyData **const buffer,
Task *const ThisTask,
const MPI_Comm comm)
{
/* Master process gathers all the domains */
/***************************** get the domain boundaries from each process *************************************/
myDomain *boundaries = NULL;
if (ThisTask->rank == MASTER)
{
boundaries = (myDomain *)malloc(ThisTask->nranks * sizeof(*boundaries));
assert(boundaries != NULL);
}
MPI_Gather(&ThisTask->domain, sizeof(myDomain), MPI_BYTE,
boundaries, sizeof(myDomain), MPI_BYTE,
MASTER, comm);
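/* MASTER now knows the sub-domain boundaries of every rank and can size
the receive buffer and displacements for the following MPI_Gatherv */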
/**************************************************************************************************/
/*************************************** get the domain from each process *************************/
/* MPI_Gatherv
Gathers into specified locations from all processes in a group
Synopsis
int MPI_Gatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
void *recvbuf, const int recvcounts[], const int displs[],
MPI_Datatype recvtype, int root, MPI_Comm comm) */
int *recvcounts = NULL;
int *displs = NULL;
MyData *recvbuf = NULL;
MyData *sendbuf = NULL;
if (ThisTask->rank == MASTER)
{
recvcounts = (int *)malloc(ThisTask->nranks * sizeof(*recvcounts));
assert(recvcounts != NULL);
displs = (int *)malloc(ThisTask->nranks * sizeof(*displs));
assert(displs != NULL);
for (int task=0 ; task<ThisTask->nranks ; task++)
{
const int nrows = (boundaries[task].end[X] - boundaries[task].start[X] + 1);
const int ncols = (boundaries[task].end[Y] - boundaries[task].start[Y] + 1);
recvcounts[task] = (nrows * ncols);
}
displs[0] = 0;
for (int task=1 ; task<ThisTask->nranks ; task++)
displs[task] = (recvcounts[task - 1] + displs[task - 1]);
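/* e.g. for the 9 x 9 grid on 4 ranks the local sub-grids are
{5x5, 5x4, 4x5, 4x4}, hence recvcounts = {25, 20, 20, 16}
and displs = {0, 25, 45, 65} */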
/* allocate the recvbuffer */
recvbuf = (MyData *)malloc((displs[ThisTask->nranks - 1] + recvcounts[ThisTask->nranks - 1]) * sizeof(*recvbuf));
assert(recvbuf != NULL);
} /* MASTER */
/* every process sets up the sendbuf */
const int nrows = (ThisTask->domain.end[X] - ThisTask->domain.start[X] + 1);
const int ncols = (ThisTask->domain.end[Y] - ThisTask->domain.start[Y] + 1);
sendbuf = (MyData *)malloc(nrows * ncols * sizeof(*sendbuf));
assert(sendbuf != NULL);
/* store the actual grid owned by each process into sendbuf */
for (int row=0 ; row<nrows ; row++)
memcpy(&sendbuf[row * ncols], &buffer[ThisTask->domain.start[X] + row][ThisTask->domain.start[Y]], (ncols * sizeof(MyData)));
/* perform the MPI collective */
MPI_Gatherv(sendbuf, (nrows * ncols), MPI_MyDatatype,
recvbuf, recvcounts, displs, MPI_MyDatatype,
MASTER, comm);
/* store the recvbuf into the actual grid owned by the MASTER task */
if (ThisTask->rank == MASTER)
{
for (int task=1 ; task<ThisTask->nranks ; task++)
{
const int nrows = (boundaries[task].end[X] - boundaries[task].start[X] + 1);
const int ncols = (boundaries[task].end[Y] - boundaries[task].start[Y] + 1);
for (int row=0 ; row<nrows ; row++)
memcpy(&buffer[boundaries[task].start[X] + row][boundaries[task].start[Y]],
&recvbuf[displs[task] + (row * ncols)], (ncols * sizeof(MyData)));
}
}
free(sendbuf);
if (ThisTask->rank == MASTER)
{
free(recvbuf);
free(displs);
free(recvcounts);
free(boundaries);
}
return;
}
/* ********************************************************************* */
void WriteSolution(MyData **const phi,
const int nx,
const int ny)
/*
*********************************************************************** */
{
const int ibeg = NGHOST;
const int jbeg = NGHOST;
const int jend = jbeg + ny - 1;
static int nfile = 0; /* File counter */
char fname[128];
sprintf(fname,"jacobi_2D_mpi_comp_comm_%02d.bin", nfile);
FILE *fp;
printf ("> Writing %s\n",fname);
fp = fopen(fname, "wb");
/* discard boundaries */
for (int j=jbeg ; j<=jend ; j++)
{
fwrite (phi[j] + ibeg, sizeof(MyData), nx, fp);
}
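/* the file thus contains ny rows of nx MyData values in row-major order,
with the ghost frame stripped */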
nfile++;
fclose(fp);
}
/* ///////////////////////////////////////////////////////////////////// */
/* Authors: A. Mignone (mignone@to.infn.it) */
/* V. Cesare (valentina.cesare@inaf.it) */
/* D. Goz (david.goz@inaf.it) */
/* */
/* Date : June 2024 */
/* */
/* ///////////////////////////////////////////////////////////////////// */
#include "tools.h"
/* ********************************************************************* */
MyData **Allocate_2DdblArray(const int nx, const int ny)
/*
* Allocate memory for an nx-by-ny array of MyData
* (one contiguous block plus an array of row pointers)
*********************************************************************** */
{
MyData **buf = malloc(nx * sizeof(MyData *));
assert(buf != NULL);
buf[0] = (MyData *) malloc(nx * ny * sizeof(MyData));
assert(buf[0] != NULL);
for (int j=1 ; j<nx ; j++)
buf[j] = buf[j-1] + ny;
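/* the single contiguous block is what the MPI_Type_vector stride and the
row-wise memcpy packing in the solver rely on */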
return buf;
}
/* ********************************************************************* */
void Show_2DdblArray(const MyData **const A,
const int nx,
const int ny,
const char *const string)
/* *********************************************************************** */
{
printf ("%s\n",string);
printf ("------------------------------\n");
for (int i=0 ; i<nx ; i++)
{
for (int j=0 ; j<ny ; j++)
{
printf ("%8.2f ", A[i][j]);
}
printf ("\n");
}
printf ("------------------------------\n");
}
/* ********************************************************************* */
double seconds(void)
{
struct timeval tmp;
gettimeofday(&tmp, (struct timezone *)0);
double sec = tmp.tv_sec + ((double)tmp.tv_usec)/1000000.0;
return sec;
}