diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..c18d596b8fd9d43974ccc340974174d7b0d22cbc
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+# Files that are not meant to be committed (text output, editor backups)
+*.txt
+*~
diff --git a/jacobi/mpi/SendRecv/Makefile b/jacobi/mpi/SendRecv/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..4d38d38eabe9aa2bade6928c4395d23debcbd4e0
--- /dev/null
+++ b/jacobi/mpi/SendRecv/Makefile
@@ -0,0 +1,57 @@
+#######################################################################
+# Author: David Goz (david.goz@inaf.it)                               #
+# June  2024                                                          #
+#######################################################################
+#
+# To see all the compilation options
+# $ make info
+#######################################################################
+
+# make.def defines how the application is compiled
+include make.def
+include make_mpi_path
+#######################################################################
+
+.PHONY: info mpi valgrind_memcheck valgrind_callgrind valgrind_cachegrind clean
+
+info:
+	@echo ' '
+	@echo '-----------------------------------------------------------------------------------------'
+	@echo '$$ make mpi                 ---> compile the mpi application                             '
+	@echo '$$ make valgrind_memcheck   ---> run the mpi application using Valgrind under Memcheck   '
+	@echo '$$ make valgrind_callgrind  ---> run the mpi application using Valgrind under Callgrind  '
+	@echo '$$ make valgrind_cachegrind ---> run the mpi application using Valgrind under Cachegrind '
+	@echo '$$ make clean               ---> clean up all                                            '
+	@echo '$$ make info                ---> get make info                                           '
+	@echo '-----------------------------------------------------------------------------------------'
+	@echo ' '
+
+mpi: $(PROG)
+
+valgrind_memcheck: $(PROG_MEMCHECK)
+	@echo 'oooOOO... valgrind_memcheck ...OOOooo'
+	mpirun -n 2 valgrind --tool=memcheck -s --default-suppressions=yes --log-file=valgrind_memcheck_log_%p.txt ./$< 5 5
+	@echo 'oooOOO... valgrind_memcheck ...OOOooo'
+
+valgrind_callgrind: $(PROG_CALLGRIND)
+	@echo 'oooOOO... valgrind_callgrind ...OOOooo'
+	mpirun -n 2 valgrind --tool=callgrind --dump-instr=yes --collect-jumps=yes --log-file=valgrind_callgrind_log_.%p.txt ./$< 128 128
+	@echo ' '
+	@echo 'To generate a function-by-function summary from the profile data file:'
+	@echo '$$ callgrind_annotate --auto=yes callgrind.out.<pid> | less'
+	@echo '(kcachegrind is required in order to visualize the output using the GUI)'
+
+valgrind_cachegrind: $(PROG_CACHEGRIND)
+	@echo 'oooOOO... valgrind_cachegrind ...OOOooo'
+	mpirun -n 2 valgrind --tool=cachegrind --log-file=valgrind_cachegrind_log_.%p.txt ./$< 128 128
+	@echo '$$ cg_annotate --auto=yes cachegrind.out.<pid> | less'
+	@echo '(kcachegrind is required in order to visualize the output using the GUI)'
+	@echo 'oooOOO... valgrind_cachegrind ...OOOooo'
+
+clean:
+	rm -f *~ .*~ ./src/*~ ./src/*# ./include/*~ ./include/*#
+	rm -f $(PROG) $(PROG_DEBUG) $(PROG_MEMCHECK) $(PROG_CALLGRIND) $(PROG_CACHEGRIND)
+	rm -f valgrind_*.txt
+	rm -f cachegrind.out.*
+	rm -f callgrind.*
+	rm -f *bin
diff --git a/jacobi/mpi/SendRecv/include/allvars.h b/jacobi/mpi/SendRecv/include/allvars.h
new file mode 100644
index 0000000000000000000000000000000000000000..798e4e6a2f2a740b3ab5a1394ac85eef8da1255c
--- /dev/null
+++ b/jacobi/mpi/SendRecv/include/allvars.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#define NGHOST   1
+#define NDIM     2 /* 2D */
+#define X        0
+#define Y        1
+#define TOL      1.e-5
+
+#include <mpi.h>
+
+#define MASTER 0
+
+#if defined(SINGLE_PRECISION)
+typedef float MyData;
+#define MPI_MyDatatype MPI_FLOAT
+#else
+/* double precision is assumed by default */
+typedef double MyData;
+#define MPI_MyDatatype MPI_DOUBLE
+#endif /* SINGLE_PRECISION */
diff --git a/jacobi/mpi/SendRecv/include/tools.h b/jacobi/mpi/SendRecv/include/tools.h
new file mode 100644
index 0000000000000000000000000000000000000000..82859ec511710474547355ede3fc38cf33a92154
--- /dev/null
+++ b/jacobi/mpi/SendRecv/include/tools.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "allvars.h"
+#include <assert.h>
+#include <sys/time.h>
+#include <time.h>
+#include <sys/stat.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/* function prototypes */
+MyData **Allocate_2DdblArray(const int nx, const int ny);
+void Show_2DdblArray(const MyData **const A,
+                     const int            nx,
+                     const int            ny,
+                     const char *const    string);
+
+double seconds(void);
diff --git a/jacobi/mpi/SendRecv/make.def b/jacobi/mpi/SendRecv/make.def
new file mode 100644
index 0000000000000000000000000000000000000000..21f903d0773e74a9bb7cac39f1e98e4687617678
--- /dev/null
+++ b/jacobi/mpi/SendRecv/make.def
@@ -0,0 +1,45 @@
+CC     ?= gcc
+CFLAGS ?= -Wall -Wextra -march=native
+LIBS   ?= -lm -lmpi
+
+SYSTYPE = $(strip $(shell uname -n))
+
+PROG           ?= jacobi_mpi_SendRecv_$(SYSTYPE)
+PROG_DEBUG      = $(PROG)_DEBUG
+PROG_MEMCHECK   = $(PROG)_MEMCHECK
+PROG_CALLGRIND  = $(PROG)_CALLGRIND
+PROG_CACHEGRIND = $(PROG)_CACHEGRIND
+
+HEADERS       = $(wildcard ./include/*.h)
+SOURCES       = $(wildcard ./src/*.c)
+DEPENDENCIES  = $(SOURCES) $(HEADERS) Makefile
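+# CC, CFLAGS, LIBS and PROG are assigned with ?= above, so they can be
+# overridden from the command line or the environment; for example, a
+# hypothetical invocation such as
+#   $ make CC=mpicc
+# would build with the MPI compiler wrapper instead of plain gcc, while the
+# default build relies on the MPI paths provided by make_mpi_path.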
+
+$(PROG): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -O3 -I./include -I$(MPI_INC) $(SOURCES) -o $@ -L$(MPI_LIB) $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_DEBUG): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -Og -ggdb3 -fno-omit-frame-pointer -I./include -I$(MPI_INC) $(SOURCES) -o $@ -L$(MPI_LIB) $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_DEBUG) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_MEMCHECK): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -Og -I./include -I$(MPI_INC) $(SOURCES) -o $@ -L$(MPI_LIB) $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_MEMCHECK) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_CALLGRIND): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -g -O3 -I./include -I$(MPI_INC) $(SOURCES) -o $@ -L$(MPI_LIB) $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_CALLGRIND) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_CACHEGRIND): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -g -O3 -I./include -I$(MPI_INC) $(SOURCES) -o $@ -L$(MPI_LIB) $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_CACHEGRIND) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
diff --git a/jacobi/mpi/SendRecv/make_mpi_path b/jacobi/mpi/SendRecv/make_mpi_path
new file mode 100644
index 0000000000000000000000000000000000000000..f2c3de9e0d1796034a518159f4c53a40e76919fc
--- /dev/null
+++ b/jacobi/mpi/SendRecv/make_mpi_path
@@ -0,0 +1,9 @@
+# set the MPI install path
+
+MPI_INSTALL_PATH = /home/gozzilla/software/openmpi/openmpi-5.0.3
+
+
+
+MPI        = $(MPI_INSTALL_PATH)
+MPI_INC    = $(MPI)/include
+MPI_LIB    = $(MPI)/lib
\ No newline at end of file
diff --git a/jacobi/mpi/SendRecv/src/jacobi_2D_mpi_sendrecv.c b/jacobi/mpi/SendRecv/src/jacobi_2D_mpi_sendrecv.c
new file mode 100644
index 0000000000000000000000000000000000000000..9c57bafb03ac4b795d242921ea13cb62fa36c956
--- /dev/null
+++ b/jacobi/mpi/SendRecv/src/jacobi_2D_mpi_sendrecv.c
@@ -0,0 +1,505 @@
+/* ///////////////////////////////////////////////////////////////////// */
+/* Authors:  A. Mignone (mignone@to.infn.it)                             */
+/*           V. Cesare  (valentina.cesare@inaf.it)                       */
+/*           D. Goz     (david.goz@inaf.it)                              */
+/*                                                                       */
+/* Date   : June 2024                                                    */
+/*                                                                       */
+/* ///////////////////////////////////////////////////////////////////// */
+
+#include "allvars.h"
+#include "tools.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+/* #define DEBUG */
+
+typedef struct MyRows
+{
+  int start;
+  int end;
+} myDomain;
+
+/* function prototypes */
+void BoundaryConditions(MyData **const restrict,
+			MyData  *const restrict,
+			MyData  *const restrict,
+			const int,
+			const int);
+
+void JacobiAlgorithm(MyData **const restrict, MyData **const restrict, MyData *const restrict,
+		     const MyData *const restrict, const int, const int, const int, const int);
+
+void get_domains(MyData  **const buffer,
+		 myDomain *const domain,
+		 const int       grid_dim,
+		 const int       rank,
+		 const int       nranks,
+		 const MPI_Comm  comm);
+
+void mpi_exchange_sendrecv_1d(MyData **const buffer,
+			      const int      n,
+			      const int      nbrtop,
+			      const int      nbrbottom,
+			      const int      start,
+			      const int      end,
+			      const MPI_Comm comm1d);
+
+void WriteSolution(MyData **const phi, const int nx, const int ny);
+
+void copy_grids(MyData **const restrict A,
+                MyData **const restrict B,
+                const int               xbeg,
+                const int               xend,
+                const int               ybeg,
+		const int               yend);
+
+int main(int argc, char **argv)
+{
+  int rank, Nranks;
+  const MPI_Comm comm = MPI_COMM_WORLD;
+  MPI_Init(&argc, &argv);
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &Nranks);
+
+  if (argc <= 1)
+    {
+      if (!rank)
+	printf("\n\t Usage: <executable> <grid_size> \n\n");
+      MPI_Abort(comm, EXIT_FAILURE);
+      exit(EXIT_FAILURE);
+    }
+
+  /* global X and Y grid size (square matrix supposed) */
+  const int NX_GLOB = (int) strtol(argv[1], NULL, 10);
+  const int NY_GLOB = NX_GLOB;
+
+  /******************************************************************************************************/
+  /* 1D MPI-decomposition:
+     the physical domain is sliced into slabs along the vertical direction,
+     while the grids are replicated across the MPI processes.
+     This approach is not the most efficient in terms of memory usage,
+     because the arrays are replicated across the MPI processes instead of being distributed */
+
+  /* get the remainder, i.e. account for an uneven
+     decomposition of points among the processes   */
+  const int rem = (NX_GLOB + 2) % Nranks;
+
+  /* get the amount of data for each MPI process:
+     - chunk is supposed to be >= 1 */
+  const int chunk = (NX_GLOB + 2 - rem) / Nranks;
+  if (chunk < 1)
+    {
+      printf("\n\t chunk < 1 ... aborting ...[rank: %d]\n", rank);
+      MPI_Abort(comm, EXIT_FAILURE);
+      exit(EXIT_FAILURE);
+    }
+  
+  /* get the slab dimension along vertical direction: */
+  int incr, offset;
+  if (rank < rem)
+    {
+      incr = chunk + 1;
+      offset = 0;
+    }
+  else
+    {
+      incr = chunk;
+      offset = rem;
+    }
+
+  myDomain domain;
+  domain.start = ((rank * incr) + offset);
+  domain.end   = (domain.start + incr) - 1;
+  /* boundaries */
+  domain.start = ((domain.start == 0)         ? NGHOST  : domain.start);
+  domain.end   = ((domain.end == NX_GLOB + 1) ? NX_GLOB : domain.end);
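+
+  /* Illustrative example (not part of the computation): with NX_GLOB = 10
+     and Nranks = 3, the (NX_GLOB + 2) = 12 rows give rem = 0 and chunk = 4,
+     so the ranks initially get rows [0-3], [4-7] and [8-11]; after the
+     boundary adjustment above, the owned interior rows become
+     rank 0 -> [1,3], rank 1 -> [4,7], rank 2 -> [8,10], i.e. the global
+     ghost rows 0 and 11 are excluded from the Jacobi sweep. */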
+
+  /* rank of the top process */
+  const int nbrtop    = (((rank + 1) >= Nranks) ? MPI_PROC_NULL : (rank + 1));
+  /* rank of the bottom process */
+  const int nbrbottom = (((rank - 1) < 0)       ? MPI_PROC_NULL : (rank - 1));
+
+  /******************************************************************************************************/
+  
+  const MyData xbeg = 0.0;
+  const MyData xend = 1.0;
+  const MyData ybeg = 0.0;
+  const MyData yend = 1.0;
+
+  const MyData delta[NDIM] = {(xend - xbeg)/(NX_GLOB + 1),
+			      (yend - ybeg)/(NY_GLOB + 1)};
+
+  /* --------------------------------------------------------
+     1. Set grid indices
+     -------------------------------------------------------- */
+    
+  const int ibeg   = NGHOST;
+  const int iend   = ibeg + NX_GLOB - 1;
+  const int nx     = iend - ibeg + 1;
+  const int nx_tot = nx + 2 * NGHOST;
+    
+  const int jbeg   = NGHOST;
+  const int jend   = jbeg + NY_GLOB - 1;
+  const int ny     = jend - jbeg + 1;
+  const int ny_tot = ny + 2 * NGHOST;
+
+  if (rank == MASTER)
+    {
+      printf("\n\t Grid indices:");
+      printf("\n\t\t ibeg, iend = %d, %d; nx_tot = %d"    ,ibeg, iend, nx_tot);
+      printf("\n\t\t jbeg, jend = %d, %d; ny_tot = %d\n\n",jbeg, jend, ny_tot);
+    }
+
+#if defined(DEBUG)
+  for (int task=0 ; task<Nranks ; task++)
+    {
+      MPI_Barrier(comm);
+      
+      if (task == rank)
+	{
+	  printf("\n\t rank: %d", rank);
+	  printf("\n\t\t domain.start: %d - domain.end: %d", domain.start, domain.end);
+	  printf("\n\t\t nbrtop: %d - nbrbottom: %d \n", nbrtop, nbrbottom);
+	  fflush(stdout);
+	}
+    }
+
+  MPI_Barrier(comm);
+#endif /* DEBUG */
+  
+  /* --------------------------------------------------------
+     2. Generate grid, allocate memory
+        Not optimized because the grids are (unnecessarily)
+	replicated across MPI processes
+     -------------------------------------------------------- */
+
+  /* memory allocation */
+  MyData *xg = (MyData *) malloc((NX_GLOB + 2*NGHOST) * sizeof(MyData));
+  MyData *yg = (MyData *) malloc((NY_GLOB + 2*NGHOST) * sizeof(MyData));
+  assert((xg != NULL) && (yg != NULL));
+
+  /* initial conditions */
+  for (int i=0 ; i<(NX_GLOB + 2*NGHOST) ; i++) xg[i] = xbeg + (i - ibeg + 1) * delta[X];
+  for (int j=0 ; j<(NY_GLOB + 2*NGHOST) ; j++) yg[j] = ybeg + (j - jbeg + 1) * delta[Y];
+  MyData *x = xg; /* Global and local grids are the same  */
+  MyData *y = yg; /* for serial version of the code       */
+
+  /* grids memory allocation */
+  MyData **phi  = Allocate_2DdblArray(ny_tot, nx_tot);
+  MyData **phi0 = Allocate_2DdblArray(ny_tot, nx_tot);
+    
+  /* --------------------------------------------------------
+     3. Initialize solution array to 0
+     -------------------------------------------------------- */
+    
+  for (int j=jbeg ; j<=jend ; j++)
+    for (int i=ibeg ; i<=iend ; i++)
+      {
+	phi0[j][i] = 0.0;
+	phi[j][i]  = 0.0;
+      }
+  
+  /* --------------------------------------------------------
+     4. Main iteration cycle
+     -------------------------------------------------------- */
+
+  const double time_start = MPI_Wtime();
+  
+  /* -- 4a. Set boundary conditions first -- */  
+  BoundaryConditions(phi0, x, y, nx, ny);
+  BoundaryConditions(phi, x, y, nx, ny);
+
+  MyData err = 1.0;
+  /* iterations */
+  int k = 0;
+  while (1)
+    {      
+      /* -- 4c. Jacobi's method and residual (interior points) -- */
+      /*       core algorithm                                     */
+
+      err = 0.0;
+      JacobiAlgorithm(phi, phi0, &err, delta,
+		      ibeg, iend, domain.start, domain.end);
+
+      if (!rank)
+	printf("\n\t Iteration = %d - err = %lg\n",k, err);
+
+      /* increase the counter of loop iterations */
+      k++;
+
+      /* get the total error */
+      MyData toterr;
+      /* combines values from all processes and distributes the result back to all processes */
+      MPI_Allreduce(&err, &toterr, 1, MPI_MyDatatype, MPI_SUM, comm);
+      
+      /* check convergence */
+      if (toterr <= TOL)
+	{
+	  /* master task gathers all the domains */
+	  get_domains(phi, &domain, NY_GLOB, rank, Nranks, comm);
+
+	  break;
+	}
+
+      /* -- 4b. MPI communications */
+      mpi_exchange_sendrecv_1d(phi, NX_GLOB,
+			       nbrtop, nbrbottom,
+			       domain.start, domain.end, comm);
+
+      /* swap the pointers */
+      MyData **tmp = phi;
+      phi = phi0;
+      phi0 = tmp;
+    }
+
+  /* master rank writes the solution */
+  if (!rank)
+    {
+      WriteSolution(phi, nx, ny);
+      
+      printf("\n\t NX_GLOB x NY_GLOB = %d x %d\n", NX_GLOB, NY_GLOB);
+      printf("\n\t Time = %lf [s]\n\n", MPI_Wtime() - time_start);
+    }
+
+  // free memory
+  if (phi0)
+    {
+      free(phi0[0]);
+      free(phi0);
+    }
+
+  if (phi)
+    {
+      free(phi[0]);
+      free(phi);
+    }
+
+  if (yg)
+    free(yg);
+  
+  if (xg)
+    free(xg);
+
+  MPI_Finalize();
+  
+  return 0;
+}
+
+/* ********************************************************************* */
+void BoundaryConditions(MyData **const restrict phi,
+			MyData  *const restrict x,
+			MyData  *const restrict y,
+                        const int               nx,
+			const int               ny)
+/*
+*********************************************************************** */
+{
+  const int ibeg = NGHOST;
+  const int iend = ibeg + nx - 1;
+    
+  const int jbeg = NGHOST;
+  const int jend = jbeg + ny - 1;
+
+  int i,j;
+  
+  /* -- Left -- */
+  i = ibeg - 1;
+  for (int j=jbeg ; j<=jend ; j++)
+    phi[j][i] = (1.0 - y[j]);
+    
+  /* -- Right -- */
+  i = iend + 1;
+  for (int j=jbeg ; j<=jend ; j++)
+    phi[j][i] = (y[j] * y[j]);
+    
+  /* -- Bottom -- */    
+  j = jbeg - 1;
+  for (int i=ibeg ; i<=iend ; i++)
+    phi[j][i] = (1.0 - x[i]);
+    
+  /* -- Top -- */
+  j = jend + 1;
+  for (int i=ibeg ; i<=iend ; i++)
+    phi[j][i] = x[i];
+
+  return;
+}
+
+/* ********************************************************************* */
+
+void JacobiAlgorithm(MyData      **const restrict Phi,
+		     MyData      **const restrict Phi0,
+		     MyData       *const restrict error,
+		     const MyData *const restrict delta,
+		     const int                    ibeg,
+		     const int                    iend,
+		     const int                    jbeg,
+		     const int                    jend)
+{
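+  /* Five-point Jacobi update for the Laplace equation: each interior point
+     is replaced by the average of its four neighbours taken from the
+     previous iterate Phi0, while the residual is accumulated as
+     sum |Phi - Phi0| * dx * dy over the sweep. */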
+  *error = 0.0;
+  for (int j=jbeg ; j<=jend ; j++)
+    {
+      for (int i=ibeg ; i<=iend ; i++)
+	{
+	  Phi[j][i] = 0.25 * (Phi0[j][i-1] + Phi0[j][i+1] +
+			      Phi0[j-1][i] + Phi0[j+1][i]);
+                
+	  *error += delta[X] * delta[Y] * fabs(Phi[j][i] - Phi0[j][i]);
+	} /* loop over columns */
+    } /* loop over rows */
+  
+  return;
+}
+
+/* ********************************************************************* */
+void mpi_exchange_sendrecv_1d(MyData **const buffer,
+			      const int      n,
+			      const int      nbrtop,
+			      const int      nbrbottom,
+			      const int      start,
+			      const int      end,
+			      const MPI_Comm comm1d)
+{
+  /* The function is called by each MPI rank      */
+  /* - nbrtop    is the MPI process with rank + 1 */
+  /* - nbrbottom is the MPI process with rank - 1 */
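+  /* Ghost-row exchange (schematic):
+     - first  MPI_Sendrecv: send the top interior row (end)        to   nbrtop,
+                            receive the bottom ghost row (start-1) from nbrbottom;
+     - second MPI_Sendrecv: send the bottom interior row (start)   to   nbrbottom,
+                            receive the top ghost row (end+1)      from nbrtop.
+     Ranks at the ends of the chain have MPI_PROC_NULL as neighbour, so the
+     corresponding send/receive completes immediately as a no-op. */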
+
+  MPI_Sendrecv(&buffer[end][1],     n, MPI_MyDatatype, nbrtop,    0,
+	       &buffer[start-1][1], n, MPI_MyDatatype, nbrbottom, 0,
+	       comm1d, MPI_STATUS_IGNORE);
+
+  MPI_Sendrecv(&buffer[start][1],   n, MPI_MyDatatype, nbrbottom, 1,
+	       &buffer[end+1][1],   n, MPI_MyDatatype, nbrtop,    1,
+	       comm1d, MPI_STATUS_IGNORE);
+  
+  return;
+}
+
+/* ********************************************************************* */
+
+void get_domains(MyData  **const buffer,
+		 myDomain *const domain,
+		 const int       grid_dim,
+		 const int       rank,
+		 const int       nranks,
+		 const MPI_Comm  comm)
+{
+  /* Master process gathers all the domains */
+  
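+  /* The gather proceeds in two stages: first every non-master rank sends its
+     [start, end] row range to the master, which stores the ranges in the
+     'boundaries' array; then every non-master rank sends its block of grid
+     rows, which the master receives directly into its copy of 'buffer'.
+     (A collective such as MPI_Gatherv could express the same pattern; the
+     explicit point-to-point version used here makes the two stages easy to
+     follow.) */
+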
+  /***************************** get the domain boundaries from each process *************************************/
+  myDomain *boundaries = NULL;
+  if (rank == MASTER)
+    {
+      boundaries = (myDomain *)malloc(nranks * sizeof(*boundaries));
+      if (boundaries == NULL)
+	{
+	  MPI_Abort(comm, EXIT_FAILURE);
+	  exit(EXIT_FAILURE);
+	}
+      boundaries[MASTER].start = domain->start;
+      boundaries[MASTER].end   = domain->end;
+    }
+
+  for (int task=0 ; task<nranks ; task++)
+    {
+      if (rank == MASTER)
+	{
+	  if (task)
+	    {
+	      MPI_Status status;
+	      MPI_Recv((void *)&boundaries[task], sizeof(myDomain), MPI_BYTE, task, 0, comm, &status);
+	      if ((status.MPI_ERROR != MPI_SUCCESS) || (status.MPI_SOURCE != task))
+		{
+		  free(boundaries);
+		  MPI_Abort(comm, EXIT_FAILURE);
+		  exit(EXIT_FAILURE);
+		}
+	    }
+#if defined(DEBUG)
+	  printf("\n\t boundaries[%d].start = %d - boundaries[%d].end = %d",
+		 task, boundaries[task].start, task, boundaries[task].end);
+#endif /* DEBUG */	  
+	} /* MASTER */
+      else if (task == rank)
+	{
+	  MPI_Send((void *)domain, sizeof(myDomain), MPI_BYTE, MASTER, 0, comm);
+	}
+    } /* loop over nranks */
+
+#if defined(DEBUG)
+  if (rank == MASTER)
+    {
+      printf("\n");
+      fflush(stdout);
+    }
+#endif /* DEBUG */
+
+  /**************************************************************************************************/
+
+  /***************************************** get the domain from each process *************************/
+
+  for (int task=0 ; task<nranks ; task++)
+    {
+      if (rank == MASTER)
+	{
+	  if (task)
+	    {
+	      MPI_Status status;
+	      /* number of grid points to receive (including ghost points) */
+	      const int nrows = (boundaries[task].end - boundaries[task].start + 1);
+	      const int elements = (nrows * (grid_dim + 2));
+	      MPI_Recv((void *)&buffer[boundaries[task].start][0], elements, MPI_MyDatatype, task, 0, comm, &status);
+	      if ((status.MPI_ERROR != MPI_SUCCESS) || (status.MPI_SOURCE != task))
+		{
+		  free(boundaries);
+		  MPI_Abort(comm, EXIT_FAILURE);
+		  exit(EXIT_FAILURE);
+		}
+	    }
+	} /* MASTER */
+      else if (task == rank)
+	{
+	  const int nrows = (domain->end - domain->start + 1);
+	  const int elements = (nrows * (grid_dim + 2));
+	  MPI_Send((void *)&buffer[domain->start][0], elements, MPI_MyDatatype, MASTER, 0, comm);
+	}
+    } /* loop over nranks */  
+  
+  /***************************************************************************************************/
+  
+  if (rank == MASTER)
+    free(boundaries);
+  
+  return;
+}
+
+/* ********************************************************************* */
+void WriteSolution(MyData **const phi,
+		   const int      nx,
+		   const int      ny)
+/*
+*********************************************************************** */
+{
+  const int ibeg = NGHOST;    
+  const int jbeg = NGHOST;
+  const int jend = jbeg + ny - 1;
+    
+  static int nfile = 0;  /* File counter */
+
+  char fname[32];
+  sprintf(fname,"jacobi2D_mpi_sendrecv_%02d.bin", nfile);
+    
+  FILE *fp;
+  printf ("> Writing %s\n",fname);
+  fp = fopen(fname, "wb");
+
+  /* discard boundaries */
+  for (int j=jbeg ; j<=jend ; j++)
+    {
+      fwrite (phi[j] + ibeg, sizeof(MyData), nx, fp);
+    }
+    
+  nfile++;
+  fclose(fp);
+}
diff --git a/jacobi/mpi/SendRecv/src/tools.c b/jacobi/mpi/SendRecv/src/tools.c
new file mode 100644
index 0000000000000000000000000000000000000000..be2761d170c23fde4d5d4aeda54af00e56abf3ce
--- /dev/null
+++ b/jacobi/mpi/SendRecv/src/tools.c
@@ -0,0 +1,59 @@
+/* ///////////////////////////////////////////////////////////////////// */
+/* Authors:  A. Mignone (mignone@to.infn.it)                             */
+/*           V. Cesare  (valentina.cesare@inaf.it)                       */
+/*           D. Goz     (david.goz@inaf.it)                              */
+/*                                                                       */
+/* Date   : June 2024                                                    */
+/*                                                                       */
+/* ///////////////////////////////////////////////////////////////////// */
+
+#include "tools.h"
+
+/* ********************************************************************* */
+MyData **Allocate_2DdblArray(const int nx, const int ny)
+/*
+ * Allocate memory for a 2D array of MyData (float or double,
+ * see allvars.h) with nx rows and ny columns
+ *********************************************************************** */
+{
+  MyData **buf = malloc(nx * sizeof(MyData *));
+  assert(buf != NULL);
+
+  buf[0] = (MyData *) malloc(nx * ny * sizeof(MyData));
+  assert(buf[0] != NULL);
+  
+  for (int j=1 ; j<nx ; j++)
+    buf[j] = buf[j-1] + ny;
+    
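+  /* buf[0] owns a single contiguous nx*ny block and buf[1..nx-1] are row
+     pointers into it; this is why callers release the matrix with
+     free(buf[0]) followed by free(buf). */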
+  return buf;
+}
+
+/* ********************************************************************* */
+void Show_2DdblArray(const MyData **const A,
+		     const int            nx,
+		     const int            ny,
+		     const char *const    string)
+/* *********************************************************************** */
+{
+  printf ("%s\n",string);
+  printf ("------------------------------\n");
+  for (int i=0 ; i<nx ; i++)
+    {
+      for (int j=0 ; j<ny ; j++)
+	{
+	  printf ("%8.2f  ", A[i][j]);
+	}
+      printf ("\n");
+    }
+  printf ("------------------------------\n");
+}
+/* ********************************************************************* */
+
+double seconds()
+{
+  struct timeval tmp;
+  gettimeofday(&tmp, (struct timezone *)0);
+  double sec = tmp.tv_sec + ((double)tmp.tv_usec)/1000000.0;
+
+  return sec;
+}
diff --git a/jacobi/mpi/Send_Recv_blocking/Makefile b/jacobi/mpi/Send_Recv_blocking/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..4d38d38eabe9aa2bade6928c4395d23debcbd4e0
--- /dev/null
+++ b/jacobi/mpi/Send_Recv_blocking/Makefile
@@ -0,0 +1,57 @@
+#######################################################################
+# Author: David Goz (david.goz@inaf.it)                               #
+# June  2024                                                          #
+#######################################################################
+#
+# To see all the compilation options
+# $ make info
+#######################################################################
+
+# make.def defines how the application is compiled
+include make.def
+include make_mpi_path
+#######################################################################
+
+.PHONY: info mpi valgrind_memcheck valgrind_callgrind valgrind_cachegrind clean
+
+info:
+	@echo ' '
+	@echo '-----------------------------------------------------------------------------------------'
+	@echo '$$ make mpi                 ---> compile the mpi application                             '
+	@echo '$$ make valgrind_memcheck   ---> run the mpi application using Valgrind under Memcheck   '
+	@echo '$$ make valgrind_callgrind  ---> run the mpi application using Valgrind under Callgrind  '
+	@echo '$$ make valgrind_cachegrind ---> run the mpi application using Valgrind under Cachegrind '
+	@echo '$$ make clean               ---> clean up all                                            '
+	@echo '$$ make info                ---> get make info                                           '
+	@echo '-----------------------------------------------------------------------------------------'
+	@echo ' '
+
+mpi: $(PROG)
+
+valgrind_memcheck: $(PROG_MEMCHECK)
+	@echo 'oooOOO... valgrind_memcheck ...OOOooo'
+	mpirun -n 2 valgrind --tool=memcheck -s --default-suppressions=yes --log-file=valgrind_memcheck_log_%p.txt ./$< 5 5
+	@echo 'oooOOO... valgrind_memcheck ...OOOooo'
+
+valgrind_callgrind: $(PROG_CALLGRIND)
+	@echo 'oooOOO... valgrind_callgrind ...OOOooo'
+	mpirun -n 2 valgrind --tool=callgrind --dump-instr=yes --collect-jumps=yes --log-file=valgrind_callgrind_log_.%p.txt ./$< 128 128
+	@echo ' '
+	@echo 'To generate a function-by-function summary from the profile data file:'
+	@echo '$$ callgrind_annotate --auto=yes callgrind.out.<pid> | less'
+	@echo '(kcachegrind is required in order to visualize the output using the GUI)'
+
+valgrind_cachegrind: $(PROG_CACHEGRIND)
+	@echo 'oooOOO... valgrind_cachegrind ...OOOooo'
+	mpirun -n 2 valgrind --tool=cachegrind --log-file=valgrind_cachegrind_log_.%p.txt ./$< 128 128
+	@echo '$$ cg_annotate --auto=yes cachegrind.out.<pid> | less'
+	@echo '(kcachegrind is required in order to visualize the output using the GUI)'
+	@echo 'oooOOO... valgrind_cachegrind ...OOOooo'
+
+clean:
+	rm -f *~ .*~ ./src/*~ ./src/*# ./include/*~ ./include/*#
+	rm -f $(PROG) $(PROG_DEBUG) $(PROG_MEMCHECK) $(PROG_CALLGRIND) $(PROG_CACHEGRIND)
+	rm -f valgrind_*.txt
+	rm -f cachegrind.out.*
+	rm -f callgrind.*
+	rm -f *bin
diff --git a/jacobi/mpi/Send_Recv_blocking/include/allvars.h b/jacobi/mpi/Send_Recv_blocking/include/allvars.h
new file mode 100644
index 0000000000000000000000000000000000000000..798e4e6a2f2a740b3ab5a1394ac85eef8da1255c
--- /dev/null
+++ b/jacobi/mpi/Send_Recv_blocking/include/allvars.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#define NGHOST   1
+#define NDIM     2 /* 2D */
+#define X        0
+#define Y        1
+#define TOL      1.e-5
+
+#include <mpi.h>
+
+#define MASTER 0
+
+#if defined(SINGLE_PRECISION)
+typedef float MyData;
+#define MPI_MyDatatype MPI_FLOAT
+#else
+/* double precision is assumed by default */
+typedef double MyData;
+#define MPI_MyDatatype MPI_DOUBLE
+#endif /* SINGLE_PRECISION */
diff --git a/jacobi/mpi/Send_Recv_blocking/include/tools.h b/jacobi/mpi/Send_Recv_blocking/include/tools.h
new file mode 100644
index 0000000000000000000000000000000000000000..82859ec511710474547355ede3fc38cf33a92154
--- /dev/null
+++ b/jacobi/mpi/Send_Recv_blocking/include/tools.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "allvars.h"
+#include <assert.h>
+#include <sys/time.h>
+#include <time.h>
+#include <sys/stat.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/* function prototypes */
+MyData **Allocate_2DdblArray(const int nx, const int ny);
+void Show_2DdblArray(const MyData **const A,
+                     const int            nx,
+                     const int            ny,
+                     const char *const    string);
+
+double seconds(void);
diff --git a/jacobi/mpi/Send_Recv_blocking/make.def b/jacobi/mpi/Send_Recv_blocking/make.def
new file mode 100644
index 0000000000000000000000000000000000000000..27810e9c27a0cb14d1f42186b83e983f3beea667
--- /dev/null
+++ b/jacobi/mpi/Send_Recv_blocking/make.def
@@ -0,0 +1,45 @@
+CC     ?= gcc
+CFLAGS ?= -Wall -Wextra -march=native
+LIBS   ?= -lm -lmpi
+
+SYSTYPE = $(strip $(shell uname -n))
+
+PROG           ?= jacobi_mpi_Send_Recv_blocking_$(SYSTYPE)
+PROG_DEBUG      = $(PROG)_DEBUG
+PROG_MEMCHECK   = $(PROG)_MEMCHECK
+PROG_CALLGRIND  = $(PROG)_CALLGRIND
+PROG_CACHEGRIND = $(PROG)_CACHEGRIND
+
+HEADERS       = $(wildcard ./include/*.h)
+SOURCES       = $(wildcard ./src/*.c)
+DEPENDENCIES  = $(SOURCES) $(HEADERS) Makefile
+
+$(PROG): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -O3 -I./include -I$(MPI_INC) $(SOURCES) -o $@ -L$(MPI_LIB) $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_DEBUG): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -Og -ggdb3 -fno-omit-frame-pointer -I./include -I$(MPI_INC) $(SOURCES) -o $@ -L$(MPI_LIB) $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_DEBUG) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_MEMCHECK): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -Og -I./include -I$(MPI_INC) $(SOURCES) -o $@ -L$(MPI_LIB) $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_MEMCHECK) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_CALLGRIND): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -g -O3 -I./include -I$(MPI_INC) $(SOURCES) -o $@ -L$(MPI_LIB) $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_CALLGRIND) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_CACHEGRIND): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -g -O3 -I./include -I$(MPI_INC) $(SOURCES) -o $@ -L$(MPI_LIB) $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_CACHEGRIND) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
diff --git a/jacobi/mpi/Send_Recv_blocking/make_mpi_path b/jacobi/mpi/Send_Recv_blocking/make_mpi_path
new file mode 100644
index 0000000000000000000000000000000000000000..f2c3de9e0d1796034a518159f4c53a40e76919fc
--- /dev/null
+++ b/jacobi/mpi/Send_Recv_blocking/make_mpi_path
@@ -0,0 +1,9 @@
+# set the MPI install path
+
+MPI_INSTALL_PATH = /home/gozzilla/software/openmpi/openmpi-5.0.3
+
+
+
+MPI        = $(MPI_INSTALL_PATH)
+MPI_INC    = $(MPI)/include
+MPI_LIB    = $(MPI)/lib
\ No newline at end of file
diff --git a/jacobi/mpi/Send_Recv_blocking/src/jacobi_2D_mpi_send_recv_blocking.c b/jacobi/mpi/Send_Recv_blocking/src/jacobi_2D_mpi_send_recv_blocking.c
new file mode 100644
index 0000000000000000000000000000000000000000..936e6636aacf3925d610c11012c1122c13a476c4
--- /dev/null
+++ b/jacobi/mpi/Send_Recv_blocking/src/jacobi_2D_mpi_send_recv_blocking.c
@@ -0,0 +1,515 @@
+/* ///////////////////////////////////////////////////////////////////// */
+/* Authors:  A. Mignone (mignone@to.infn.it)                             */
+/*           V. Cesare  (valentina.cesare@inaf.it)                       */
+/*           D. Goz     (david.goz@inaf.it)                              */
+/*                                                                       */
+/* Date   : June 2024                                                    */
+/*                                                                       */
+/* ///////////////////////////////////////////////////////////////////// */
+
+#include "allvars.h"
+#include "tools.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+/* #define DEBUG */
+
+typedef struct MyRows
+{
+  int start;
+  int end;
+} myDomain;
+
+/* function prototypes */
+void BoundaryConditions(MyData **const restrict,
+			MyData  *const restrict,
+			MyData  *const restrict,
+			const int,
+			const int);
+
+void JacobiAlgorithm(MyData **const restrict, MyData **const restrict, MyData *const restrict,
+		     const MyData *const restrict, const int, const int, const int, const int);
+
+void get_domains(MyData  **const buffer,
+		 myDomain *const domain,
+		 const int       grid_dim,
+		 const int       rank,
+		 const int       nranks,
+		 const MPI_Comm  comm);
+
+void mpi_exchange_1d(MyData **const buffer,
+		     const int      n,
+		     const int      nbrtop,
+		     const int      nbrbottom,
+		     const int      start,
+		     const int      end,
+		     const MPI_Comm comm1d);
+
+void WriteSolution(MyData **const phi, const int nx, const int ny);
+
+void copy_grids(MyData **const restrict A,
+                MyData **const restrict B,
+                const int               xbeg,
+                const int               xend,
+                const int               ybeg,
+		const int               yend);
+
+int main(int argc, char **argv)
+{
+  int rank, Nranks;
+  const MPI_Comm comm = MPI_COMM_WORLD;
+  MPI_Init(&argc, &argv);
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &Nranks);
+
+  if (argc <= 1)
+    {
+      if (!rank)
+	printf("\n\t Usage: <executable> <grid_size> \n\n");
+      MPI_Abort(comm, EXIT_FAILURE);
+      exit(EXIT_FAILURE);
+    }
+
+  /* global X and Y grid size (square matrix supposed) */
+  const int NX_GLOB = (int) strtol(argv[1], NULL, 10);
+  const int NY_GLOB = NX_GLOB;
+
+  /******************************************************************************************************/
+  /* 1D MPI-decomposition:
+     the physical domain is sliced into slabs along the vertical direction,
+     while the grids are replicated across the MPI processes.
+     This approach is not the most efficient in terms of memory usage,
+     because the arrays are replicated across the MPI processes instead of being distributed */
+
+  /* get the remainder, i.e. account for an uneven
+     decomposition of points among the processes   */
+  const int rem = (NX_GLOB + 2) % Nranks;
+
+  /* get the amount of data for each MPI process:
+     - chunk is supposed to be >= 1 */
+  const int chunk = (NX_GLOB + 2 - rem) / Nranks;
+  if (chunk < 1)
+    {
+      printf("\n\t chunk < 1 ... aborting ...[rank: %d]\n", rank);
+      MPI_Abort(comm, EXIT_FAILURE);
+      exit(EXIT_FAILURE);
+    }
+  
+  /* get the slab dimension along vertical direction: */
+  int incr, offset;
+  if (rank < rem)
+    {
+      incr = chunk + 1;
+      offset = 0;
+    }
+  else
+    {
+      incr = chunk;
+      offset = rem;
+    }
+
+  myDomain domain;
+  domain.start = ((rank * incr) + offset);
+  domain.end   = (domain.start + incr) - 1;
+  /* boundaries */
+  domain.start = ((domain.start == 0)         ? NGHOST  : domain.start);
+  domain.end   = ((domain.end == NX_GLOB + 1) ? NX_GLOB : domain.end);
+
+  /* rank of the top process */
+  const int nbrtop    = (((rank + 1) >= Nranks) ? MPI_PROC_NULL : (rank + 1));
+  /* rank of the bottom process */
+  const int nbrbottom = (((rank - 1) < 0)       ? MPI_PROC_NULL : (rank - 1));
+
+  /******************************************************************************************************/
+  
+  const MyData xbeg = 0.0;
+  const MyData xend = 1.0;
+  const MyData ybeg = 0.0;
+  const MyData yend = 1.0;
+
+  const MyData delta[NDIM] = {(xend - xbeg)/(NX_GLOB + 1),
+			      (yend - ybeg)/(NY_GLOB + 1)};
+
+  /* --------------------------------------------------------
+     1. Set grid indices
+     -------------------------------------------------------- */
+    
+  const int ibeg   = NGHOST;
+  const int iend   = ibeg + NX_GLOB - 1;
+  const int nx     = iend - ibeg + 1;
+  const int nx_tot = nx + 2 * NGHOST;
+    
+  const int jbeg   = NGHOST;
+  const int jend   = jbeg + NY_GLOB - 1;
+  const int ny     = jend - jbeg + 1;
+  const int ny_tot = ny + 2 * NGHOST;
+
+  if (rank == MASTER)
+    {
+      printf("\n\t Grid indices:");
+      printf("\n\t\t ibeg, iend = %d, %d; nx_tot = %d"    ,ibeg, iend, nx_tot);
+      printf("\n\t\t jbeg, jend = %d, %d; ny_tot = %d\n\n",jbeg, jend, ny_tot);
+    }
+
+#if defined(DEBUG)
+  for (int task=0 ; task<Nranks ; task++)
+    {
+      MPI_Barrier(comm);
+      
+      if (task == rank)
+	{
+	  printf("\n\t rank: %d", rank);
+	  printf("\n\t\t domain.start: %d - domain.end: %d", domain.start, domain.end);
+	  printf("\n\t\t nbrtop: %d - nbrbottom: %d \n", nbrtop, nbrbottom);
+	  fflush(stdout);
+	}
+    }
+
+  MPI_Barrier(comm);
+#endif /* DEBUG */
+  
+  /* --------------------------------------------------------
+     2. Generate grid, allocate memory
+        Not optimized because the grids are (unnecessarily)
+	replicated across MPI processes
+     -------------------------------------------------------- */
+
+  /* memory allocation */
+  MyData *xg = (MyData *) malloc((NX_GLOB + 2*NGHOST) * sizeof(MyData));
+  MyData *yg = (MyData *) malloc((NY_GLOB + 2*NGHOST) * sizeof(MyData));
+  assert((xg != NULL) && (yg != NULL));
+
+  /* initial conditions */
+  for (int i=0 ; i<(NX_GLOB + 2*NGHOST) ; i++) xg[i] = xbeg + (i - ibeg + 1) * delta[X];
+  for (int j=0 ; j<(NY_GLOB + 2*NGHOST) ; j++) yg[j] = ybeg + (j - jbeg + 1) * delta[Y];
+  MyData *x = xg; /* Global and local grids are the same  */
+  MyData *y = yg; /* for serial version of the code       */
+
+  /* grids memory allocation */
+  MyData **phi  = Allocate_2DdblArray(ny_tot, nx_tot);
+  MyData **phi0 = Allocate_2DdblArray(ny_tot, nx_tot);
+    
+  /* --------------------------------------------------------
+     3. Initialize solution array to 0
+     -------------------------------------------------------- */
+    
+  for (int j=jbeg ; j<=jend ; j++)
+    for (int i=ibeg ; i<=iend ; i++)
+      {
+	phi0[j][i] = 0.0;
+	phi[j][i]  = 0.0;
+      }
+  
+  /* --------------------------------------------------------
+     4. Main iteration cycle
+     -------------------------------------------------------- */
+
+  const double time_start = MPI_Wtime();
+  
+  /* -- 4a. Set boundary conditions first -- */  
+  BoundaryConditions(phi0, x, y, nx, ny);
+  BoundaryConditions(phi, x, y, nx, ny);
+
+  MyData err = 1.0;
+  /* iterations */
+  int k = 0;
+  while (1)
+    {      
+      /* -- 4c. Jacobi's method and residual (interior points) -- */
+      /*       core algorithm                                     */
+
+      err = 0.0;
+      JacobiAlgorithm(phi, phi0, &err, delta,
+		      ibeg, iend, domain.start, domain.end);
+
+      if (!rank)
+	printf("\n\t Iteration = %d - err = %lg\n",k, err);
+
+      /* increase the counter of loop iterations */
+      k++;
+
+      /* get the total error */
+      MyData toterr;
+      /* combines values from all processes and distributes the result back to all processes */
+      MPI_Allreduce(&err, &toterr, 1, MPI_MyDatatype, MPI_SUM, comm);
+      
+      /* check convergence */
+      if (toterr <= TOL)
+	{
+	  /* master task gathers all the domains */
+	  get_domains(phi, &domain, NY_GLOB, rank, Nranks, comm);
+
+	  break;
+	}
+
+      /* -- 4b. MPI communications */
+      mpi_exchange_1d(phi, NX_GLOB,
+		      nbrtop, nbrbottom,
+		      domain.start, domain.end, comm);
+
+      /* swap the pointers */
+      MyData **tmp = phi;
+      phi = phi0;
+      phi0 = tmp;
+    }
+
+  /* master rank writes the solution */
+  if (!rank)
+    {
+      WriteSolution(phi, nx, ny);
+      
+      printf("\n\t NX_GLOB x NY_GLOB = %d x %d\n", NX_GLOB, NY_GLOB);
+      printf("\n\t Time = %lf [s]\n\n", MPI_Wtime() - time_start);
+    }
+
+  // free memory
+  if (phi0)
+    {
+      free(phi0[0]);
+      free(phi0);
+    }
+
+  if (phi)
+    {
+      free(phi[0]);
+      free(phi);
+    }
+
+  if (yg)
+    free(yg);
+  
+  if (xg)
+    free(xg);
+
+  MPI_Finalize();
+  
+  return 0;
+}
+
+/* ********************************************************************* */
+void BoundaryConditions(MyData **const restrict phi,
+			MyData  *const restrict x,
+			MyData  *const restrict y,
+                        const int               nx,
+			const int               ny)
+/*
+*********************************************************************** */
+{
+  const int ibeg = NGHOST;
+  const int iend = ibeg + nx - 1;
+    
+  const int jbeg = NGHOST;
+  const int jend = jbeg + ny - 1;
+
+  int i,j;
+  
+  /* -- Left -- */
+  i = ibeg - 1;
+  for (int j=jbeg ; j<=jend ; j++)
+    phi[j][i] = (1.0 - y[j]);
+    
+  /* -- Right -- */
+  i = iend + 1;
+  for (int j=jbeg ; j<=jend ; j++)
+    phi[j][i] = (y[j] * y[j]);
+    
+  /* -- Bottom -- */    
+  j = jbeg - 1;
+  for (int i=ibeg ; i<=iend ; i++)
+    phi[j][i] = (1.0 - x[i]);
+    
+  /* -- Top -- */
+  j = jend + 1;
+  for (int i=ibeg ; i<=iend ; i++)
+    phi[j][i] = x[i];
+
+  return;
+}
+
+/* ********************************************************************* */
+
+void JacobiAlgorithm(MyData      **const restrict Phi,
+		     MyData      **const restrict Phi0,
+		     MyData       *const restrict error,
+		     const MyData *const restrict delta,
+		     const int                    ibeg,
+		     const int                    iend,
+		     const int                    jbeg,
+		     const int                    jend)
+{
+  *error = 0.0;
+  for (int j=jbeg ; j<=jend ; j++)
+    {
+      for (int i=ibeg ; i<=iend ; i++)
+	{
+	  Phi[j][i] = 0.25 * (Phi0[j][i-1] + Phi0[j][i+1] +
+			      Phi0[j-1][i] + Phi0[j+1][i]);
+                
+	  *error += delta[X] * delta[Y] * fabs(Phi[j][i] - Phi0[j][i]);
+	} /* loop over columns */
+    } /* loop over rows */
+  
+  return;
+}
+
+/* ********************************************************************* */
+void mpi_exchange_1d(MyData **const buffer,
+		     const int      n,
+		     const int      nbrtop,
+		     const int      nbrbottom,
+		     const int      start,
+		     const int      end,
+		     const MPI_Comm comm1d)
+{
+  /* The function is called by each MPI rank
+     - nbrtop is the MPI process with rank + 1
+     - nbrbottom is the MPI process with rank - 1 */
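+
+  /* Every rank posts MPI_Send before its matching MPI_Recv in each stage.
+     The chain is terminated by MPI_PROC_NULL (the topmost rank's send in the
+     first stage and the bottommost rank's send in the second stage return
+     immediately), so the pattern cannot deadlock; with large, unbuffered
+     messages, however, the sends may only complete one rank after another,
+     effectively serializing the exchange, which is what the MPI_Sendrecv and
+     nonblocking variants of this example avoid. */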
+
+  /***************** First communication stage *******************/
+  /* Perform a blocking send to the top (rank+1) process */
+  MPI_Send(&buffer[end][1], n, MPI_MyDatatype, nbrtop, 0, comm1d);
+
+  /* Perform a blocking receive from the bottom (rank-1) process */
+  MPI_Recv(&buffer[start-1][1], n, MPI_MyDatatype,
+	   nbrbottom, 0, comm1d, MPI_STATUS_IGNORE);
+  /***************************************************************/
+
+  /**************** Second communication stage *******************/
+  /* - Perform a blocking send to the bottom (rank-1) process */
+  MPI_Send(&buffer[start][1], n, MPI_MyDatatype, nbrbottom, 1, comm1d);
+
+  /* Perform a blocking receive from the top (rank+1) process */
+  MPI_Recv(&buffer[end+1][1], n, MPI_MyDatatype,
+	   nbrtop, 1, comm1d, MPI_STATUS_IGNORE);
+  /***************************************************************/
+  
+  return;
+}
+
+/* ********************************************************************* */
+
+void get_domains(MyData  **const buffer,
+		 myDomain *const domain,
+		 const int       grid_dim,
+		 const int       rank,
+		 const int       nranks,
+		 const MPI_Comm  comm)
+{
+  /* Master process gathers all the domains */
+  
+  /***************************** get the domain boundaries from each process *************************************/
+  myDomain *boundaries = NULL;
+  if (rank == MASTER)
+    {
+      boundaries = (myDomain *)malloc(nranks * sizeof(*boundaries));
+      if (boundaries == NULL)
+	{
+	  MPI_Abort(comm, EXIT_FAILURE);
+	  exit(EXIT_FAILURE);
+	}
+      boundaries[MASTER].start = domain->start;
+      boundaries[MASTER].end   = domain->end;
+    }
+
+  for (int task=0 ; task<nranks ; task++)
+    {
+      if (rank == MASTER)
+	{
+	  if (task)
+	    {
+	      MPI_Status status;
+	      MPI_Recv((void *)&boundaries[task], sizeof(myDomain), MPI_BYTE, task, 0, comm, &status);
+	      if ((status.MPI_ERROR != MPI_SUCCESS) || (status.MPI_SOURCE != task))
+		{
+		  free(boundaries);
+		  MPI_Abort(comm, EXIT_FAILURE);
+		  exit(EXIT_FAILURE);
+		}
+	    }
+#if defined(DEBUG)
+	  printf("\n\t boundaries[%d].start = %d - boundaries[%d].end = %d",
+		 task, boundaries[task].start, task, boundaries[task].end);
+#endif /* DEBUG */	  
+	} /* MASTER */
+      else if (task == rank)
+	{
+	  MPI_Send((void *)domain, sizeof(myDomain), MPI_BYTE, MASTER, 0, comm);
+	}
+    } /* loop over nranks */
+
+#if defined(DEBUG)
+  if (rank == MASTER)
+    {
+      printf("\n");
+      fflush(stdout);
+    }
+#endif /* DEBUG */
+
+  /**************************************************************************************************/
+
+  /***************************************** get the domain from each process *************************/
+
+  for (int task=0 ; task<nranks ; task++)
+    {
+      if (rank == MASTER)
+	{
+	  if (task)
+	    {
+	      MPI_Status status;
+	      /* number of grid points to receive (including ghost points) */
+	      const int nrows = (boundaries[task].end - boundaries[task].start + 1);
+	      const int elements = (nrows * (grid_dim + 2));
+	      MPI_Recv((void *)&buffer[boundaries[task].start][0], elements, MPI_MyDatatype, task, 0, comm, &status);
+	      if ((status.MPI_ERROR != MPI_SUCCESS) || (status.MPI_SOURCE != task))
+		{
+		  free(boundaries);
+		  MPI_Abort(comm, EXIT_FAILURE);
+		  exit(EXIT_FAILURE);
+		}
+	    }
+	} /* MASTER */
+      else if (task == rank)
+	{
+	  const int nrows = (domain->end - domain->start + 1);
+	  const int elements = (nrows * (grid_dim + 2));
+	  MPI_Send((void *)&buffer[domain->start][0], elements, MPI_MyDatatype, MASTER, 0, comm);
+	}
+    } /* loop over nranks */  
+  
+  /***************************************************************************************************/
+  
+  if (rank == MASTER)
+    free(boundaries);
+  
+  return;
+}
+
+/* ********************************************************************* */
+void WriteSolution(MyData **const phi,
+		   const int      nx,
+		   const int      ny)
+/*
+*********************************************************************** */
+{
+  const int ibeg = NGHOST;    
+  const int jbeg = NGHOST;
+  const int jend = jbeg + ny - 1;
+    
+  static int nfile = 0;  /* File counter */
+
+  char fname[64];
+  snprintf(fname, sizeof(fname), "jacobi2D_mpi_send_recv_blocking_%02d.bin", nfile);
+    
+  FILE *fp;
+  printf ("> Writing %s\n",fname);
+  fp = fopen(fname, "wb");
+
+  /* discard boundaries */
+  for (int j=jbeg ; j<=jend ; j++)
+    {
+      fwrite (phi[j] + ibeg, sizeof(MyData), nx, fp);
+    }
+    
+  nfile++;
+  fclose(fp);
+}
diff --git a/jacobi/mpi/Send_Recv_blocking/src/tools.c b/jacobi/mpi/Send_Recv_blocking/src/tools.c
new file mode 100644
index 0000000000000000000000000000000000000000..be2761d170c23fde4d5d4aeda54af00e56abf3ce
--- /dev/null
+++ b/jacobi/mpi/Send_Recv_blocking/src/tools.c
@@ -0,0 +1,59 @@
+/* ///////////////////////////////////////////////////////////////////// */
+/* Authors:  A. Mignone (mignone@to.infn.it)                             */
+/*           V. Cesare  (valentina.cesare@inaf.it)                       */
+/*           D. Goz     (david.goz@inaf.it)                              */
+/*                                                                       */
+/* Date   : June 2024                                                    */
+/*                                                                       */
+/* ///////////////////////////////////////////////////////////////////// */
+
+#include "tools.h"
+
+/* ********************************************************************* */
+MyData **Allocate_2DdblArray(const int nx, const int ny)
+/*
+ * Allocate memory for a 2D array of MyData (float or double,
+ * see allvars.h) with nx rows and ny columns
+ *********************************************************************** */
+{
+  MyData **buf = malloc(nx * sizeof(MyData *));
+  assert(buf != NULL);
+
+  buf[0] = (MyData *) malloc(nx * ny * sizeof(MyData));
+  assert(buf[0] != NULL);
+  
+  for (int j=1 ; j<nx ; j++)
+    buf[j] = buf[j-1] + ny;
+    
+  return buf;
+}
+
+/* ********************************************************************* */
+void Show_2DdblArray(const MyData **const A,
+		     const int            nx,
+		     const int            ny,
+		     const char *const    string)
+/* *********************************************************************** */
+{
+  printf ("%s\n",string);
+  printf ("------------------------------\n");
+  for (int i=0 ; i<nx ; i++)
+    {
+      for (int j=0 ; j<ny ; j++)
+	{
+	  printf ("%8.2f  ", A[i][j]);
+	}
+      printf ("\n");
+    }
+  printf ("------------------------------\n");
+}
+/* ********************************************************************* */
+
+double seconds()
+{
+  struct timeval tmp;
+  gettimeofday(&tmp, (struct timezone *)0);
+  double sec = tmp.tv_sec + ((double)tmp.tv_usec)/1000000.0;
+
+  return sec;
+}
diff --git a/jacobi/mpi/Send_Recv_nonblocking/Makefile b/jacobi/mpi/Send_Recv_nonblocking/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..4d38d38eabe9aa2bade6928c4395d23debcbd4e0
--- /dev/null
+++ b/jacobi/mpi/Send_Recv_nonblocking/Makefile
@@ -0,0 +1,57 @@
+#######################################################################
+# Author: David Goz (david.goz@inaf.it)                               #
+# June  2024                                                          #
+#######################################################################
+#
+# To see all the compilation options
+# $ make info
+#######################################################################
+
+# make.def defines how the application is compiled
+include make.def
+include make_mpi_path
+#######################################################################
+
+.PHONY: info mpi valgrind_memcheck valgrind_callgrind valgrind_cachegrind clean
+
+info:
+	@echo ' '
+	@echo '-----------------------------------------------------------------------------------------'
+	@echo '$$ make mpi                 ---> compile the mpi application                             '
+	@echo '$$ make valgrind_memcheck   ---> run the mpi application using Valgrind under Memcheck   '
+	@echo '$$ make valgrind_callgrind  ---> run the mpi application using Valgrind under Callgrind  '
+	@echo '$$ make valgrind_cachegrind ---> run the mpi application using Valgrind under Cachegrind '
+	@echo '$$ make clean               ---> clean up all                                            '
+	@echo '$$ make info                ---> get make info                                           '
+	@echo '-----------------------------------------------------------------------------------------'
+	@echo ' '
+
+mpi: $(PROG)
+
+valgrind_memcheck: $(PROG_MEMCHECK)
+	@echo 'oooOOO... valgrind_memcheck ...OOOooo'
+	mpirun -n 2 valgrind --tool=memcheck -s --default-suppressions=yes --log-file=valgrind_memcheck_log_%p.txt ./$< 5 5
+	@echo 'oooOOO... valgrind_memcheck ...OOOooo'
+
+valgrind_callgrind: $(PROG_CALLGRIND)
+	@echo 'oooOOO... valgrind_callgrind ...OOOooo'
+	mpirun -n 2 valgrind --tool=callgrind --dump-instr=yes --collect-jumps=yes --log-file=valgrind_callgrind_log_.%p.txt ./$< 128 128
+	@echo ' '
+	@echo 'To generate a function-by-function summary from the profile data file:'
+	@echo '$$ callgrind_annotate --auto=yes callgrind.out.<pid> | less'
+	@echo '(kcachegrind is required in order to visualize the output using the GUI)'
+
+valgrind_cachegrind: $(PROG_CACHEGRIND)
+	@echo 'oooOOO... valgrind_cachegrind ...OOOooo'
+	mpirun -n 2 valgrind --tool=cachegrind --log-file=valgrind_cachegrind_log_.%p.txt ./$< 128 128
+	@echo '$$ cg_annotate --auto=yes cachegrind.out.<pid> | less'
+	@echo '(kcachegrind is required in order to visualize the output using the GUI)'
+	@echo 'oooOOO... valgrind_cachegrind ...OOOooo'
+
+clean:
+	rm -f *~ .*~ ./src/*~ ./src/*# ./include/*~ ./include/*#
+	rm -f $(PROG) $(PROG_DEBUG) $(PROG_MEMCHECK) $(PROG_CALLGRIND) $(PROG_CACHEGRIND)
+	rm -f valgrind_*.txt
+	rm -f cachegrind.out.*
+	rm -f callgrind.*
+	rm -f *bin
diff --git a/jacobi/mpi/Send_Recv_nonblocking/include/allvars.h b/jacobi/mpi/Send_Recv_nonblocking/include/allvars.h
new file mode 100644
index 0000000000000000000000000000000000000000..798e4e6a2f2a740b3ab5a1394ac85eef8da1255c
--- /dev/null
+++ b/jacobi/mpi/Send_Recv_nonblocking/include/allvars.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#define NGHOST   1
+#define NDIM     2 /* 2D */
+#define X        0
+#define Y        1
+#define TOL      1.e-5
+
+#include <mpi.h>
+
+#define MASTER 0
+
+#if defined(SINGLE_PRECISION)
+typedef float MyData;
+#define MPI_MyDatatype MPI_FLOAT
+#else
+/* double precision is assumed by default */
+typedef double MyData;
+#define MPI_MyDatatype MPI_DOUBLE
+#endif /* SINGLE_PRECISION */
diff --git a/jacobi/mpi/Send_Recv_nonblocking/include/tools.h b/jacobi/mpi/Send_Recv_nonblocking/include/tools.h
new file mode 100644
index 0000000000000000000000000000000000000000..82859ec511710474547355ede3fc38cf33a92154
--- /dev/null
+++ b/jacobi/mpi/Send_Recv_nonblocking/include/tools.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "allvars.h"
+#include <assert.h>
+#include <sys/time.h>
+#include <time.h>
+#include <sys/stat.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/* function prototypes */
+MyData **Allocate_2DdblArray(const int nx, const int ny);
+void Show_2DdblArray(const MyData **const A,
+                     const int            nx,
+                     const int            ny,
+                     const char *const    string);
+
+double seconds(void);
diff --git a/jacobi/mpi/Send_Recv_nonblocking/make.def b/jacobi/mpi/Send_Recv_nonblocking/make.def
new file mode 100644
index 0000000000000000000000000000000000000000..949ebfc52229ca2c73fa1cf642bc1b5219f8dd3b
--- /dev/null
+++ b/jacobi/mpi/Send_Recv_nonblocking/make.def
@@ -0,0 +1,45 @@
+CC     ?= gcc
+CFLAGS ?= -Wall -Wextra -march=native
+LIBS   ?= -lm -lmpi
+
+SYSTYPE = $(strip $(shell uname -n))
+
+PROG           ?= jacobi_mpi_Send_Recv_nonblocking_$(SYSTYPE)
+PROG_DEBUG      = $(PROG)_DEBUG
+PROG_MEMCHECK   = $(PROG)_MEMCHECK
+PROG_CALLGRIND  = $(PROG)_CALLGRIND
+PROG_CACHEGRIND = $(PROG)_CACHEGRIND
+
+HEADERS       = $(wildcard ./include/*.h)
+SOURCES       = $(wildcard ./src/*.c)
+DEPENDENCIES  = $(SOURCES) $(HEADERS) Makefile
+
+$(PROG): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -O3 -I./include -I$(MPI_INC) $(SOURCES) -o $@ -L$(MPI_LIB) $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_DEBUG): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -Og -ggdb3 -fno-omit-frame-pointer -I./include -I$(MPI_INC) $(SOURCES) -o $@ -L$(MPI_LIB) $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_DEBUG) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_MEMCHECK): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -Og -I./include -I$(MPI_INC) $(SOURCES) -o $@ -L$(MPI_LIB) $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_MEMCHECK) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_CALLGRIND): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -g -O3 -I./include -I$(MPI_INC) $(SOURCES) -o $@ -L$(MPI_LIB) $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_CALLGRIND) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_CACHEGRIND): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -g -O3 -I./include -I$(MPI_INC) $(SOURCES) -o $@ -L$(MPI_LIB) $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_CACHEGRIND) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
diff --git a/jacobi/mpi/Send_Recv_nonblocking/make_mpi_path b/jacobi/mpi/Send_Recv_nonblocking/make_mpi_path
new file mode 100644
index 0000000000000000000000000000000000000000..f2c3de9e0d1796034a518159f4c53a40e76919fc
--- /dev/null
+++ b/jacobi/mpi/Send_Recv_nonblocking/make_mpi_path
@@ -0,0 +1,9 @@
+# set the MPI install path
+
+MPI_INSTALL_PATH = /home/gozzilla/software/openmpi/openmpi-5.0.3
+
+
+
+MPI        = $(MPI_INSTALL_PATH)
+MPI_INC    = $(MPI)/include
+MPI_LIB    = $(MPI)/lib
\ No newline at end of file
diff --git a/jacobi/mpi/Send_Recv_nonblocking/src/jacobi_2D_mpi_send_recv_nonblocking.c b/jacobi/mpi/Send_Recv_nonblocking/src/jacobi_2D_mpi_send_recv_nonblocking.c
new file mode 100644
index 0000000000000000000000000000000000000000..8add90d0298826b835d88f10ff2c55518fa37960
--- /dev/null
+++ b/jacobi/mpi/Send_Recv_nonblocking/src/jacobi_2D_mpi_send_recv_nonblocking.c
@@ -0,0 +1,522 @@
+/* ///////////////////////////////////////////////////////////////////// */
+/* Authors:  A. Mignone (mignone@to.infn.it)                             */
+/*           V. Cesare  (valentina.cesare@inaf.it)                       */
+/*           D. Goz     (david.goz@inaf.it)                              */
+/*                                                                       */
+/* Date   : June 2024                                                    */
+/*                                                                       */
+/* ///////////////////////////////////////////////////////////////////// */
+
+#include "allvars.h"
+#include "tools.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+/* #define DEBUG */
+
+typedef struct MyRows
+{
+  int start;
+  int end;
+} myDomain;
+
+/* function prototypes */
+void BoundaryConditions(MyData **const restrict,
+			MyData  *const restrict,
+			MyData  *const restrict,
+			const int,
+			const int);
+
+void JacobiAlgorithm(MyData **const restrict, MyData **const restrict, MyData *const restrict,
+		     const MyData *const restrict, const int, const int, const int, const int);
+
+void get_domains(MyData  **const buffer,
+		 myDomain *const domain,
+		 const int       grid_dim,
+		 const int       rank,
+		 const int       nranks,
+		 const MPI_Comm  comm);
+
+void mpi_exchange_nonblocking_1d(MyData **const buffer,
+				 const int      n,
+				 const int      nbrtop,
+				 const int      nbrbottom,
+				 const int      start,
+				 const int      end,
+				 const MPI_Comm comm1d);
+
+void WriteSolution(MyData **const phi, const int nx, const int ny);
+
+void copy_grids(MyData **const restrict A,
+                MyData **const restrict B,
+                const int               xbeg,
+                const int               xend,
+                const int               ybeg,
+		const int               yend);
+
+int main(int argc, char **argv)
+{
+  int rank, Nranks;
+  const MPI_Comm comm = MPI_COMM_WORLD;
+  MPI_Init(&argc, &argv);
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &Nranks);
+
+  if ((argc <= 1) && !rank)
+    {
+      printf("\n\t Usage: <executable> <grid_size> \n\n");
+      MPI_Abort(comm, EXIT_FAILURE);
+      exit(EXIT_FAILURE);
+    }
+
+  /* global X and Y grid size (square matrix supposed) */
+  const int NX_GLOB = (int) strtol(argv[1], NULL, 10);
+  const int NY_GLOB = NX_GLOB;
+
+  /******************************************************************************************************/
+  /* 1D MPI-decomposition:
+     the physical domain is sliced into slabs along the vertical direction,
+     while the grids are replicated across the MPI processes.
+     This approach is not the most efficient in terms of memory usage,
+     because the arrays are replicated on every MPI process instead of being distributed */
+
+  /* get the remainder, i.e. account for an uneven
+     decomposition of points among the processes    */
+  const int rem = (NX_GLOB + 2) % Nranks;
+
+  /* get the amount of data for each MPI process:
+     - chunk is supposed to be >= 1 */
+  const int chunk = (NX_GLOB + 2 - rem) / Nranks;
+  if (chunk < 1)
+    {
+      printf("\n\t chunk < 1 ... aborting ...[rank: %d]\n", rank);
+      MPI_Abort(comm, EXIT_FAILURE);
+      exit(EXIT_FAILURE);
+    }
+  
+  /* get the slab dimension along vertical direction: */
+  int incr, offset;
+  if (rank < rem)
+    {
+      incr = chunk + 1;
+      offset = 0;
+    }
+  else
+    {
+      incr = chunk;
+      offset = rem;
+    }
+
+  myDomain domain;
+  domain.start = ((rank * incr) + offset);
+  domain.end   = (domain.start + incr) - 1;
+  /* boundaries */
+  domain.start = ((domain.start == 0)         ? NGHOST  : domain.start);
+  domain.end   = ((domain.end == NX_GLOB + 1) ? NX_GLOB : domain.end);
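+
+  /* Illustrative example (values not taken from this run): with NX_GLOB = 10 and
+     3 ranks, the (NX_GLOB + 2) = 12 rows give rem = 0 and chunk = 4, so rank 0 gets
+     rows 0..3, rank 1 rows 4..7 and rank 2 rows 8..11; the clamping above then
+     excludes the global boundary rows (0 and 11), which hold the fixed boundary
+     conditions, leaving ranks 0 and 2 with rows 1..3 and 8..10, respectively.       */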
+
+  /* rank of the top process */
+  const int nbrtop    = (((rank + 1) >= Nranks) ? MPI_PROC_NULL : (rank + 1));
+  /* rank of the bottom process */
+  const int nbrbottom = (((rank - 1) < 0)       ? MPI_PROC_NULL : (rank - 1));
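+
+  /* note: the first and last slabs get MPI_PROC_NULL as their missing neighbor;
+     sends and receives addressed to MPI_PROC_NULL return immediately and have no
+     effect, so the physical boundaries need no special casing in the halo exchange */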
+
+  /******************************************************************************************************/
+  
+  const MyData xbeg = 0.0;
+  const MyData xend = 1.0;
+  const MyData ybeg = 0.0;
+  const MyData yend = 1.0;
+
+  const MyData delta[NDIM] = {(xend - xbeg)/(NX_GLOB + 1),
+			      (yend - ybeg)/(NY_GLOB + 1)};
+
+  /* --------------------------------------------------------
+     1. Set grid indices
+     -------------------------------------------------------- */
+    
+  const int ibeg   = NGHOST;
+  const int iend   = ibeg + NX_GLOB - 1;
+  const int nx     = iend - ibeg + 1;
+  const int nx_tot = nx + 2 * NGHOST;
+    
+  const int jbeg   = NGHOST;
+  const int jend   = jbeg + NY_GLOB - 1;
+  const int ny     = jend - jbeg + 1;
+  const int ny_tot = ny + 2 * NGHOST;
+
+  if (rank == MASTER)
+    {
+      printf("\n\t Grid indices:");
+      printf("\n\t\t ibeg, iend = %d, %d; nx_tot = %d"    ,ibeg, iend, nx_tot);
+      printf("\n\t\t jbeg, jend = %d, %d; ny_tot = %d\n\n",jbeg, jend, ny_tot);
+    }
+
+#if defined(DEBUG)
+  for (int task=0 ; task<Nranks ; task++)
+    {
+      MPI_Barrier(comm);
+      
+      if (task == rank)
+	{
+	  printf("\n\t rank: %d", rank);
+	  printf("\n\t\t domain.start: %d - domain.end: %d", domain.start, domain.end);
+	  printf("\n\t\t nbrtop: %d - nbrbottom: %d \n", nbrtop, nbrbottom);
+	  fflush(stdout);
+	}
+    }
+
+  MPI_Barrier(comm);
+#endif /* DEBUG */
+  
+  /* --------------------------------------------------------
+     2. Generate grid, allocate memory
+        Not optimized because the grids are (unnecessarily)
+	replicated across MPI processes
+     -------------------------------------------------------- */
+
+  /* memory allocation */
+  MyData *xg = (MyData *) malloc((NX_GLOB + 2*NGHOST) * sizeof(MyData));
+  MyData *yg = (MyData *) malloc((NY_GLOB + 2*NGHOST) * sizeof(MyData));
+  assert((xg != NULL) && (yg != NULL));
+
+  /* grid coordinates */
+  for (int i=0 ; i<(NX_GLOB + 2*NGHOST) ; i++) xg[i] = xbeg + (i - ibeg + 1) * delta[X];
+  for (int j=0 ; j<(NY_GLOB + 2*NGHOST) ; j++) yg[j] = ybeg + (j - jbeg + 1) * delta[Y];
+  MyData *x = xg; /* Global and local grids are the same  */
+  MyData *y = yg; /* for serial version of the code       */
+
+  /* grids memory allocation */
+  MyData **phi  = Allocate_2DdblArray(ny_tot, nx_tot);
+  MyData **phi0 = Allocate_2DdblArray(ny_tot, nx_tot);
+    
+  /* --------------------------------------------------------
+     3. Initialize solution array to 0
+     -------------------------------------------------------- */
+    
+  for (int j=jbeg ; j<=jend ; j++)
+    for (int i=ibeg ; i<=iend ; i++)
+      {
+	phi0[j][i] = 0.0;
+	phi[j][i]  = 0.0;
+      }
+  
+  /* --------------------------------------------------------
+     4. Main iteration cycle
+     -------------------------------------------------------- */
+
+  const double time_start = MPI_Wtime();
+  
+  /* -- 4a. Set boundary conditions first -- */  
+  BoundaryConditions(phi0, x, y, nx, ny);
+  BoundaryConditions(phi, x, y, nx, ny);
+
+  MyData err = 1.0;
+  /* iterations */
+  int k = 0;
+  while (1)
+    {      
+      /* -- 4c. Jacobi's method and residual (interior points) -- */
+      /*       core algorithm                                     */
+
+      err = 0.0;
+      JacobiAlgorithm(phi, phi0, &err, delta,
+		      ibeg, iend, domain.start, domain.end);
+
+      if (!rank)
+	printf("\n\t Iteration = %d - err = %lg\n",k, err);
+
+      /* increase the counter of loop iterations */
+      k++;
+
+      /* get the total error */
+      MyData toterr;
+      /* combines values from all processes and distributes the result back to all processes */
+      MPI_Allreduce(&err, &toterr, 1, MPI_MyDatatype, MPI_SUM, comm);
+      
+      /* check convergence */
+      if (toterr <= TOL)
+	{
+	  /* master task gathers all the domains */
+	  get_domains(phi, &domain, NY_GLOB, rank, Nranks, comm);
+
+	  break;
+	}
+
+      /* -- 4b. MPI communications */
+      mpi_exchange_nonblocking_1d(phi, NX_GLOB,
+				  nbrtop, nbrbottom,
+				  domain.start, domain.end, comm);
+
+      /* swap the pointers */
+      MyData **tmp = phi;
+      phi = phi0;
+      phi0 = tmp;
+    }
+
+  /* master rank writes the solution */
+  if (!rank)
+    {
+      WriteSolution(phi, nx, ny);
+      
+      printf("\n\t NX_GLOB x NY_GLOB = %d x %d\n", NX_GLOB, NY_GLOB);
+      printf("\n\t Time = %lf [s]\n\n", MPI_Wtime() - time_start);
+    }
+
+  // free memory
+  if (phi0)
+    {
+      free(phi0[0]);
+      free(phi0);
+    }
+
+  if (phi)
+    {
+      free(phi[0]);
+      free(phi);
+    }
+
+  if (yg)
+    free(yg);
+  
+  if (xg)
+    free(xg);
+
+  MPI_Finalize();
+  
+  return 0;
+}
+
+/* ********************************************************************* */
+void BoundaryConditions(MyData **const restrict phi,
+			MyData  *const restrict x,
+			MyData  *const restrict y,
+                        const int               nx,
+			const int               ny)
+/*
+*********************************************************************** */
+{
+  const int ibeg = NGHOST;
+  const int iend = ibeg + nx - 1;
+    
+  const int jbeg = NGHOST;
+  const int jend = jbeg + ny - 1;
+
+  int i,j;
+  
+  /* -- Left -- */
+  i = ibeg - 1;
+  for (int j=jbeg ; j<=jend ; j++)
+    phi[j][i] = (1.0 - y[j]);
+    
+  /* -- Right -- */
+  i = iend + 1;
+  for (int j=jbeg ; j<=jend ; j++)
+    phi[j][i] = (y[j] * y[j]);
+    
+  /* -- Bottom -- */    
+  j = jbeg - 1;
+  for (int i=ibeg ; i<=iend ; i++)
+    phi[j][i] = (1.0 - x[i]);
+    
+  /* -- Top -- */
+  j = jend + 1;
+  for (int i=ibeg ; i<=iend ; i++)
+    phi[j][i] = x[i];
+
+  return;
+}
+
+/* ********************************************************************* */
+
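+/* Jacobi update for the 2D Laplace equation: each interior point of Phi is set to the
+   average of its four neighbours in Phi0 (the previous iterate); *error accumulates
+   the cell-area-weighted absolute change, used as the convergence measure             */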
+void JacobiAlgorithm(MyData      **const restrict Phi,
+		     MyData      **const restrict Phi0,
+		     MyData       *const restrict error,
+		     const MyData *const restrict delta,
+		     const int                    ibeg,
+		     const int                    iend,
+		     const int                    jbeg,
+		     const int                    jend)
+{
+  *error = 0.0;
+  for (int j=jbeg ; j<=jend ; j++)
+    {
+      for (int i=ibeg ; i<=iend ; i++)
+	{
+	  Phi[j][i] = 0.25 * (Phi0[j][i-1] + Phi0[j][i+1] +
+			      Phi0[j-1][i] + Phi0[j+1][i]);
+                
+	  *error += delta[X] * delta[Y] * fabs(Phi[j][i] - Phi0[j][i]);
+	} /* loop over columns */
+    } /* loop over rows */
+  
+  return;
+}
+
+/* ********************************************************************* */
+void mpi_exchange_nonblocking_1d(MyData **const buffer,
+				 const int      n,
+				 const int      nbrtop,
+				 const int      nbrbottom,
+				 const int      start,
+				 const int      end,
+				 const MPI_Comm comm1d)
+{
+  /* The function is called by each MPI rank
+     - nbrtop is the MPI process with rank + 1
+     - nbrbottom is the MPI process with rank - 1 */
+
+  /* communication request (handle) */
+  MPI_Request request[4];
+
+  /************************* Process is ready to receive data ****************************/
+  /* Perform a nonblocking receive from the bottom (rank-1) process                      */
+  MPI_Irecv(&buffer[start-1][1], n, MPI_MyDatatype, nbrbottom, 0, comm1d, &request[0]);
+
+  /* Perform a nonblocking receive from the top (rank+1) process                         */
+  MPI_Irecv(&buffer[end+1][1], n, MPI_MyDatatype, nbrtop, 1, comm1d, &request[1]);
+  /***************************************************************************************/
+
+  /*************** Process begins to send data *******************************************/
+  /* Perform a nonblocking send to the top (rank+1) process                              */
+  MPI_Isend(&buffer[end][1], n, MPI_MyDatatype, nbrtop, 0, comm1d, &request[2]);
+
+  /* - Perform a nonblocking send to the bottom (rank-1) process                         */
+  MPI_Isend(&buffer[start][1], n, MPI_MyDatatype, nbrbottom, 1, comm1d, &request[3]);
+  /***************************************************************************************/
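+
+  /* tag pairing: tag 0 matches the send towards the top neighbor with that neighbor's
+     receive from the bottom, while tag 1 matches the send towards the bottom neighbor
+     with that neighbor's receive from the top                                          */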
+
+  /* useful work (computation) could be performed here to overlap with the communication, */
+  /* as long as the exchanged buffers are not modified before the requests complete       */
+
+  /******************** Waits for all given MPI Requests to complete *********************/
+  MPI_Waitall(4, request, MPI_STATUSES_IGNORE);
+  
+  return;
+}
+
+/* ********************************************************************* */
+
+void get_domains(MyData  **const buffer,
+		 myDomain *const domain,
+		 const int       grid_dim,
+		 const int       rank,
+		 const int       nranks,
+		 const MPI_Comm  comm)
+{
+  /* Master process gathers all the domains */
+  
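+  /* The gather proceeds in two stages: (1) each non-master rank sends its own
+     [start, end] row range to the master; (2) the master receives the corresponding
+     rows of the grid from every rank directly into its fully allocated array.        */
+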
+  /***************************** get the domain boundaries from each process *************************************/
+  myDomain *boundaries = NULL;
+  if (rank == MASTER)
+    {
+      boundaries = (myDomain *)malloc(nranks * sizeof(*boundaries));
+      if (boundaries == NULL)
+	{
+	  MPI_Abort(comm, EXIT_FAILURE);
+	  exit(EXIT_FAILURE);
+	}
+      boundaries[MASTER].start = domain->start;
+      boundaries[MASTER].end   = domain->end;
+    }
+
+  for (int task=0 ; task<nranks ; task++)
+    {
+      if (rank == MASTER)
+	{
+	  if (task)
+	    {
+	      MPI_Status status;
+	      MPI_Recv((void *)&boundaries[task], sizeof(myDomain), MPI_BYTE, task, 0, comm, &status);
+	      if ((status.MPI_ERROR != MPI_SUCCESS) || (status.MPI_SOURCE != task))
+		{
+		  free(boundaries);
+		  MPI_Abort(comm, EXIT_FAILURE);
+		  exit(EXIT_FAILURE);
+		}
+	    }
+#if defined(DEBUG)
+	  printf("\n\t boundaries[%d].start = %d - boundaries[%d].end = %d",
+		 task, boundaries[task].start, task, boundaries[task].end);
+#endif /* DEBUG */	  
+	} /* MASTER */
+      else if (task == rank)
+	{
+	  MPI_Send((void *)domain, sizeof(myDomain), MPI_BYTE, MASTER, 0, comm);
+	}
+    } /* loop over nranks */
+
+#if defined(DEBUG)
+  if (rank == MASTER)
+    {
+      printf("\n");
+      fflush(stdout);
+    }
+#endif /* DEBUG */
+
+  /**************************************************************************************************/
+
+  /***************************************** get the domain from each process *************************/
+
+  for (int task=0 ; task<nranks ; task++)
+    {
+      if (rank == MASTER)
+	{
+	  if (task)
+	    {
+	      MPI_Status status;
+	      /* number of grid points to receive (including ghost points) */
+	      const int nrows = (boundaries[task].end - boundaries[task].start + 1);
+	      const int elements = (nrows * (grid_dim + 2));
+	      MPI_Recv((void *)&buffer[boundaries[task].start][0], elements, MPI_MyDatatype, task, 0, comm, &status);
+	      if ((status.MPI_ERROR != MPI_SUCCESS) || (status.MPI_SOURCE != task))
+		{
+		  free(boundaries);
+		  MPI_Abort(comm, EXIT_FAILURE);
+		  exit(EXIT_FAILURE);
+		}
+	    }
+	} /* MASTER */
+      else if (task == rank)
+	{
+	  const int nrows = (domain->end - domain->start + 1);
+	  const int elements = (nrows * (grid_dim + 2));
+	  MPI_Send((void *)&buffer[domain->start][0], elements, MPI_MyDatatype, MASTER, 0, comm);
+	}
+    } /* loop over nranks */  
+  
+  /***************************************************************************************************/
+  
+  if (rank == MASTER)
+    free(boundaries);
+  
+  return;
+}
+
+/* ********************************************************************* */
+void WriteSolution(MyData **const phi,
+		   const int      nx,
+		   const int      ny)
+/*
+*********************************************************************** */
+{
+  const int ibeg = NGHOST;    
+  const int jbeg = NGHOST;
+  const int jend = jbeg + ny - 1;
+    
+  static int nfile = 0;  /* File counter */
+
+  char fname[128];
+  sprintf(fname,"jacobi2D_mpi_send_recv_nonblocking_%02d.bin", nfile);
+    
+  FILE *fp;
+  printf ("> Writing %s\n",fname);
+  fp = fopen(fname, "wb");
+
+  /* discard boundaries */
+  for (int j=jbeg ; j<=jend ; j++)
+    {
+      fwrite (phi[j] + ibeg, sizeof(MyData), nx, fp);
+    }
+    
+  nfile++;
+  fclose(fp);
+}
diff --git a/jacobi/mpi/Send_Recv_nonblocking/src/tools.c b/jacobi/mpi/Send_Recv_nonblocking/src/tools.c
new file mode 100644
index 0000000000000000000000000000000000000000..be2761d170c23fde4d5d4aeda54af00e56abf3ce
--- /dev/null
+++ b/jacobi/mpi/Send_Recv_nonblocking/src/tools.c
@@ -0,0 +1,59 @@
+/* ///////////////////////////////////////////////////////////////////// */
+/* Authors:  A. Mignone (mignone@to.infn.it)                             */
+/*           V. Cesare  (valentina.cesare@inaf.it)                       */
+/*           D. Goz     (david.goz@inaf.it)                              */
+/*                                                                       */
+/* Date   : June 2024                                                    */
+/*                                                                       */
+/* ///////////////////////////////////////////////////////////////////// */
+
+#include "tools.h"
+
+/* ********************************************************************* */
+MyData **Allocate_2DdblArray(const int nx, const int ny)
+/*
+ * Allocate memory for a double precision array with
+ * nx rows and ny columns
+ *********************************************************************** */
+{
+  MyData **buf = malloc(nx * sizeof(MyData *));
+  assert(buf != NULL);
+
+  buf[0] = (MyData *) malloc(nx * ny * sizeof(MyData));
+  assert(buf[0] != NULL);
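+
+  /* the data are stored in a single contiguous block (buf[0]); the row pointers set
+     below simply index into it, so whole sub-grids can be exchanged with one MPI call */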
+  
+  for (int j=1 ; j<nx ; j++)
+    buf[j] = buf[j-1] + ny;
+    
+  return buf;
+}
+
+/* ********************************************************************* */
+void Show_2DdblArray(const MyData **const A,
+		     const int            nx,
+		     const int            ny,
+		     const char *const    string)
+/* *********************************************************************** */
+{
+  printf ("%s\n",string);
+  printf ("------------------------------\n");
+  for (int i=0 ; i<nx ; i++)
+    {
+      for (int j=0 ; j<ny ; j++)
+	{
+	  printf ("%8.2f  ", A[i][j]);
+	}
+      printf ("\n");
+    }
+  printf ("------------------------------\n");
+}
+/* ********************************************************************* */
+
+double seconds()
+{
+  struct timeval tmp;
+  gettimeofday(&tmp, (struct timezone *)0);
+  double sec = tmp.tv_sec + ((double)tmp.tv_usec)/1000000.0;
+
+  return sec;
+}
diff --git a/jacobi/mpi/Send_Recv_paired/Makefile b/jacobi/mpi/Send_Recv_paired/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..4d38d38eabe9aa2bade6928c4395d23debcbd4e0
--- /dev/null
+++ b/jacobi/mpi/Send_Recv_paired/Makefile
@@ -0,0 +1,57 @@
+#######################################################################
+# Author: David Goz (david.goz@inaf.it)                               #
+# June  2024                                                          #
+#######################################################################
+#
+# To see all the compilation options
+# $ make info
+#######################################################################
+
+# make.def defines how the application is compiled
+include make.def
+include make_mpi_path
+#######################################################################
+
+.PHONY: info mpi valgrind_memcheck valgrind_callgrind valgrind_cachegrind clean
+
+info:
+	@echo ' '
+	@echo '-----------------------------------------------------------------------------------------'
+	@echo '$$ make                     ---> compile the mpi application                             '
+	@echo '$$ make valgrind_memcheck   ---> run the mpi application using Valgrind under Memcheck   '
+	@echo '$$ make valgrind_callgrind  ---> run the mpi application using Valgrind under Callgrind  '
+	@echo '$$ make valgrind_cachegrind ---> run the mpi application using Valgrind under Cachegrind '
+	@echo '$$ make clean               ---> clean up all                                            '
+	@echo '$$ make info                ---> get make info                                           '
+	@echo '-----------------------------------------------------------------------------------------'
+	@echo ' '
+
+mpi: $(PROG)
+
+valgrind_memcheck: $(PROG_MEMCHECK)
+	@echo 'oooOOO... valgrind_memcheck ...OOOooo'
+	mpirun -n 2 valgrind --tool=memcheck -s --default-suppressions=yes --log-file=valgrind_memcheck_log_%p.txt ./$< 5 5
+	@echo 'oooOOO... valgrind_memcheck ...OOOooo'
+
+valgrind_callgrind: $(PROG_CALLGRIND)
+	@echo 'oooOOO... valgrind_callgrind ...OOOooo'
+	mpirun -n 2 valgrind --tool=callgrind --dump-instr=yes --collect-jumps=yes --log-file=valgrind_callgrind_log_.%p.txt ./$< 128 128
+	@echo ' '
+	@echo 'To generate a function-by-function summary from the profile data file:'
+	@echo '$$ callgrind_annotate --auto=yes callgrind.out.<pid> | less'
+	@echo '(kcachegrind is required in order to visualize the output using the GUI)'
+
+valgrind_cachegrind: $(PROG_CACHEGRIND)
+	@echo 'oooOOO... valgrind_cachegrind ...OOOooo'
+	mpirun -n 2 valgrind --tool=cachegrind --log-file=valgrind_cachegrind_log_.%p.txt ./$< 128 128
+	@echo '$$ cg_annotate --auto=yes cachegrind.out.<pid> | less'
+	@echo '(kcachegrind is required in order to visualize the output using the GUI)'
+	@echo 'oooOOO... valgrind_cachegrind ...OOOooo'
+
+clean:
+	rm -f *~ .*~ ./src/*~ ./src/*# ./include/*~ ./include/*# *~
+	rm -f $(PROG) $(PROG_DEBUG) $(PROG_MEMCHECK) $(PROG_CALLGRIND) $(PROG_CACHEGRIND)
+	rm -f valgrind_*.txt
+	rm -f cachegrind.out.*
+	rm -f callgrind.*
+	rm -f *bin
diff --git a/jacobi/mpi/Send_Recv_paired/include/allvars.h b/jacobi/mpi/Send_Recv_paired/include/allvars.h
new file mode 100644
index 0000000000000000000000000000000000000000..798e4e6a2f2a740b3ab5a1394ac85eef8da1255c
--- /dev/null
+++ b/jacobi/mpi/Send_Recv_paired/include/allvars.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#define NGHOST   1
+#define NDIM     2 /* 2D */
+#define X        0
+#define Y        1
+#define TOL      1.e-5
+
+#include <mpi.h>
+
+#define MASTER 0
+
+#if defined(SINGLE_PRECISION)
+typedef float MyData;
+#define MPI_MyDatatype MPI_FLOAT
+#else
+/* double precision is assumed by default */
+typedef double MyData;
+#define MPI_MyDatatype MPI_DOUBLE
+#endif /* SINGLE_PRECISION */
diff --git a/jacobi/mpi/Send_Recv_paired/include/tools.h b/jacobi/mpi/Send_Recv_paired/include/tools.h
new file mode 100644
index 0000000000000000000000000000000000000000..82859ec511710474547355ede3fc38cf33a92154
--- /dev/null
+++ b/jacobi/mpi/Send_Recv_paired/include/tools.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "allvars.h"
+#include <assert.h>
+#include <sys/time.h>
+#include <time.h>
+#include <sys/stat.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/* function prototypes */
+MyData **Allocate_2DdblArray(const int nx, const int ny);
+void Show_2DdblArray(const MyData **const A,
+                     const int            nx,
+                     const int            ny,
+                     const char *const    string);
+
+double seconds(void);
diff --git a/jacobi/mpi/Send_Recv_paired/make.def b/jacobi/mpi/Send_Recv_paired/make.def
new file mode 100644
index 0000000000000000000000000000000000000000..3a66a18bfeab4440599ee89539e658d9a376dbf6
--- /dev/null
+++ b/jacobi/mpi/Send_Recv_paired/make.def
@@ -0,0 +1,45 @@
+CC     ?= gcc
+CFLAGS ?= -Wall -Wextra -march=native
+LIBS   ?= -lm -lmpi
+
+SYSTYPE = $(strip $(shell uname -n))
+
+PROG           ?= jacobi_mpi_Send_Recv_paired_$(SYSTYPE)
+PROG_DEBUG      = $(PROG)_DEBUG
+PROG_MEMCHECK   = $(PROG)_MEMCHECK
+PROG_CALLGRIND  = $(PROG)_CALLGRIND
+PROG_CACHEGRIND = $(PROG)_CACHEGRIND
+
+HEADERS       = $(wildcard ./include/*.h)
+SOURCES       = $(wildcard ./src/*.c)
+DEPENDENCIES  = $(SOURCES) $(HEADERS) Makefile
+
+$(PROG): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -O3 -I./include -I$(MPI_INC) $(SOURCES) -o $@ -L$(MPI_LIB) $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_DEBUG): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -Og -ggdb3 -fno-omit-frame-pointer -I./include -I$(MPI_INC) $(SOURCES) -o $@ -L$(MPI_LIB) $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_DEBUG) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_MEMCHECK): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -Og -I./include -I$(MPI_INC) $(SOURCES) -o $@ -L$(MPI_LIB) $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_MEMCHECK) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_CALLGRIND): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -g -O3 -I./include -I$(MPI_INC) $(SOURCES) -o $@ -L$(MPI_LIB) $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_CALLGRIND) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_CACHEGRIND): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -g -O3 -I./include -I$(MPI_INC) $(SOURCES) -o $@ -L$(MPI_LIB) $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_CACHEGRIND) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
diff --git a/jacobi/mpi/Send_Recv_paired/make_mpi_path b/jacobi/mpi/Send_Recv_paired/make_mpi_path
new file mode 100644
index 0000000000000000000000000000000000000000..f2c3de9e0d1796034a518159f4c53a40e76919fc
--- /dev/null
+++ b/jacobi/mpi/Send_Recv_paired/make_mpi_path
@@ -0,0 +1,9 @@
+# set the MPI install path
+
+MPI_INSTALL_PATH = /home/gozzilla/software/openmpi/openmpi-5.0.3
+
+
+
+MPI        = $(MPI_INSTALL_PATH)
+MPI_INC    = $(MPI)/include
+MPI_LIB    = $(MPI)/lib
\ No newline at end of file
diff --git a/jacobi/mpi/Send_Recv_paired/src/jacobi_2D_mpi_send_recv_paired.c b/jacobi/mpi/Send_Recv_paired/src/jacobi_2D_mpi_send_recv_paired.c
new file mode 100644
index 0000000000000000000000000000000000000000..c78c890e39e8738b54a6d54025b607eefd24b3f5
--- /dev/null
+++ b/jacobi/mpi/Send_Recv_paired/src/jacobi_2D_mpi_send_recv_paired.c
@@ -0,0 +1,537 @@
+/* ///////////////////////////////////////////////////////////////////// */
+/* Authors:  A. Mignone (mignone@to.infn.it)                             */
+/*           V. Cesare  (valentina.cesare@inaf.it)                       */
+/*           D. Goz     (david.goz@inaf.it)                              */
+/*                                                                       */
+/* Date   : June 2024                                                    */
+/*                                                                       */
+/* ///////////////////////////////////////////////////////////////////// */
+
+#include "allvars.h"
+#include "tools.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+/* #define DEBUG */
+
+typedef struct MyRows
+{
+  int start;
+  int end;
+} myDomain;
+
+/* function prototypes */
+void BoundaryConditions(MyData **const restrict,
+			MyData  *const restrict,
+			MyData  *const restrict,
+			const int,
+			const int);
+
+void JacobiAlgorithm(MyData **const restrict, MyData **const restrict, MyData *const restrict,
+		     const MyData *const restrict, const int, const int, const int, const int);
+
+void get_domains(MyData  **const buffer,
+		 myDomain *const domain,
+		 const int       grid_dim,
+		 const int       rank,
+		 const int       nranks,
+		 const MPI_Comm  comm);
+
+void mpi_exchange_paired_1d(MyData **const buffer,
+			    const int      n,
+			    const int      nbrtop,
+			    const int      nbrbottom,
+			    const int      start,
+			    const int      end,
+			    const MPI_Comm comm1d);
+
+void WriteSolution(MyData **const phi, const int nx, const int ny);
+
+void copy_grids(MyData **const restrict A,
+                MyData **const restrict B,
+                const int               xbeg,
+                const int               xend,
+                const int               ybeg,
+		const int               yend);
+
+int main(int argc, char **argv)
+{
+  int rank, Nranks;
+  const MPI_Comm comm = MPI_COMM_WORLD;
+  MPI_Init(&argc, &argv);
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &Nranks);
+
+  if ((argc <= 1) && !rank)
+    {
+      printf("\n\t Usage: <executable> <grid_size> \n\n");
+      MPI_Abort(comm, EXIT_FAILURE);
+      exit(EXIT_FAILURE);
+    }
+
+  /* global X and Y grid size (square matrix supposed) */
+  const int NX_GLOB = (int) strtol(argv[1], NULL, 10);
+  const int NY_GLOB = NX_GLOB;
+
+  /******************************************************************************************************/
+  /* 1D MPI-decomposition:
+     the physical domain is sliced into slabs along the vertical direction,
+     while the grids are replicated across the MPI processes.
+     This approach is not the most efficient in terms of memory usage,
+     because the arrays are replicated on every MPI process instead of being distributed */
+
+  /* get the remainder, i.e. account for an uneven
+     decomposition of points among the processes    */
+  const int rem = (NX_GLOB + 2) % Nranks;
+
+  /* get the amount of data for each MPI process:
+     - chunk is supposed to be >= 1 */
+  const int chunk = (NX_GLOB + 2 - rem) / Nranks;
+  if (chunk < 1)
+    {
+      printf("\n\t chunk < 1 ... aborting ...[rank: %d]\n", rank);
+      MPI_Abort(comm, EXIT_FAILURE);
+      exit(EXIT_FAILURE);
+    }
+  
+  /* get the slab dimension along vertical direction: */
+  int incr, offset;
+  if (rank < rem)
+    {
+      incr = chunk + 1;
+      offset = 0;
+    }
+  else
+    {
+      incr = chunk;
+      offset = rem;
+    }
+
+  myDomain domain;
+  domain.start = ((rank * incr) + offset);
+  domain.end   = (domain.start + incr) - 1;
+  /* boundaries */
+  domain.start = ((domain.start == 0)         ? NGHOST  : domain.start);
+  domain.end   = ((domain.end == NX_GLOB + 1) ? NX_GLOB : domain.end);
+
+  /* rank of the top process */
+  const int nbrtop    = (((rank + 1) >= Nranks) ? MPI_PROC_NULL : (rank + 1));
+  /* rank of the bottom process */
+  const int nbrbottom = (((rank - 1) < 0)       ? MPI_PROC_NULL : (rank - 1));
+
+  /******************************************************************************************************/
+  
+  const MyData xbeg = 0.0;
+  const MyData xend = 1.0;
+  const MyData ybeg = 0.0;
+  const MyData yend = 1.0;
+
+  const MyData delta[NDIM] = {(xend - xbeg)/(NX_GLOB + 1),
+			      (yend - ybeg)/(NY_GLOB + 1)};
+
+  /* --------------------------------------------------------
+     1. Set grid indices
+     -------------------------------------------------------- */
+    
+  const int ibeg   = NGHOST;
+  const int iend   = ibeg + NX_GLOB - 1;
+  const int nx     = iend - ibeg + 1;
+  const int nx_tot = nx + 2 * NGHOST;
+    
+  const int jbeg   = NGHOST;
+  const int jend   = jbeg + NY_GLOB - 1;
+  const int ny     = jend - jbeg + 1;
+  const int ny_tot = ny + 2 * NGHOST;
+
+  if (rank == MASTER)
+    {
+      printf("\n\t Grid indices:");
+      printf("\n\t\t ibeg, iend = %d, %d; nx_tot = %d"    ,ibeg, iend, nx_tot);
+      printf("\n\t\t jbeg, jend = %d, %d; ny_tot = %d\n\n",jbeg, jend, ny_tot);
+    }
+
+#if defined(DEBUG)
+  for (int task=0 ; task<Nranks ; task++)
+    {
+      MPI_Barrier(comm);
+      
+      if (task == rank)
+	{
+	  printf("\n\t rank: %d", rank);
+	  printf("\n\t\t domain.start: %d - domain.end: %d", domain.start, domain.end);
+	  printf("\n\t\t nbrtop: %d - nbrbottom: %d \n", nbrtop, nbrbottom);
+	  fflush(stdout);
+	}
+    }
+
+  MPI_Barrier(comm);
+#endif /* DEBUG */
+  
+  /* --------------------------------------------------------
+     2. Generate grid, allocate memory
+        Not optimized because the grids are (unnecessarily)
+	replicated across MPI processes
+     -------------------------------------------------------- */
+
+  /* memory allocation */
+  MyData *xg = (MyData *) malloc((NX_GLOB + 2*NGHOST) * sizeof(MyData));
+  MyData *yg = (MyData *) malloc((NY_GLOB + 2*NGHOST) * sizeof(MyData));
+  assert((xg != NULL) && (yg != NULL));
+
+  /* grid coordinates */
+  for (int i=0 ; i<(NX_GLOB + 2*NGHOST) ; i++) xg[i] = xbeg + (i - ibeg + 1) * delta[X];
+  for (int j=0 ; j<(NY_GLOB + 2*NGHOST) ; j++) yg[j] = ybeg + (j - jbeg + 1) * delta[Y];
+  MyData *x = xg; /* Global and local grids are the same  */
+  MyData *y = yg; /* for serial version of the code       */
+
+  /* grids memory allocation */
+  MyData **phi  = Allocate_2DdblArray(ny_tot, nx_tot);
+  MyData **phi0 = Allocate_2DdblArray(ny_tot, nx_tot);
+    
+  /* --------------------------------------------------------
+     3. Initialize solution array to 0
+     -------------------------------------------------------- */
+    
+  for (int j=jbeg ; j<=jend ; j++)
+    for (int i=ibeg ; i<=iend ; i++)
+      {
+	phi0[j][i] = 0.0;
+	phi[j][i]  = 0.0;
+      }
+  
+  /* --------------------------------------------------------
+     4. Main iteration cycle
+     -------------------------------------------------------- */
+
+  const double time_start = MPI_Wtime();
+  
+  /* -- 4a. Set boundary conditions first -- */  
+  BoundaryConditions(phi0, x, y, nx, ny);
+  BoundaryConditions(phi, x, y, nx, ny);
+
+  MyData err = 1.0;
+  /* iterations */
+  int k = 0;
+  while (1)
+    {      
+      /* -- 4c. Jacobi's method and residual (interior points) -- */
+      /*       core algorithm                                     */
+
+      err = 0.0;
+      JacobiAlgorithm(phi, phi0, &err, delta,
+		      ibeg, iend, domain.start, domain.end);
+
+      if (!rank)
+	printf("\n\t Iteration = %d - err = %lg\n",k, err);
+
+      /* increase the counter of loop iterations */
+      k++;
+
+      /* get the total error */
+      MyData toterr;
+      /* combines values from all processes and distributes the result back to all processes */
+      MPI_Allreduce(&err, &toterr, 1, MPI_MyDatatype, MPI_SUM, comm);
+      
+      /* check convergence */
+      if (toterr <= TOL)
+	{
+	  /* master task gathers all the domains */
+	  get_domains(phi, &domain, NY_GLOB, rank, Nranks, comm);
+
+	  break;
+	}
+
+      /* -- 4b. MPI communications */
+      mpi_exchange_paired_1d(phi, NX_GLOB,
+			     nbrtop, nbrbottom,
+			     domain.start, domain.end, comm);
+
+      /* swap the pointers */
+      MyData **tmp = phi;
+      phi = phi0;
+      phi0 = tmp;
+    }
+
+  /* master rank writes the solution */
+  if (!rank)
+    {
+      WriteSolution(phi, nx, ny);
+      
+      printf("\n\t NX_GLOB x NY_GLOB = %d x %d\n", NX_GLOB, NY_GLOB);
+      printf("\n\t Time = %lf [s]\n\n", MPI_Wtime() - time_start);
+    }
+
+  // free memory
+  if (phi0)
+    {
+      free(phi0[0]);
+      free(phi0);
+    }
+
+  if (phi)
+    {
+      free(phi[0]);
+      free(phi);
+    }
+
+  if (yg)
+    free(yg);
+  
+  if (xg)
+    free(xg);
+
+  MPI_Finalize();
+  
+  return 0;
+}
+
+/* ********************************************************************* */
+void BoundaryConditions(MyData **const restrict phi,
+			MyData  *const restrict x,
+			MyData  *const restrict y,
+                        const int               nx,
+			const int               ny)
+/*
+*********************************************************************** */
+{
+  const int ibeg = NGHOST;
+  const int iend = ibeg + nx - 1;
+    
+  const int jbeg = NGHOST;
+  const int jend = jbeg + ny - 1;
+
+  int i,j;
+  
+  /* -- Left -- */
+  i = ibeg - 1;
+  for (int j=jbeg ; j<=jend ; j++)
+    phi[j][i] = (1.0 - y[j]);
+    
+  /* -- Right -- */
+  i = iend + 1;
+  for (int j=jbeg ; j<=jend ; j++)
+    phi[j][i] = (y[j] * y[j]);
+    
+  /* -- Bottom -- */    
+  j = jbeg - 1;
+  for (int i=ibeg ; i<=iend ; i++)
+    phi[j][i] = (1.0 - x[i]);
+    
+  /* -- Top -- */
+  j = jend + 1;
+  for (int i=ibeg ; i<=iend ; i++)
+    phi[j][i] = x[i];
+
+  return;
+}
+
+/* ********************************************************************* */
+
+void JacobiAlgorithm(MyData      **const restrict Phi,
+		     MyData      **const restrict Phi0,
+		     MyData       *const restrict error,
+		     const MyData *const restrict delta,
+		     const int                    ibeg,
+		     const int                    iend,
+		     const int                    jbeg,
+		     const int                    jend)
+{
+  *error = 0.0;
+  for (int j=jbeg ; j<=jend ; j++)
+    {
+      for (int i=ibeg ; i<=iend ; i++)
+	{
+	  Phi[j][i] = 0.25 * (Phi0[j][i-1] + Phi0[j][i+1] +
+			      Phi0[j-1][i] + Phi0[j+1][i]);
+                
+	  *error += delta[X] * delta[Y] * fabs(Phi[j][i] - Phi0[j][i]);
+	} /* loop over columns */
+    } /* loop over rows */
+  
+  return;
+}
+
+/* ********************************************************************* */
+void mpi_exchange_paired_1d(MyData **const buffer,
+			    const int      n,
+			    const int      nbrtop,
+			    const int      nbrbottom,
+			    const int      start,
+			    const int      end,
+			    const MPI_Comm comm1d)
+{
+  /* The function is called by each MPI rank      */
+  /* - nbrtop    is the MPI process with rank + 1 */
+  /* - nbrbottom is the MPI process with rank - 1 */
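+
+  /* Even and odd ranks order their blocking calls in opposite ways: even ranks send
+     first and then receive, while odd ranks receive first and then send. This pairing
+     guarantees that every blocking MPI_Send has a matching receive in progress on the
+     neighbor, avoiding the deadlock that may occur if all ranks called MPI_Send at the
+     same time and the messages were too large to be buffered internally.              */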
+
+  int rank;
+  MPI_Comm_rank(comm1d, &rank);
+  
+  if ((rank % 2) == 0) /* even rank */
+    {
+      /***************** First communication stage ************************************************/
+      /* Perform a blocking send to the top (rank+1) process */
+      MPI_Send(&buffer[end][1], n, MPI_MyDatatype, nbrtop, 0, comm1d);
+
+      /* Perform a blocking receive from the bottom (rank-1) process */
+      MPI_Recv(&buffer[start-1][1], n, MPI_MyDatatype, nbrbottom, 0, comm1d, MPI_STATUS_IGNORE);
+      /********************************************************************************************/
+
+      /**************** Second communication stage ************************************************/
+      /* - Perform a blocking send to the bottom (rank-1) process */
+      MPI_Send(&buffer[start][1], n, MPI_MyDatatype, nbrbottom, 1, comm1d);
+
+      /* Perform a blocking receive from the top (rank+1) process */
+      MPI_Recv(&buffer[end+1][1], n, MPI_MyDatatype, nbrtop, 1, comm1d, MPI_STATUS_IGNORE);
+      /********************************************************************************************/
+    }
+  else /* odd rank */
+    {
+      /***************** First communication stage ************************************************/
+      /* Perform a blocking receive from the bottom (rank-1) process */
+      MPI_Recv(&buffer[start-1][1], n, MPI_MyDatatype, nbrbottom, 0, comm1d, MPI_STATUS_IGNORE);
+
+      /* Perform a blocking send to the top (rank+1) process */
+      MPI_Send(&buffer[end][1], n, MPI_MyDatatype, nbrtop, 0, comm1d);
+      /********************************************************************************************/
+
+      /**************** Second communication stage ************************************************/
+      /* Perform a blocking receive from the top (rank+1) process */
+      MPI_Recv(&buffer[end+1][1], n, MPI_MyDatatype, nbrtop, 1, comm1d, MPI_STATUS_IGNORE);
+
+      /* - Perform a blocking send to the bottom (rank-1) process */
+      MPI_Send(&buffer[start][1], n, MPI_MyDatatype, nbrbottom, 1, comm1d);
+      /********************************************************************************************/      
+    }
+
+  return;
+}
+
+/* ********************************************************************* */
+
+void get_domains(MyData  **const buffer,
+		 myDomain *const domain,
+		 const int       grid_dim,
+		 const int       rank,
+		 const int       nranks,
+		 const MPI_Comm  comm)
+{
+  /* Master process gathers all the domains */
+  
+  /***************************** get the domain boundaries from each process *************************************/
+  myDomain *boundaries = NULL;
+  if (rank == MASTER)
+    {
+      boundaries = (myDomain *)malloc(nranks * sizeof(*boundaries));
+      if (boundaries == NULL)
+	{
+	  MPI_Abort(comm, EXIT_FAILURE);
+	  exit(EXIT_FAILURE);
+	}
+      boundaries[MASTER].start = domain->start;
+      boundaries[MASTER].end   = domain->end;
+    }
+
+  for (int task=0 ; task<nranks ; task++)
+    {
+      if (rank == MASTER)
+	{
+	  if (task)
+	    {
+	      MPI_Status status;
+	      MPI_Recv((void *)&boundaries[task], sizeof(myDomain), MPI_BYTE, task, 0, comm, &status);
+	      if ((status.MPI_ERROR != MPI_SUCCESS) || (status.MPI_SOURCE != task))
+		{
+		  free(boundaries);
+		  MPI_Abort(comm, EXIT_FAILURE);
+		  exit(EXIT_FAILURE);
+		}
+	    }
+#if defined(DEBUG)
+	  printf("\n\t boundaries[%d].start = %d - boundaries[%d].end = %d",
+		 task, boundaries[task].start, task, boundaries[task].end);
+#endif /* DEBUG */	  
+	} /* MASTER */
+      else if (task == rank)
+	{
+	  MPI_Send((void *)domain, sizeof(myDomain), MPI_BYTE, MASTER, 0, comm);
+	}
+    } /* loop over nranks */
+
+#if defined(DEBUG)
+  if (rank == MASTER)
+    {
+      printf("\n");
+      fflush(stdout);
+    }
+#endif /* DEBUG */
+
+  /**************************************************************************************************/
+
+  /***************************************** get the domain from each process *************************/
+
+  for (int task=0 ; task<nranks ; task++)
+    {
+      if (rank == MASTER)
+	{
+	  if (task)
+	    {
+	      MPI_Status status;
+	      /* number of grid points to receive (including ghost points) */
+	      const int nrows = (boundaries[task].end - boundaries[task].start + 1);
+	      const int elements = (nrows * (grid_dim + 2));
+	      MPI_Recv((void *)&buffer[boundaries[task].start][0], elements, MPI_MyDatatype, task, 0, comm, &status);
+	      if ((status.MPI_ERROR != MPI_SUCCESS) || (status.MPI_SOURCE != task))
+		{
+		  free(boundaries);
+		  MPI_Abort(comm, EXIT_FAILURE);
+		  exit(EXIT_FAILURE);
+		}
+	    }
+	} /* MASTER */
+      else if (task == rank)
+	{
+	  const int nrows = (domain->end - domain->start + 1);
+	  const int elements = (nrows * (grid_dim + 2));
+	  MPI_Send((void *)&buffer[domain->start][0], elements, MPI_MyDatatype, MASTER, 0, comm);
+	}
+    } /* loop over nranks */  
+  
+  /***************************************************************************************************/
+  
+  if (rank == MASTER)
+    free(boundaries);
+  
+  return;
+}
+
+/* ********************************************************************* */
+void WriteSolution(MyData **const phi,
+		   const int      nx,
+		   const int      ny)
+/*
+*********************************************************************** */
+{
+  const int ibeg = NGHOST;    
+  const int jbeg = NGHOST;
+  const int jend = jbeg + ny - 1;
+    
+  static int nfile = 0;  /* File counter */
+
+  char fname[128];
+  sprintf(fname,"jacobi2D_mpi_send_recv_paired_%02d.bin", nfile);
+    
+  FILE *fp;
+  printf ("> Writing %s\n",fname);
+  fp = fopen(fname, "wb");
+
+  /* discard boundaries */
+  for (int j=jbeg ; j<=jend ; j++)
+    {
+      fwrite (phi[j] + ibeg, sizeof(MyData), nx, fp);
+    }
+    
+  nfile++;
+  fclose(fp);
+}
diff --git a/jacobi/mpi/Send_Recv_paired/src/tools.c b/jacobi/mpi/Send_Recv_paired/src/tools.c
new file mode 100644
index 0000000000000000000000000000000000000000..be2761d170c23fde4d5d4aeda54af00e56abf3ce
--- /dev/null
+++ b/jacobi/mpi/Send_Recv_paired/src/tools.c
@@ -0,0 +1,59 @@
+/* ///////////////////////////////////////////////////////////////////// */
+/* Authors:  A. Mignone (mignone@to.infn.it)                             */
+/*           V. Cesare  (valentina.cesare@inaf.it)                       */
+/*           D. Goz     (david.goz@inaf.it)                              */
+/*                                                                       */
+/* Date   : June 2024                                                    */
+/*                                                                       */
+/* ///////////////////////////////////////////////////////////////////// */
+
+#include "tools.h"
+
+/* ********************************************************************* */
+MyData **Allocate_2DdblArray(const int nx, const int ny)
+/*
+ * Allocate memory for a double precision array with
+ * nx rows and ny columns
+ *********************************************************************** */
+{
+  MyData **buf = malloc(nx * sizeof(MyData *));
+  assert(buf != NULL);
+
+  buf[0] = (MyData *) malloc(nx * ny * sizeof(MyData));
+  assert(buf[0] != NULL);
+  
+  for (int j=1 ; j<nx ; j++)
+    buf[j] = buf[j-1] + ny;
+    
+  return buf;
+}
+
+/* ********************************************************************* */
+void Show_2DdblArray(const MyData **const A,
+		     const int            nx,
+		     const int            ny,
+		     const char *const    string)
+/* *********************************************************************** */
+{
+  printf ("%s\n",string);
+  printf ("------------------------------\n");
+  for (int i=0 ; i<nx ; i++)
+    {
+      for (int j=0 ; j<ny ; j++)
+	{
+	  printf ("%8.2f  ", A[i][j]);
+	}
+      printf ("\n");
+    }
+  printf ("------------------------------\n");
+}
+/* ********************************************************************* */
+
+double seconds()
+{
+  struct timeval tmp;
+  gettimeofday(&tmp, (struct timezone *)0);
+  double sec = tmp.tv_sec + ((double)tmp.tv_usec)/1000000.0;
+
+  return sec;
+}
diff --git a/jacobi/mpi/cartesian/Makefile b/jacobi/mpi/cartesian/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..02695b48106ca217759865cd3708f7ffe738fb1c
--- /dev/null
+++ b/jacobi/mpi/cartesian/Makefile
@@ -0,0 +1,57 @@
+#######################################################################
+# Author: David Goz (david.goz@inaf.it)                               #
+# June  2024                                                          #
+#######################################################################
+#
+# To see all the compilation options
+# $ make info
+#######################################################################
+
+# make.def defines how the application is compiled
+include make.def
+include make_mpi_path
+#######################################################################
+
+.PHONY: info mpi valgrind_memcheck valgrind_callgrind valgrind_cachegrind clean
+
+info:
+	@echo ' '
+	@echo '-----------------------------------------------------------------------------------------'
+	@echo '$$ make                     ---> compile the mpi application                             '
+	@echo '$$ make valgrind_memcheck   ---> run the mpi application using Valgrind under Memcheck   '
+	@echo '$$ make valgrind_callgrind  ---> run the mpi application using Valgrind under Callgrind  '
+	@echo '$$ make valgrind_cachegrind ---> run the mpi application using Valgrind under Cachegrind '
+	@echo '$$ make clean               ---> clean up all                                            '
+	@echo '$$ make info                ---> get make info                                           '
+	@echo '-----------------------------------------------------------------------------------------'
+	@echo ' '
+
+mpi: $(PROG)
+
+valgrind_memcheck: $(PROG_MEMCHECK)
+	@echo 'oooOOO... valgrind_memcheck ...OOOooo'
+	mpirun -n 4 valgrind --tool=memcheck -s --default-suppressions=yes --log-file=valgrind_memcheck_log_%p.txt ./$< 9 2
+	@echo 'oooOOO... valgrind_memcheck ...OOOooo'
+
+valgrind_callgrind: $(PROG_CALLGRIND)
+	@echo 'oooOOO... valgrind_callgrind ...OOOooo'
+	mpirun -n 4 valgrind --tool=callgrind --dump-instr=yes --collect-jumps=yes --log-file=valgrind_callgrind_log_.%p.txt ./$< 9 2
+	@echo ' '
+	@echo 'To generate a function-by-function summary from the profile data file:'
+	@echo '$$ callgrind_annotate --auto=yes callgrind.out.<pid> | less'
+	@echo '(kcachegrind is required in order to visualize the output using the GUI)'
+
+valgrind_cachegrind: $(PROG_CACHEGRIND)
+	@echo 'oooOOO... valgrind_cachegrind ...OOOooo'
+	mpirun -n 4 valgrind --tool=cachegrind --log-file=valgrind_cachegrind_log_.%p.txt ./$< 9 2
+	@echo '$$ cg_annotate --auto=yes cachegrind.out.<pid> | less'
+	@echo '(kcachegrind is required in order to visualize the output using the GUI)'
+	@echo 'oooOOO... valgrind_cachegrind ...OOOooo'
+
+clean:
+	rm -f *~ .*~ ./src/*~ ./src/*# ./include/*~ ./include/*# *~
+	rm -f $(PROG) $(PROG_DEBUG) $(PROG_MEMCHECK) $(PROG_CALLGRIND) $(PROG_CACHEGRIND)
+	rm -f valgrind_*.txt
+	rm -f cachegrind.out.*
+	rm -f callgrind.*
+	rm -f *bin
diff --git a/jacobi/mpi/cartesian/include/allvars.h b/jacobi/mpi/cartesian/include/allvars.h
new file mode 100644
index 0000000000000000000000000000000000000000..798e4e6a2f2a740b3ab5a1394ac85eef8da1255c
--- /dev/null
+++ b/jacobi/mpi/cartesian/include/allvars.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#define NGHOST   1
+#define NDIM     2 /* 2D */
+#define X        0
+#define Y        1
+#define TOL      1.e-5
+
+#include <mpi.h>
+
+#define MASTER 0
+
+#if defined(SINGLE_PRECISION)
+typedef float MyData;
+#define MPI_MyDatatype MPI_FLOAT
+#else
+/* double precision is assumed by default */
+typedef double MyData;
+#define MPI_MyDatatype MPI_DOUBLE
+#endif /* SINGLE_PRECISION */
diff --git a/jacobi/mpi/cartesian/include/tools.h b/jacobi/mpi/cartesian/include/tools.h
new file mode 100644
index 0000000000000000000000000000000000000000..82859ec511710474547355ede3fc38cf33a92154
--- /dev/null
+++ b/jacobi/mpi/cartesian/include/tools.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "allvars.h"
+#include <assert.h>
+#include <sys/time.h>
+#include <time.h>
+#include <sys/stat.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/* function prototypes */
+MyData **Allocate_2DdblArray(const int nx, const int ny);
+void Show_2DdblArray(const MyData **const A,
+                     const int            nx,
+                     const int            ny,
+                     const char *const    string);
+
+double seconds(void);
diff --git a/jacobi/mpi/cartesian/make.def b/jacobi/mpi/cartesian/make.def
new file mode 100644
index 0000000000000000000000000000000000000000..9b011868551a84d967d65b3d5b9ab4cab794e61e
--- /dev/null
+++ b/jacobi/mpi/cartesian/make.def
@@ -0,0 +1,45 @@
+CC     ?= gcc
+CFLAGS ?= -Wall -Wextra -march=native
+LIBS   ?= -lm -lmpi
+
+SYSTYPE = $(strip $(shell uname -n))
+
+PROG           ?= jacobi_mpi_cartesian_$(SYSTYPE)
+PROG_DEBUG      = $(PROG)_DEBUG
+PROG_MEMCHECK   = $(PROG)_MEMCHECK
+PROG_CALLGRIND  = $(PROG)_CALLGRIND
+PROG_CACHEGRIND = $(PROG)_CACHEGRIND
+
+HEADERS       = $(wildcard ./include/*.h)
+SOURCES       = $(wildcard ./src/*.c)
+DEPENDENCIES  = $(SOURCES) $(HEADERS) Makefile
+
+$(PROG): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -O3 -I./include -I$(MPI_INC) $(SOURCES) -o $@ -L$(MPI_LIB) $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_DEBUG): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -Og -ggdb3 -fno-omit-frame-pointer -I./include -I$(MPI_INC) $(SOURCES) -o $@ -L$(MPI_LIB) $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_DEBUG) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_MEMCHECK): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -Og -I./include -I$(MPI_INC) $(SOURCES) -o $@ -L$(MPI_LIB) $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_MEMCHECK) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_CALLGRIND): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -g -O3 -I./include -I$(MPI_INC) $(SOURCES) -o $@ -L$(MPI_LIB) $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_CALLGRIND) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_CACHEGRIND): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -g -O3 -I./include -I$(MPI_INC) $(SOURCES) -o $@ -L$(MPI_LIB) $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_CACHEGRIND) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
diff --git a/jacobi/mpi/cartesian/make_mpi_path b/jacobi/mpi/cartesian/make_mpi_path
new file mode 100644
index 0000000000000000000000000000000000000000..f2c3de9e0d1796034a518159f4c53a40e76919fc
--- /dev/null
+++ b/jacobi/mpi/cartesian/make_mpi_path
@@ -0,0 +1,9 @@
+# set the MPI install path
+
+MPI_INSTALL_PATH = /home/gozzilla/software/openmpi/openmpi-5.0.3
+
+
+
+MPI        = $(MPI_INSTALL_PATH)
+MPI_INC    = $(MPI)/include
+MPI_LIB    = $(MPI)/lib
\ No newline at end of file
diff --git a/jacobi/mpi/cartesian/src/jacobi_2D_mpi_cartesian.c b/jacobi/mpi/cartesian/src/jacobi_2D_mpi_cartesian.c
new file mode 100644
index 0000000000000000000000000000000000000000..826c7188482bbfe40983df8f4098cfdfa9297d35
--- /dev/null
+++ b/jacobi/mpi/cartesian/src/jacobi_2D_mpi_cartesian.c
@@ -0,0 +1,626 @@
+/* ///////////////////////////////////////////////////////////////////// */
+/* Authors:  A. Mignone (mignone@to.infn.it)                             */
+/*           V. Cesare  (valentina.cesare@inaf.it)                       */
+/*           D. Goz     (david.goz@inaf.it)                              */
+/*                                                                       */
+/* Date   : June 2024                                                    */
+/*                                                                       */
+/* ///////////////////////////////////////////////////////////////////// */
+
+#include "allvars.h"
+#include "tools.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+
+/* #define DEBUG */
+
+typedef struct MyGrid
+{
+  int start[NDIM];
+  int end[NDIM];
+} myDomain;
+
+/* function prototypes */
+void BoundaryConditions(MyData **const restrict,
+			MyData  *const restrict,
+			MyData  *const restrict,
+			const int,
+			const int);
+
+void JacobiAlgorithm(MyData **const restrict, MyData **const restrict, MyData *const restrict,
+		     const MyData *const restrict, const int, const int, const int, const int);
+
+void get_domains(MyData  **const buffer,
+		 myDomain *const domain,
+		 const MPI_Comm  comm);
+
+void mpi_exchange_cartesian(MyData **const  buffer,
+			    const int       nbrtop,
+			    const int       nbrbottom,
+			    const int       nbrleft,
+			    const int       nbrright,
+			    myDomain *const domain,
+			    const int       grid_y,
+			    const MPI_Comm  comm2d);
+
+void WriteSolution(MyData **const phi, const int nx, const int ny);
+
+void copy_grids(MyData **const restrict A,
+                MyData **const restrict B,
+                const int               xbeg,
+                const int               xend,
+                const int               ybeg,
+		const int               yend);
+
+int main(int argc, char **argv)
+{
+  int rank, Nranks;
+  const MPI_Comm comm = MPI_COMM_WORLD;
+  MPI_Init(&argc, &argv);
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &Nranks);
+
+  if ((argc <= 2) && (!rank))
+    {
+      printf("\n\t Usage: <executable> <grid_size> <cartesian communicator size_X>\n\n");
+      MPI_Abort(comm, EXIT_FAILURE);
+      exit(EXIT_FAILURE);
+    }
+
+  /* global X and Y grid size (square matrix supposed) */
+  const int NX_GLOB = (int) strtol(argv[1], NULL, 10);
+  const int NY_GLOB = NX_GLOB;
+
+  /********************************** Cartesian topology ******************************************************/
+  const int cartesian_grid_x = (int)strtol(argv[2], NULL, 10);
+  const int cartesian_grid_y = ((Nranks % cartesian_grid_x == 0) ? (Nranks / cartesian_grid_x) : -1);
+  if (cartesian_grid_y == -1)
+    {
+      if (!rank)
+        printf("\n\t Nranks mod cartesian_grid_x != 0 ... aborting ...\n");
+
+      /* every rank aborts, so no process reaches MPI_Cart_create with an invalid grid */
+      MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+      exit(EXIT_FAILURE);
+    }
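+
+  /* Illustrative example (matching the Valgrind targets in the Makefile): running
+     "mpirun -n 4 <executable> 9 2" gives cartesian_grid_x = 2 and
+     cartesian_grid_y = 4 / 2 = 2, i.e. a 2 x 2 process grid                        */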
+  
+  const int dims[NDIM] = {cartesian_grid_x, cartesian_grid_y};
+  const int periods[NDIM] = {0, 0}; /* not periodic */
+  const int reorder = 0;            /* setting reorder to TRUE would allow MPI to reorder  */
+                                    /* the process ranks so that neighbors end up close to */
+                                    /* each other on the actual hardware, for performance  */
+
+  /* make a new communicator to which topology information has been attached */
+  MPI_Comm comm2d = MPI_COMM_NULL;
+  MPI_Cart_create(MPI_COMM_WORLD, NDIM, dims, periods, reorder, &comm2d);
+  if (comm2d == MPI_COMM_NULL)
+    {
+      printf("\n\t Process %d is not part of the new communicator \n", rank);
+      fflush(stdout);
+      MPI_Abort(comm, EXIT_FAILURE);
+      exit(EXIT_FAILURE);
+    }
+
+  /* determine the process coords in cartesian topology given rank in group */
+  int coords[NDIM];
+  MPI_Cart_coords(comm2d, rank, NDIM, coords);
+
+  /* get bottom and top neighbors (X direction) */
+  int nbrtop, nbrbottom;
+  MPI_Cart_shift(comm2d, X, 1, &nbrbottom, &nbrtop);
+
+  /* get left and right neighbors (Y direction) */
+  int nbrright, nbrleft;
+  MPI_Cart_shift(comm2d, Y, 1, &nbrleft, &nbrright);
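+
+  /* with a non-periodic topology, MPI_Cart_shift returns MPI_PROC_NULL for    */
+  /* neighbors that fall outside the grid; communications to/from              */
+  /* MPI_PROC_NULL are no-ops, so domain-edge ranks need no special handling   */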
+  
+  /************************************************************************************************************/
+  /* 2D MPI-cartesian decomposition:
+     the grids are replicated across the MPI processes.
+     This approach is not the most efficient in terms of memory usage,
+     because the arrays are replicated on every MPI process instead of being distributed */
+
+  /* get the remainder, i.e. take into account the uneven
+     distribution of points among the processes            */
+  const int rem[NDIM] = {(NX_GLOB + 2) % dims[X],
+                         (NY_GLOB + 2) % dims[Y]};
+
+  /* get the amount of data for each MPI process:
+     - chunk is supposed to be >= 1 */
+  const int chunk[NDIM] = {(NX_GLOB + 2 - rem[X]) / dims[X],
+                           (NY_GLOB + 2 - rem[Y]) / dims[Y]};
+  if ((chunk[X] < 1) || (chunk[Y] < 1))
+    {
+      printf("\n\t chunk[X] < 1 || chunk[Y] < 1 ... aborting ...[rank: %d]\n", rank);
+      MPI_Comm_free(&comm2d);
+      MPI_Abort(comm, EXIT_FAILURE);
+      exit(EXIT_FAILURE);
+    }
+  
+  /* get the subgrid dimension along X and Y directions: */
+  int incr[NDIM], offset[NDIM];
+  for (int dim=0 ; dim<NDIM ; dim++)
+    {
+      if (coords[dim] < rem[dim])
+	{
+	  incr[dim] = chunk[dim] + 1;
+	  offset[dim] = 0;
+	}
+      else
+	{
+	  incr[dim] = chunk[dim];
+	  offset[dim] = rem[dim];
+	}
+    }
+
+  /* subdomain managed by the task */
+  myDomain domain;
+  for (int dim=0 ; dim<NDIM ; dim++)
+    {
+      domain.start[dim] = ((coords[dim] * incr[dim]) + offset[dim]);
+      domain.end[dim]   = (domain.start[dim] + incr[dim]) - 1;
+
+      /* boundaries (NX_GLOB == NY_GLOB, so the same bound works for both directions) */
+      domain.start[dim] = ((domain.start[dim] == 0)         ? NGHOST  : domain.start[dim]);
+      domain.end[dim]   = ((domain.end[dim] == NX_GLOB + 1) ? NX_GLOB : domain.end[dim]);
+    }
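+
+  /* worked example (along X only): NX_GLOB = 8 gives NX_GLOB + 2 = 10 points; with  */
+  /* dims[X] = 4, rem[X] = 2 and chunk[X] = 2, so coords 0-1 own 3 points each       */
+  /* ([0-2], [3-5]) and coords 2-3 own 2 points each ([6-7], [8-9]); the clamping    */
+  /* above then maps start 0 -> NGHOST and end 9 -> NX_GLOB, so that every task      */
+  /* updates interior points only                                                    */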
+
+#if defined(DEBUG)
+  for (int task=0 ; task<Nranks ; task++)
+    {
+      MPI_Barrier(comm);
+      
+      if (task == rank)
+	{
+	  printf("\n\t rank: %d", rank);
+	  printf("\n\t\t coords = [%d, %d]", coords[X], coords[Y]);
+	  printf("\n\t\t domain.start[X] = %d - domain.end[X] = %d", domain.start[X], domain.end[X]);
+	  printf("\n\t\t domain.start[Y] = %d - domain.end[Y] = %d", domain.start[Y], domain.end[Y]);
+	  printf("\n\t\t nbrtop  : %d - nbrbottom: %d", nbrtop, nbrbottom);
+	  printf("\n\t\t nbrright: %d - nbrleft  : %d \n", nbrright, nbrleft);
+	  fflush(stdout);
+	}
+    }
+
+  MPI_Barrier(comm);
+#endif /* DEBUG */
+  
+  /******************************************************************************************************/
+  
+  const MyData xbeg = 0.0;
+  const MyData xend = 1.0;
+  const MyData ybeg = 0.0;
+  const MyData yend = 1.0;
+
+  const MyData delta[NDIM] = {(xend - xbeg)/(NX_GLOB + 1),
+			      (yend - ybeg)/(NY_GLOB + 1)};
+
+  /* --------------------------------------------------------
+     1. Set grid indices
+     -------------------------------------------------------- */
+    
+  const int ibeg   = NGHOST;
+  const int iend   = ibeg + NX_GLOB - 1;
+  const int nx     = iend - ibeg + 1;
+  const int nx_tot = nx + 2 * NGHOST;
+    
+  const int jbeg   = NGHOST;
+  const int jend   = jbeg + NY_GLOB - 1;
+  const int ny     = jend - jbeg + 1;
+  const int ny_tot = ny + 2 * NGHOST;
+
+  if (rank == MASTER)
+    {
+      printf("\n\t Grid indices:");
+      printf("\n\t\t ibeg, iend = %d, %d; nx_tot = %d"    ,ibeg, iend, nx_tot);
+      printf("\n\t\t jbeg, jend = %d, %d; ny_tot = %d\n\n",jbeg, jend, ny_tot);
+    }
+  
+  /* --------------------------------------------------------
+     2. Generate grid, allocate memory
+        Not optimized because the grids are (unnecessarily)
+	replicated across MPI processes
+     -------------------------------------------------------- */
+
+  /* memory allocation */
+  MyData *xg = (MyData *) malloc((NX_GLOB + 2*NGHOST) * sizeof(MyData));
+  MyData *yg = (MyData *) malloc((NY_GLOB + 2*NGHOST) * sizeof(MyData));
+  assert((xg != NULL) && (yg != NULL));
+
+  /* initial conditions */
+  for (int i=0 ; i<(NX_GLOB + 2*NGHOST) ; i++) xg[i] = xbeg + (i - ibeg + 1) * delta[X];
+  for (int j=0 ; j<(NY_GLOB + 2*NGHOST) ; j++) yg[j] = ybeg + (j - jbeg + 1) * delta[Y];
+  MyData *x = xg; /* Global and local grids coincide here   */
+  MyData *y = yg; /* because the grid is fully replicated   */
+
+  /* grids memory allocation */
+  MyData **phi  = Allocate_2DdblArray(ny_tot, nx_tot);
+  MyData **phi0 = Allocate_2DdblArray(ny_tot, nx_tot);
+    
+  /* --------------------------------------------------------
+     3. Initialize solution array to 0
+     -------------------------------------------------------- */
+    
+  for (int j=jbeg ; j<=jend ; j++)
+    for (int i=ibeg ; i<=iend ; i++)
+      {
+	phi0[j][i] = 0.0;
+	phi[j][i]  = 0.0;
+      }
+  
+  /* --------------------------------------------------------
+     4. Main iteration cycle
+     -------------------------------------------------------- */
+
+  const double time_start = MPI_Wtime();
+  
+  /* -- 4a. Set boundary conditions first -- */  
+  BoundaryConditions(phi0, x, y, nx, ny);
+  BoundaryConditions(phi, x, y, nx, ny);
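+  /* both grids get the boundary values because the phi/phi0 pointers are swapped every iteration */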
+
+  MyData err = 1.0;
+  /* iterations */
+  int k = 0;
+  while (1)
+    {
+      /* -- 4b. Jacobi's method and residual (interior points) -- */
+      /*       core algorithm                                     */
+
+      err = 0.0;
+      JacobiAlgorithm(phi, phi0, &err, delta,
+		      domain.start[Y], domain.end[Y],
+		      domain.start[X], domain.end[X]);
+
+      if (!rank)
+	printf("\n\t Iteration = %d - err = %lg\n",k, err);
+
+      /* increase the counter of loop iterations */
+      k++;
+
+      /* get the total error */
+      MyData toterr;
+      /* combines values from all processes and distributes the result back to all processes */
+      MPI_Allreduce(&err, &toterr, 1, MPI_MyDatatype, MPI_SUM, comm2d);
+      
+      /* check convergence */
+      if (toterr <= TOL)
+	{
+	  /* master task gathers all the domains */
+	  get_domains(phi, &domain, comm2d);
+
+	  break;
+	}
+
+      /* -- 4c. MPI communications */
+      mpi_exchange_cartesian(phi,
+			     nbrtop, nbrbottom, nbrleft, nbrright,
+			     &domain, NY_GLOB,
+			     comm2d);
+
+      /* swap the pointers */
+      MyData **tmp = phi;
+      phi = phi0;
+      phi0 = tmp;
+    }
+
+  /* master rank writes the solution */
+  if (!rank)
+    {
+      WriteSolution(phi, nx, ny);
+      
+      printf("\n\t NX_GLOB x NY_GLOB = %d x %d\n", NX_GLOB, NY_GLOB);
+      printf("\n\t Time = %lf [s]\n\n", MPI_Wtime() - time_start);
+    }
+
+  // free memory
+  if (phi0)
+    {
+      free(phi0[0]);
+      free(phi0);
+    }
+
+  if (phi)
+    {
+      free(phi[0]);
+      free(phi);
+    }
+
+  if (yg)
+    free(yg);
+  
+  if (xg)
+    free(xg);
+
+  MPI_Comm_free(&comm2d);
+  
+  MPI_Finalize();
+  
+  return 0;
+}
+
+/* ********************************************************************* */
+void BoundaryConditions(MyData **const restrict phi,
+			MyData  *const restrict x,
+			MyData  *const restrict y,
+                        const int               nx,
+			const int               ny)
+/*
+*********************************************************************** */
+{
+  const int ibeg = NGHOST;
+  const int iend = ibeg + nx - 1;
+    
+  const int jbeg = NGHOST;
+  const int jend = jbeg + ny - 1;
+
+  int i,j;
+  
+  /* -- Left -- */
+  i = ibeg - 1;
+  for (int j=jbeg ; j<=jend ; j++)
+    phi[j][i] = (1.0 - y[j]);
+    
+  /* -- Right -- */
+  i = iend + 1;
+  for (int j=jbeg ; j<=jend ; j++)
+    phi[j][i] = (y[j] * y[j]);
+    
+  /* -- Bottom -- */    
+  j = jbeg - 1;
+  for (int i=ibeg ; i<=iend ; i++)
+    phi[j][i] = (1.0 - x[i]);
+    
+  /* -- Top -- */
+  j = jend + 1;
+  for (int i=ibeg ; i<=iend ; i++)
+    phi[j][i] = x[i];
+
+  return;
+}
+
+/* ********************************************************************* */
+
+void JacobiAlgorithm(MyData      **const restrict Phi,
+		     MyData      **const restrict Phi0,
+		     MyData       *const restrict error,
+		     const MyData *const restrict delta,
+		     const int                    ibeg,
+		     const int                    iend,
+		     const int                    jbeg,
+		     const int                    jend)
+{
+  *error = 0.0;
+  for (int j=jbeg ; j<=jend ; j++)
+    {
+      for (int i=ibeg ; i<=iend ; i++)
+	{
+	  Phi[j][i] = 0.25 * (Phi0[j][i-1] + Phi0[j][i+1] +
+			      Phi0[j-1][i] + Phi0[j+1][i]);
+                
+	  *error += delta[X] * delta[Y] * fabs(Phi[j][i] - Phi0[j][i]);
+	} /* loop over columns */
+    } /* loop over rows */
+  
+  return;
+}
+
+/* ********************************************************************* */
+void mpi_exchange_cartesian(MyData **const  buffer,
+			    const int       nbrtop,
+			    const int       nbrbottom,
+			    const int       nbrleft,
+			    const int       nbrright,
+			    myDomain *const domain,
+			    const int       grid_y,
+			    const MPI_Comm  comm2d)
+{
+ /* In the 1D data-exchange routines seen so far, buffers are contiguous areas of memory,
+    so a predefined datatype (e.g. MPI_DOUBLE) together with a count of occurrences is
+    sufficient to describe the buffer to be sent.
+    MPI derived datatypes allow the programmer to describe non-contiguous areas of memory,
+    such as a column of an array stored in row-major order, as in the C programming language */
+  
+  /* Creates a vector (strided) datatype:                                              */
+  /* - 1st argument (count)      : number of blocks;                                   */
+  /* - 2nd argument (blocklength): number of elements in each block;                   */
+  /* - 3rd argument (stride)     : number of elements between the start of each block; */
+  /* - 4th argument (oldtype)    : old datatype;                                       */
+  /* - 5th argument (newtype)    : new datatype                                        */
+
+  /* count = (x_end - x_start + 1), i.e. the size of each column                       */
+  /* blocklength = 1, i.e. one double (old datatype) in each block                     */
+  /* stride = NY_GLOB + 2, i.e. ghost points are discarded (grid is replicated)        */
+  /* oldtype = MPI_MyDatatype                                                          */
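+  /*                                                                                   */
+  /* e.g. with count = 3 and stride = grid_y + 2, the datatype selects the elements    */
+  /* at offsets 0, (grid_y + 2) and 2 * (grid_y + 2) from the starting address,        */
+  /* i.e. one element per row of the row-major grid: a column                          */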
+  
+  MPI_Datatype column;
+  MPI_Type_vector((domain->end[X] - domain->start[X] + 1), 1, grid_y + 2, MPI_MyDatatype, &column);
+  /* commits the datatype */
+  MPI_Type_commit(&column);
+
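+  /* MPI_Sendrecv posts the send and the matching receive in a single call,   */
+  /* so the halo exchange below cannot deadlock the way mismatched blocking   */
+  /* MPI_Send/MPI_Recv pairs could                                            */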
+  /* communication with nbrtop and nbrbottom */
+  const int data_row_size = (domain->end[Y] - domain->start[Y] + 1);
+  MPI_Sendrecv(&buffer[domain->end[X]][domain->start[Y]]    , data_row_size, MPI_MyDatatype, nbrtop,    0,
+	       &buffer[domain->start[X]-1][domain->start[Y]], data_row_size, MPI_MyDatatype, nbrbottom, 0,
+	       comm2d, MPI_STATUS_IGNORE);
+
+  MPI_Sendrecv(&buffer[domain->start[X]][domain->start[Y]], data_row_size, MPI_MyDatatype, nbrbottom, 1,
+	       &buffer[domain->end[X]+1][domain->start[Y]], data_row_size, MPI_MyDatatype, nbrtop,    1,
+	       comm2d, MPI_STATUS_IGNORE);
+
+  /* communication with nbrleft, nbrright - vector datatype is used */
+  MPI_Sendrecv(&buffer[domain->start[X]][domain->end[Y]],     1, column, nbrright, 0,
+	       &buffer[domain->start[X]][domain->start[Y]-1], 1, column, nbrleft,  0,
+	       comm2d, MPI_STATUS_IGNORE);
+
+  MPI_Sendrecv(&buffer[domain->start[X]][domain->start[Y]], 1, column, nbrleft, 1,
+	       &buffer[domain->start[X]][domain->end[Y]+1], 1, column, nbrright, 1,
+	       comm2d, MPI_STATUS_IGNORE);  
+  
+  /* when the datatype is no longer needed, it should be freed, */
+  /* i.e. the handle is set to MPI_DATATYPE_NULL                */
+  MPI_Type_free(&column);
+  
+  return;
+}
+
+/* ********************************************************************* */
+
+void get_domains(MyData  **const buffer,
+		 myDomain *const domain,
+		 const MPI_Comm  comm)
+{
+  int rank, nranks;
+  MPI_Comm_size(comm, &nranks);
+  MPI_Comm_rank(comm, &rank);
+  
+  /* Master process gathers all the domains */
+  
+  /***************************** get the domain boundaries from each process *************************************/
+  myDomain *boundaries = NULL;
+  if (rank == MASTER)
+    {
+      boundaries = (myDomain *)malloc(nranks * sizeof(*boundaries));
+      assert(boundaries != NULL);
+    }
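+
+  /* myDomain holds only ints, so gathering it as raw MPI_BYTEs is safe here; */
+  /* defining an MPI derived datatype for the struct would be an alternative  */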
+
+  MPI_Gather(domain,     sizeof(*domain), MPI_BYTE,
+	     boundaries, sizeof(*domain), MPI_BYTE,
+	     MASTER, comm);
+
+#if defined(DEBUG)
+  if (rank == MASTER)
+    {
+      for (int task=0 ; task<nranks ; task++)
+	{
+	  printf("\n\t Task: %d", task);
+	  printf("\n\t\t boundaries[%d].start[X] = %d - boundaries[%d].end[X] = %d",
+		 task, boundaries[task].start[X], task, boundaries[task].end[X]);
+	  printf("\n\t\t boundaries[%d].start[Y] = %d - boundaries[%d].end[Y] = %d\n",
+		 task, boundaries[task].start[Y], task, boundaries[task].end[Y]);
+	}
+      printf("\n");
+      fflush(stdout);
+    }
+#endif /* DEBUG */
+
+  /**************************************************************************************************/
+  
+  /***************************************** get the domain from each process *************************/
+  /* MPI_Gatherv
+     Gathers into specified locations from all processes in a group
+
+     Synopsis
+     int MPI_Gatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                     void *recvbuf, const int recvcounts[], const int displs[],
+                     MPI_Datatype recvtype, int root, MPI_Comm comm) */
+
+  int *recvcounts = NULL;
+  int *displs = NULL;
+  MyData *recvbuf = NULL;
+  MyData *sendbuf = NULL;
+  
+  if (rank == MASTER)
+    {
+      recvcounts = (int *)malloc(nranks * sizeof(*recvcounts));
+      assert(recvcounts != NULL);
+
+      displs = (int *)malloc(nranks * sizeof(*displs));
+      assert(displs != NULL);
+
+      for (int task=0 ; task<nranks ; task++)
+	{
+	  const int nrows = (boundaries[task].end[X] - boundaries[task].start[X] + 1);
+	  const int ncols = (boundaries[task].end[Y] - boundaries[task].start[Y] + 1);
+	  recvcounts[task] = (nrows * ncols);
+	}
+
+      displs[0] = 0;
+      for (int task=1 ; task<nranks ; task++)
+	displs[task] = (recvcounts[task - 1] + displs[task - 1]);
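+      /* e.g. with 4 ranks and recvcounts = {9, 9, 6, 6}, */
+      /* the displacements are displs = {0, 9, 18, 24}    */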
+
+      /* allocate the recvbuffer */
+      recvbuf = (MyData *)malloc((displs[nranks - 1] + recvcounts[nranks - 1]) * sizeof(*recvbuf));
+      assert(recvbuf != NULL);
+      
+#if defined(DEBUG)
+      printf("\n\t********************** recvcounts - displs *************************");
+      for (int task=0 ; task<nranks ; task++)
+	{
+	  printf("\n\t recvcounts[%d] = %d - displs[%d] = %d",
+		 task, recvcounts[task], task, displs[task]);
+	}
+      printf("\n\t********************************************************************\n");
+      fflush(stdout);
+#endif /* DEBUG */
+
+    } /* MASTER */
+
+  /* every process sets up the sendbuf */
+  const int nrows = (domain->end[X] - domain->start[X] + 1);
+  const int ncols = (domain->end[Y] - domain->start[Y] + 1);
+  sendbuf = (MyData *)malloc(nrows * ncols * sizeof(*sendbuf));
+  assert(sendbuf != NULL);
+  /* store the actual grid owned by each process into sendbuf */
+  for (int row=0 ; row<nrows ; row++)
+    memcpy(&sendbuf[row * ncols], &buffer[domain->start[X]+row][domain->start[Y]], (ncols * sizeof(MyData)));
+  
+  /* perform the MPI collective */
+  MPI_Gatherv(sendbuf, (nrows * ncols), MPI_MyDatatype,
+	      recvbuf, recvcounts, displs, MPI_MyDatatype,
+	      MASTER, comm);
+
+  /* store the recvbuf into the actual grid owned by the MASTER task */
+  if (rank == MASTER)
+    {
+      for (int task=1 ; task<nranks ; task++)
+	{
+	  const int nrows = (boundaries[task].end[X] - boundaries[task].start[X] + 1);
+	  const int ncols = (boundaries[task].end[Y] - boundaries[task].start[Y] + 1);
+	  for (int row=0 ; row<nrows ; row++)
+	    memcpy(&buffer[boundaries[task].start[X] + row][boundaries[task].start[Y]],
+		   &recvbuf[displs[task] + (row * ncols)], (ncols * sizeof(MyData)));
+	}
+    }
+
+  free(sendbuf);
+  if (rank == MASTER)
+    {
+      free(recvbuf);
+      free(displs);
+      free(recvcounts);
+      free(boundaries);
+    }
+  
+  return;
+}
+
+/* ********************************************************************* */
+void WriteSolution(MyData **const phi,
+		   const int      nx,
+		   const int      ny)
+/*
+*********************************************************************** */
+{
+  const int ibeg = NGHOST;    
+  const int jbeg = NGHOST;
+  const int jend = jbeg + ny - 1;
+    
+  static int nfile = 0;  /* File counter */
+
+  char fname[128];
+  sprintf(fname,"jacobi2D_mpi_cartesian_%02d.bin", nfile);
+    
+  FILE *fp;
+  printf ("> Writing %s\n",fname);
+  fp = fopen(fname, "wb");
+
+  /* discard boundaries */
+  for (int j=jbeg ; j<=jend ; j++)
+    {
+      fwrite (phi[j] + ibeg, sizeof(MyData), nx, fp);
+    }
+    
+  nfile++;
+  fclose(fp);
+}
diff --git a/jacobi/mpi/cartesian/src/tools.c b/jacobi/mpi/cartesian/src/tools.c
new file mode 100644
index 0000000000000000000000000000000000000000..be2761d170c23fde4d5d4aeda54af00e56abf3ce
--- /dev/null
+++ b/jacobi/mpi/cartesian/src/tools.c
@@ -0,0 +1,59 @@
+/* ///////////////////////////////////////////////////////////////////// */
+/* Authors:  A. Mignone (mignone@to.infn.it)                             */
+/*           V. Cesare  (valentina.cesare@inaf.it)                       */
+/*           D. Goz     (david.goz@inaf.it)                              */
+/*                                                                       */
+/* Date   : June 2024                                                    */
+/*                                                                       */
+/* ///////////////////////////////////////////////////////////////////// */
+
+#include "tools.h"
+
+/* ********************************************************************* */
+MyData **Allocate_2DdblArray(const int nx, const int ny)
+/*
+ * Allocate memory for a double precision array with
+ * nx rows and ny columns
+ *********************************************************************** */
+{
+  MyData **buf = malloc(nx * sizeof(MyData *));
+  assert(buf != NULL);
+
+  buf[0] = (MyData *) malloc(nx * ny * sizeof(MyData));
+  assert(buf[0] != NULL);
+  
+  for (int j=1 ; j<nx ; j++)
+    buf[j] = buf[j-1] + ny;
+    
+  return buf;
+}
+
+/* ********************************************************************* */
+void Show_2DdblArray(const MyData **const A,
+		     const int            nx,
+		     const int            ny,
+		     const char *const    string)
+/* *********************************************************************** */
+{
+  printf ("%s\n",string);
+  printf ("------------------------------\n");
+  for (int i=0 ; i<nx ; i++)
+    {
+      for (int j=0 ; j<ny ; j++)
+	{
+	  printf ("%8.2f  ", A[i][j]);
+	}
+      printf ("\n");
+    }
+  printf ("------------------------------\n");
+}
+/* ********************************************************************* */
+
+double seconds()
+{
+  struct timeval tmp;
+  gettimeofday(&tmp, (struct timezone *)0);
+  double sec = tmp.tv_sec + ((double)tmp.tv_usec)/1000000.0;
+
+  return sec;
+}
diff --git a/jacobi/mpi/miscellaneous/cartesian.cpp b/jacobi/mpi/miscellaneous/cartesian.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c4a97031a79e7db5cdda3bc42efe3f3afdc7e39c
--- /dev/null
+++ b/jacobi/mpi/miscellaneous/cartesian.cpp
@@ -0,0 +1,71 @@
+#include <mpi.h>
+#include <iostream>
+#include <cstdlib>   /* strtol(), exit(), EXIT_FAILURE */
+
+#define SIZE 2
+#define X    0
+#define Y    1
+
+int main(int argc, char **argv)
+{
+  using namespace std;
+  
+  int task, ntasks;
+  MPI_Init(&argc, &argv);
+  MPI_Comm_size(MPI_COMM_WORLD, &ntasks);
+  MPI_Comm_rank(MPI_COMM_WORLD, &task);
+
+  if (argc < 2)
+    {
+      if (!task)
+	{
+	  cout << "\n\t Usage: <executable> <number processes along X> \n" << endl;
+	  MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+	  exit(EXIT_FAILURE);
+	}
+    }
+
+  const int cartesian_grid_x = (int)strtol(argv[1], NULL, 10);
+  const int cartesian_grid_y = ((ntasks % cartesian_grid_x == 0) ? (ntasks / cartesian_grid_x) : -1);
+  if (cartesian_grid_y == -1)
+    {
+      if (!task)
+	{
+	  cout << "\n\t ntasks % cartesian_grid_x != 0 ... aborting ...\n" << endl;
+	  MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+	  exit(EXIT_FAILURE);
+	}
+    }
+  
+  static int dims[SIZE] = {cartesian_grid_x, cartesian_grid_y};
+  static int periods[SIZE] = {0, 0};
+  static int reorder = 0;
+  MPI_Comm comm2d;
+  MPI_Cart_create(MPI_COMM_WORLD, SIZE, dims, periods, reorder, &comm2d);
+
+  int coords[SIZE];
+  MPI_Cart_coords(comm2d, task, SIZE, coords);
+
+  int nbrright, nbrleft;
+  MPI_Cart_shift(comm2d, Y, 1, &nbrleft, &nbrright);
+
+  int nbrtop, nbrbottom;
+  MPI_Cart_shift(comm2d, X, 1, &nbrbottom, &nbrtop);
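+
+  /* illustrative run (the binary name is just an example):   */
+  /*   mpirun -n 4 ./cartesian 2   ->   2 x 2 process grid    */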
+
+  for (int rank=0 ; rank<ntasks; rank++)
+    {
+      MPI_Barrier(MPI_COMM_WORLD);
+      if (rank == task)
+	{
+	  cout << "\n\t Task: " << task << endl;
+	  cout << "\t\t coords[" << coords[X] << "," << coords[Y] << "]" << endl;
+	  cout << "\t\t nbrright: " << nbrright << " - nbrleft  : " << nbrleft << endl;
+	  cout << "\t\t nbrtop  : " << nbrtop   << " - nbrbottom: " << nbrbottom << endl;
+	  cout << endl;
+	}
+    }
+  
+  MPI_Comm_free(&comm2d);
+  MPI_Finalize();
+  
+  return 0;
+}
diff --git a/jacobi/openmp/not_opt/Makefile b/jacobi/openmp/not_opt/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..f1dca1984b87035b5b6a384244aaae3497018b6e
--- /dev/null
+++ b/jacobi/openmp/not_opt/Makefile
@@ -0,0 +1,65 @@
+#######################################################################
+# Author: David Goz (david.goz@inaf.it)                               #
+# June  2024                                                          #
+#######################################################################
+#
+# To see all the compilation options
+# $ make info
+#######################################################################
+
+# make.def defines how the application is compiled
+include make.def
+#######################################################################
+
+.PHONY: info serial omp valgrind_memcheck valgrind_callgrind valgrind_cachegrind debug clean 
+
+info:
+	@echo ' '
+	@echo '--------------------------------------------------------------------------------------------'
+	@echo '$$ make serial                 ---> compile the serial application                          '
+	@echo '$$ make omp                    ---> compile the omp application                             '
+	@echo '$$ make debug                  ---> debug the omp application                               '
+	@echo '$$ make valgrind_memcheck      ---> run the omp application using Valgrind under Memcheck   '
+	@echo '$$ make valgrind_callgrind     ---> run the omp application using Valgrind under Callgrind  '
+	@echo '$$ make valgrind_cachegrind    ---> run the omp application using Valgrind under Cachegrind '
+	@echo '$$ make clean                  ---> clean up all                                            '
+	@echo '$$ make info                   ---> get make info                                           '
+	@echo '--------------------------------------------------------------------------------------------'
+	@echo ' '
+
+serial: $(PROG)
+
+omp: $(PROG_OMP)
+
+debug: $(PROG_DEBUG)
+	@echo 'OOOooo... debugging ...oooOOO'
+	gdb --args ./$<
+	@echo 'OOOooo... debugging ... oooOOO'
+
+valgrind_memcheck: $(PROG_MEMCHECK)
+	@echo 'oooOOO... valgrind_memcheck ...OOOooo'
+	valgrind --tool=memcheck -s --leak-check=full --show-leak-kinds=all --track-origins=yes --read-var-info=yes --log-file=valgrind_memcheck_log_%p.txt ./$< 10 10 2
+	@echo 'oooOOO... valgrind_memcheck ...OOOooo'
+
+valgrind_callgrind: $(PROG_CALLGRIND)
+	@echo 'oooOOO... valgrind_callgrind ...OOOooo'
+	valgrind --tool=callgrind --dump-instr=yes --collect-jumps=yes --log-file=valgrind_callgrind_log_.%p.txt ./$< 128 128 2
+	@echo ' '
+	@echo 'To generate a function-by-function summary from the profile data file:'
+	@echo '$$ callgrind_annotate --auto=yes callgrind.out.<pid> | less'
+	@echo '(kcachegrind is required in order to visualize the output using the GUI)'
+
+valgrind_cachegrind: $(PROG_CACHEGRIND)
+	@echo 'oooOOO... valgrind_cachegrind ...OOOooo'
+	valgrind --tool=cachegrind --log-file=valgrind_cachegrind_log_.%p.txt ./$< 128 128 2
+	@echo '$$ cg_annotate --auto=yes cachegrind.out.<pid> | less'
+	@echo '(kcachegrind is required in order to visualize the output using the GUI)'
+	@echo 'oooOOO... valgrind_cachegrind ...OOOooo'
+
+clean:
+	rm -f *~ .*~ ./src/*~ ./src/*# ./include/*~ ./include/*# *~
+	rm -f $(PROG) $(PROG_OMP) $(PROG_DEBUG) $(PROG_MEMCHECK) $(PROG_CALLGRIND) $(PROG_CACHEGRIND)
+	rm -f valgrind_*.txt
+	rm -f cachegrind.out.*
+	rm -f callgrind.*
+	rm -f *bin
diff --git a/jacobi/openmp/not_opt/include/allvars.h b/jacobi/openmp/not_opt/include/allvars.h
new file mode 100644
index 0000000000000000000000000000000000000000..6ffdbdae9e5b89a38a5dc6747bbd73edecb77ba3
--- /dev/null
+++ b/jacobi/openmp/not_opt/include/allvars.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#define NGHOST   1
+#define NDIM     2 /* 2D */
+#define X        0
+#define Y        1
+#define TOL      1.e-5
+
+typedef double MyData;
+
+#if defined(_OPENMP)
+
+#include <omp.h>
+
+#else
+   
+#define omp_get_thread_num()  0
+#define omp_get_num_threads() 1
+
+#endif /* _OPENMP */
diff --git a/jacobi/openmp/not_opt/include/tools.h b/jacobi/openmp/not_opt/include/tools.h
new file mode 100644
index 0000000000000000000000000000000000000000..82859ec511710474547355ede3fc38cf33a92154
--- /dev/null
+++ b/jacobi/openmp/not_opt/include/tools.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "allvars.h"
+#include <assert.h>
+#include <sys/time.h>
+#include <time.h>
+#include <sys/stat.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/* function prototypes */
+MyData **Allocate_2DdblArray(const int nx, const int ny);
+void Show_2DdblArray(const MyData **const A,
+                     const int            nx,
+                     const int            ny,
+                     const char *const    string);
+
+double seconds(void);
diff --git a/jacobi/openmp/not_opt/make.def b/jacobi/openmp/not_opt/make.def
new file mode 100644
index 0000000000000000000000000000000000000000..ea4ceb7059794adfebd9bf8a956567451d1eb879
--- /dev/null
+++ b/jacobi/openmp/not_opt/make.def
@@ -0,0 +1,53 @@
+CC     ?= gcc
+CFLAGS ?= -Wall -Wextra -march=native
+OMP    ?= -fopenmp
+LIBS   ?= -lm
+
+SYSTYPE = $(strip $(shell uname -n))
+
+PROG           ?= jacobi_serial_not_opt_$(SYSTYPE)
+PROG_OMP       ?= jacobi_omp_not_opt_$(SYSTYPE)
+PROG_DEBUG      = $(PROG_OMP)_DEBUG
+PROG_MEMCHECK   = $(PROG_OMP)_MEMCHECK
+PROG_CALLGRIND  = $(PROG_OMP)_CALLGRIND
+PROG_CACHEGRIND = $(PROG_OMP)_CACHEGRIND
+
+HEADERS       = $(wildcard ./include/*.h)
+SOURCES       = $(wildcard ./src/*.c)
+DEPENDENCIES  = $(SOURCES) $(HEADERS) Makefile
+
+$(PROG): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -O3 -I./include $(SOURCES) -o $@ $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_OMP): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -O3 $(OMP) -I./include $(SOURCES) -o $@ $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_OMP) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_DEBUG): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -Og -ggdb3 $(OMP) -fno-omit-frame-pointer -I./include $(SOURCES) -o $@ $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_DEBUG) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_MEMCHECK): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -Og $(OMP) -I./include $(SOURCES) -o $@ $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_MEMCHECK) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_CALLGRIND): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -g -O3 $(OMP) -I./include $(SOURCES) -o $@ $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_CALLGRIND) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_CACHEGRIND): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -g -O3 $(OMP) -I./include $(SOURCES) -o $@ $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_CACHEGRIND) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
diff --git a/jacobi/openmp/not_opt/src/jacobi_2D_omp_not_opt.c b/jacobi/openmp/not_opt/src/jacobi_2D_omp_not_opt.c
new file mode 100644
index 0000000000000000000000000000000000000000..a43c5bbcb2f75e956b8933784327773c9ce94e46
--- /dev/null
+++ b/jacobi/openmp/not_opt/src/jacobi_2D_omp_not_opt.c
@@ -0,0 +1,309 @@
+/* ///////////////////////////////////////////////////////////////////// */
+/* Authors:  A. Mignone (mignone@to.infn.it)                             */
+/*           V. Cesare  (valentina.cesare@inaf.it)                       */
+/*           D. Goz     (david.goz@inaf.it)                              */
+/*                                                                       */
+/* Date   : June 2024                                                    */
+/*                                                                       */
+/* ///////////////////////////////////////////////////////////////////// */
+
+#include "allvars.h"
+#include "tools.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+/* function prototypes */
+void BoundaryConditions(MyData **const restrict,
+			MyData  *const restrict,
+			MyData  *const restrict,
+			const int,
+			const int);
+
+void JacobiAlgorithm(MyData **const restrict, MyData **const restrict, MyData *const restrict,
+		     const MyData *const restrict, int, int, int, int, const int);
+
+void WriteSolution(MyData **const phi, const int nx, const int ny);
+
+void copy_grids(MyData **const restrict A,
+                MyData **const restrict B,
+                const int               xbeg,
+                const int               xend,
+                const int               ybeg,
+		const int               yend);
+
+int main(int argc, char **argv)
+{
+#if defined(_OPENMP)  
+  if (argc <= 3)
+    {
+      printf("\n\t Usage: <executable> <x_grid_size> <y_grid_size> <omp threads>\n\n");
+      exit(EXIT_FAILURE);
+    }
+#else
+  if (argc <= 2)    
+    {
+      printf("\n\t Usage: <executable> <x_grid_size> <y_grid_size>\n\n");
+      exit(EXIT_FAILURE);
+    }
+#endif /* _OPENMP */
+  
+  /* global X and Y grid size */
+  const int NX_GLOB = (int) strtol(argv[1], NULL, 10);
+  const int NY_GLOB = (int) strtol(argv[2], NULL, 10);
+#if defined(_OPENMP)
+  const int Nthread = (int) strtol(argv[3], NULL, 10);
+#else
+  const int Nthread = 1;
+#endif /* _OPENMP */
+  
+  const MyData xbeg = 0.0;
+  const MyData xend = 1.0;
+  const MyData ybeg = 0.0;
+  const MyData yend = 1.0;
+
+  const MyData delta[NDIM] = {(xend - xbeg)/(NX_GLOB + 1),
+			      (yend - ybeg)/(NY_GLOB + 1)};
+
+  /* --------------------------------------------------------
+     1. Set grid indices
+     -------------------------------------------------------- */
+    
+  const int ibeg   = NGHOST;
+  const int iend   = ibeg + NX_GLOB - 1;
+  const int nx     = iend - ibeg + 1;
+  const int nx_tot = nx + 2 * NGHOST;
+    
+  const int jbeg   = NGHOST;
+  const int jend   = jbeg + NY_GLOB - 1;
+  const int ny     = jend - jbeg + 1;
+  const int ny_tot = ny + 2 * NGHOST;
+
+  printf("\n\t Grid indices:");
+  printf("\n\t\t ibeg, iend = %d, %d; nx_tot = %d"    ,ibeg, iend, nx_tot);
+  printf("\n\t\t jbeg, jend = %d, %d; ny_tot = %d\n\n",jbeg, jend, ny_tot);
+    
+  /* --------------------------------------------------------
+     2. Generate grid, allocate memory
+     -------------------------------------------------------- */
+
+  /* memory allocation */
+  MyData *xg = (MyData *) malloc((NX_GLOB + 2*NGHOST) * sizeof(MyData));
+  MyData *yg = (MyData *) malloc((NY_GLOB + 2*NGHOST) * sizeof(MyData));
+  assert((xg != NULL) && (yg != NULL));
+
+  /* initial conditions */
+  for (int i=0 ; i<(NX_GLOB + 2*NGHOST) ; i++) xg[i] = xbeg + (i - ibeg + 1) * delta[X];
+  for (int j=0 ; j<(NY_GLOB + 2*NGHOST) ; j++) yg[j] = ybeg + (j - jbeg + 1) * delta[Y];
+  MyData *x = xg; /* Global and local grids are the same  */
+  MyData *y = yg; /* for serial version of the code       */
+
+  /* grids memory allocation */
+  MyData **phi  = Allocate_2DdblArray(ny_tot, nx_tot);
+  MyData **phi0 = Allocate_2DdblArray(ny_tot, nx_tot);
+    
+  /* --------------------------------------------------------
+     3. Initialize solution array to 0
+     -------------------------------------------------------- */
+    
+  for (int j=jbeg ; j<=jend ; j++)
+    for (int i=ibeg ; i<=iend ; i++)
+      phi0[j][i] = 0.0;
+  
+  /* --------------------------------------------------------
+     4. Main iteration cycle
+     -------------------------------------------------------- */
+
+  const double time_start = seconds();
+
+  MyData err = 1.0;
+  /* iterations */
+  int k = 0;
+  while (err > TOL)
+    {        
+      /* -- 4a. Set boundary conditions first -- */
+        
+      BoundaryConditions(phi0, x, y, nx, ny);
+        
+      /* -- 4b. Jacobi's method and residual (interior points) -- */
+      /*       core algorithm                                     */
+      
+      err = 0.0;
+      JacobiAlgorithm(phi, phi0, &err, delta,
+		      ibeg, iend, jbeg, jend, Nthread);
+      
+      /* -- 4c. Copy grids --*/
+      copy_grids(phi0, phi,
+                 jbeg, jend,
+                 ibeg, iend);
+
+      printf ("\n\t Iteration = %d - err = %lg\n",k, err);
+
+      /* increase the counter of loop iterations */
+      k++;
+    }
+    
+  WriteSolution(phi, nx, ny);
+
+  printf("\n\t NX_GLOB x NY_GLOB = %d x %d\n", NX_GLOB, NY_GLOB);
+  printf("\n\t Time = %lf [s]\n\n", seconds() - time_start);
+
+  // free memory
+  if (phi0)
+    {
+      free(phi0[0]);
+      free(phi0);
+    }
+
+  if (phi)
+    {
+      free(phi[0]);
+      free(phi);
+    }
+
+  if (yg)
+    free(yg);
+  
+  if (xg)
+    free(xg);
+  
+  return 0;
+}
+
+/* ********************************************************************* */
+void BoundaryConditions(MyData **const restrict phi,
+			MyData  *const restrict x,
+			MyData  *const restrict y,
+                        const int               nx,
+			const int               ny)
+/*
+*********************************************************************** */
+{
+  const int ibeg = NGHOST;
+  const int iend = ibeg + nx - 1;
+    
+  const int jbeg = NGHOST;
+  const int jend = jbeg + ny - 1;
+
+  int i,j;
+  
+  /* -- Left -- */
+  i = ibeg - 1;
+  for (int j=jbeg ; j<=jend ; j++)
+    phi[j][i] = (1.0 - y[j]);
+    
+  /* -- Right -- */
+  i = iend + 1;
+  for (int j=jbeg ; j<=jend ; j++)
+    phi[j][i] = (y[j] * y[j]);
+    
+  /* -- Bottom -- */    
+  j = jbeg - 1;
+  for (int i=ibeg ; i<=iend ; i++)
+    phi[j][i] = (1.0 - x[i]);
+    
+  /* -- Top -- */
+  j = jend + 1;
+  for (int i=ibeg ; i<=iend ; i++)
+    phi[j][i] = x[i];
+
+  return;
+}
+
+/* ********************************************************************* */
+
+void JacobiAlgorithm(MyData      **const restrict Phi,
+		     MyData      **const restrict Phi0,
+		     MyData       *const restrict error,
+		     const MyData *const restrict delta,
+		           int                    ibeg,
+		           int                    iend,
+		           int                    jbeg,
+		           int                    jend,
+		     const int                    Nthreads)
+{
+  *error = 0.0;
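+
+  /* NOTE: the residual is accumulated inside an omp critical section, which  */
+  /* serializes the update (hence "not optimized"); the optimized version     */
+  /* (jacobi_2D_omp_opt.c) uses a reduction clause instead                    */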
+
+#pragma omp parallel default(none) shared(error, delta, Phi, Phi0)      \
+                                   firstprivate(ibeg, iend, jbeg, jend) \
+                                   num_threads(Nthreads)
+  {
+#if defined(DEBUG)
+    const int id  = omp_get_thread_num();
+    const int thr = omp_get_num_threads();
+    printf("\n\t I am the omp thread %d out of %d", id, thr);
+    #pragma omp barrier
+#endif /* DEBUG */
+
+    #pragma omp for schedule(static)
+    for (int j=jbeg ; j<=jend ; j++)
+      {
+	for (int i=ibeg ; i<=iend ; i++)
+	  {
+	    Phi[j][i] = 0.25 * (Phi0[j][i-1] + Phi0[j][i+1] +
+				Phi0[j-1][i] + Phi0[j+1][i]);
+
+	    #pragma omp critical
+	    {
+	      *error += delta[X] * delta[Y] * fabs(Phi[j][i] - Phi0[j][i]);
+	    }
+	  } /* loop over columns */
+      } /* loop over rows */
+  } /* omp parallel */
+
+  return;
+}
+
+/* ********************************************************************* */
+
+/* ********************************************************************* */
+void copy_grids(MyData **const restrict A,
+		MyData **const restrict B,
+		const int               xbeg,
+		const int               xend,
+	        const int               ybeg,
+		const int               yend)
+{
+  for (int i=xbeg ; i<=xend ; i++)
+    for (int j=ybeg ; j<=yend ; j++)
+      A[i][j] = B[i][j];
+  
+  return;
+}
+/*
+************************************************************************ */
+
+
+/* ********************************************************************* */
+void WriteSolution (MyData **const phi,
+		    const int      nx,
+		    const int      ny)
+/*
+*********************************************************************** */
+{
+  const int ibeg = NGHOST;    
+  const int jbeg = NGHOST;
+  const int jend = jbeg + ny - 1;
+    
+  static int nfile = 0;  /* File counter */
+
+  char fname[32];
+#if defined(_OPENMP)
+  sprintf(fname,"jacobi2D_omp_not_opt_%02d.bin", nfile);
+#else
+  sprintf(fname,"jacobi2D_serial_not_opt_%02d.bin", nfile);
+#endif /* _OPENMP */
+  
+  FILE *fp;
+  printf ("> Writing %s\n",fname);
+  fp = fopen(fname, "wb");
+    
+  for (int j=jbeg ; j<=jend ; j++)
+    {
+      fwrite (phi[j] + ibeg, sizeof(MyData), nx, fp);
+    }
+    
+  nfile++;
+  fclose(fp);
+}
diff --git a/jacobi/openmp/not_opt/src/tools.c b/jacobi/openmp/not_opt/src/tools.c
new file mode 100644
index 0000000000000000000000000000000000000000..be2761d170c23fde4d5d4aeda54af00e56abf3ce
--- /dev/null
+++ b/jacobi/openmp/not_opt/src/tools.c
@@ -0,0 +1,59 @@
+/* ///////////////////////////////////////////////////////////////////// */
+/* Authors:  A. Mignone (mignone@to.infn.it)                             */
+/*           V. Cesare  (valentina.cesare@inaf.it)                       */
+/*           D. Goz     (david.goz@inaf.it)                              */
+/*                                                                       */
+/* Date   : June 2024                                                    */
+/*                                                                       */
+/* ///////////////////////////////////////////////////////////////////// */
+
+#include "tools.h"
+
+/* ********************************************************************* */
+MyData **Allocate_2DdblArray(const int nx, const int ny)
+/*
+ * Allocate memory for a double precision array with
+ * nx rows and ny columns
+ *********************************************************************** */
+{
+  MyData **buf = malloc(nx * sizeof(MyData *));
+  assert(buf != NULL);
+
+  buf[0] = (MyData *) malloc(nx * ny * sizeof(MyData));
+  assert(buf[0] != NULL);
+  
+  for (int j=1 ; j<nx ; j++)
+    buf[j] = buf[j-1] + ny;
+    
+  return buf;
+}
+
+/* ********************************************************************* */
+void Show_2DdblArray(const MyData **const A,
+		     const int            nx,
+		     const int            ny,
+		     const char *const    string)
+/* *********************************************************************** */
+{
+  printf ("%s\n",string);
+  printf ("------------------------------\n");
+  for (int i=0 ; i<nx ; i++)
+    {
+      for (int j=0 ; j<ny ; j++)
+	{
+	  printf ("%8.2f  ", A[i][j]);
+	}
+      printf ("\n");
+    }
+  printf ("------------------------------\n");
+}
+/* ********************************************************************* */
+
+double seconds()
+{
+  struct timeval tmp;
+  gettimeofday(&tmp, (struct timezone *)0);
+  double sec = tmp.tv_sec + ((double)tmp.tv_usec)/1000000.0;
+
+  return sec;
+}
diff --git a/jacobi/openmp/opt/Makefile b/jacobi/openmp/opt/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..f1dca1984b87035b5b6a384244aaae3497018b6e
--- /dev/null
+++ b/jacobi/openmp/opt/Makefile
@@ -0,0 +1,65 @@
+#######################################################################
+# Author: David Goz (david.goz@inaf.it)                               #
+# June  2024                                                          #
+#######################################################################
+#
+# To see all the compilation options
+# $ make info
+#######################################################################
+
+# make.def defines how the application is compiled
+include make.def
+#######################################################################
+
+.PHONY: info serial omp valgrind_memcheck valgrind_callgrind valgrind_cachegrind debug clean 
+
+info:
+	@echo ' '
+	@echo '--------------------------------------------------------------------------------------------'
+	@echo '$$ make serial                 ---> compile the serial application                          '
+	@echo '$$ make omp                    ---> compile the omp application                             '
+	@echo '$$ make debug                  ---> debug the omp application                               '
+	@echo '$$ make valgrind_memcheck      ---> run the omp application using Valgrind under Memcheck   '
+	@echo '$$ make valgrind_callgrind     ---> run the omp application using Valgrind under Callgrind  '
+	@echo '$$ make valgrind_cachegrind    ---> run the omp application using Valgrind under Cachegrind '
+	@echo '$$ make clean                  ---> clean up all                                            '
+	@echo '$$ make info                   ---> get make info                                           '
+	@echo '--------------------------------------------------------------------------------------------'
+	@echo ' '
+
+serial: $(PROG)
+
+omp: $(PROG_OMP)
+
+debug: $(PROG_DEBUG)
+	@echo 'OOOooo... debugging ...oooOOO'
+	gdb --args ./$<
+	@echo 'OOOooo... debugging ... oooOOO'
+
+valgrind_memcheck: $(PROG_MEMCHECK)
+	@echo 'oooOOO... valgrind_memcheck ...OOOooo'
+	valgrind --tool=memcheck -s --leak-check=full --show-leak-kinds=all --track-origins=yes --read-var-info=yes --log-file=valgrind_memcheck_log_%p.txt ./$< 10 10 2
+	@echo 'oooOOO... valgrind_memcheck ...OOOooo'
+
+valgrind_callgrind: $(PROG_CALLGRIND)
+	@echo 'oooOOO... valgrind_callgrind ...OOOooo'
+	valgrind --tool=callgrind --dump-instr=yes --collect-jumps=yes --log-file=valgrind_callgrind_log_.%p.txt ./$< 128 128 2
+	@echo ' '
+	@echo 'To generate a function-by-function summary from the profile data file:'
+	@echo '$$ callgrind_annotate --auto=yes callgrind.out.<pid> | less'
+	@echo '(kcachegrind is required in order to visualize the output using the GUI)'
+
+valgrind_cachegrind: $(PROG_CACHEGRIND)
+	@echo 'oooOOO... valgrind_cachegrind ...OOOooo'
+	valgrind --tool=cachegrind --log-file=valgrind_cachegrind_log_.%p.txt ./$< 128 128 2
+	@echo '$$ cg_annotate --auto=yes cachegrind.out.<pid> | less'
+	@echo '(kcachegrind is required in order to visualize the output using the GUI)'
+	@echo 'oooOOO... valgrind_cachegrind ...OOOooo'
+
+clean:
+	rm -f *~ .*~ ./src/*~ ./src/*# ./include/*~ ./include/*# *~
+	rm -f $(PROG) $(PROG_OMP) $(PROG_DEBUG) $(PROG_MEMCHECK) $(PROG_CALLGRIND) $(PROG_CACHEGRIND)
+	rm -f valgrind_*.txt
+	rm -f cachegrind.out.*
+	rm -f callgrind.*
+	rm -f *bin
diff --git a/jacobi/openmp/opt/include/allvars.h b/jacobi/openmp/opt/include/allvars.h
new file mode 100644
index 0000000000000000000000000000000000000000..468de4a19bebf2c0d5c59cd24ad3aab94b8df0f6
--- /dev/null
+++ b/jacobi/openmp/opt/include/allvars.h
@@ -0,0 +1,9 @@
+#pragma once
+
+#define NGHOST   1
+#define NDIM     2 /* 2D */
+#define X        0
+#define Y        1
+#define TOL      1.e-5
+
+typedef double MyData;
diff --git a/jacobi/openmp/opt/include/tools.h b/jacobi/openmp/opt/include/tools.h
new file mode 100644
index 0000000000000000000000000000000000000000..82859ec511710474547355ede3fc38cf33a92154
--- /dev/null
+++ b/jacobi/openmp/opt/include/tools.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "allvars.h"
+#include <assert.h>
+#include <sys/time.h>
+#include <time.h>
+#include <sys/stat.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/* function prototypes */
+MyData **Allocate_2DdblArray(const int nx, const int ny);
+void Show_2DdblArray(const MyData **const A,
+                     const int            nx,
+                     const int            ny,
+                     const char *const    string);
+
+double seconds(void);
diff --git a/jacobi/openmp/opt/make.def b/jacobi/openmp/opt/make.def
new file mode 100644
index 0000000000000000000000000000000000000000..a5e01e252c90e4ad677132cbfbe3b7e0a9885b1e
--- /dev/null
+++ b/jacobi/openmp/opt/make.def
@@ -0,0 +1,53 @@
+CC     ?= gcc
+CFLAGS ?= -Wall -Wextra -march=native
+OMP    ?= -fopenmp
+LIBS   ?= -lm
+
+SYSTYPE = $(strip $(shell uname -n))
+
+PROG           ?= jacobi_serial_opt_$(SYSTYPE)
+PROG_OMP       ?= jacobi_omp_opt_$(SYSTYPE)
+PROG_DEBUG      = $(PROG_OMP)_DEBUG
+PROG_MEMCHECK   = $(PROG_OMP)_MEMCHECK
+PROG_CALLGRIND  = $(PROG_OMP)_CALLGRIND
+PROG_CACHEGRIND = $(PROG_OMP)_CACHEGRIND
+
+HEADERS       = $(wildcard ./include/*.h)
+SOURCES       = $(wildcard ./src/*.c)
+DEPENDENCIES  = $(SOURCES) $(HEADERS) Makefile
+
+$(PROG): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -O3 -I./include $(SOURCES) -o $@ $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_OMP): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -O3 $(OMP) -I./include $(SOURCES) -o $@ $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_OMP) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_DEBUG): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -Og -ggdb3 $(OMP) -fno-omit-frame-pointer -I./include $(SOURCES) -o $@ $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_DEBUG) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_MEMCHECK): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -Og $(OMP) -I./include $(SOURCES) -o $@ $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_MEMCHECK) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_CALLGRIND): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -g -O3 $(OMP) -I./include $(SOURCES) -o $@ $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_CALLGRIND) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_CACHEGRIND): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -g -O3 $(OMP) -I./include $(SOURCES) -o $@ $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_CACHEGRIND) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
diff --git a/jacobi/openmp/opt/src/jacobi_2D_omp_opt.c b/jacobi/openmp/opt/src/jacobi_2D_omp_opt.c
new file mode 100644
index 0000000000000000000000000000000000000000..2a47ed64cf5b577ef13eb78610e14b7d3677d9eb
--- /dev/null
+++ b/jacobi/openmp/opt/src/jacobi_2D_omp_opt.c
@@ -0,0 +1,277 @@
+/* ///////////////////////////////////////////////////////////////////// */
+/* Authors:  A. Mignone (mignone@to.infn.it)                             */
+/*           V. Cesare  (valentina.cesare@inaf.it)                       */
+/*           D. Goz     (david.goz@inaf.it)                              */
+/*                                                                       */
+/* Date   : June 2024                                                    */
+/*                                                                       */
+/* ///////////////////////////////////////////////////////////////////// */
+
+#include "allvars.h"
+#include "tools.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+
+/* function prototypes */
+void BoundaryConditions(MyData **const restrict,
+			MyData  *const restrict,
+			MyData  *const restrict,
+			const int,
+			const int);
+
+void JacobiAlgorithm(MyData **const restrict, MyData **const restrict, MyData *const restrict,
+		     const MyData *const restrict, const int, const int, const int, const int, const int);
+
+void WriteSolution(MyData **const phi, const int nx, const int ny);
+
+int main(int argc, char **argv)
+{
+#if defined(_OPENMP)  
+  if (argc <= 3)
+    {
+      printf("\n\t Usage: <executable> <x_grid_size> <y_grid_size> <omp threads>\n\n");
+      exit(EXIT_FAILURE);
+    }
+#else
+  if (argc <= 2)    
+    {
+      printf("\n\t Usage: <executable> <x_grid_size> <y_grid_size>\n\n");
+      exit(EXIT_FAILURE);
+    }
+#endif /* _OPENMP */
+
+  /* global X and Y grid size */
+  const int NX_GLOB = (int) strtol(argv[1], NULL, 10);
+  const int NY_GLOB = (int) strtol(argv[2], NULL, 10);
+#if defined(_OPENMP)
+  const int Nthread = (int) strtol(argv[3], NULL, 10);
+#else
+  const int Nthread = 1;
+#endif /* _OPENMP */
+
+  const MyData xbeg = 0.0;
+  const MyData xend = 1.0;
+  const MyData ybeg = 0.0;
+  const MyData yend = 1.0;
+
+  const MyData delta[NDIM] = {(xend - xbeg)/(NX_GLOB + 1),
+			      (yend - ybeg)/(NY_GLOB + 1)};
+
+  /* --------------------------------------------------------
+     1. Set grid indices
+     -------------------------------------------------------- */
+    
+  const int ibeg   = NGHOST;
+  const int iend   = ibeg + NX_GLOB - 1;
+  const int nx     = iend - ibeg + 1;
+  const int nx_tot = nx + 2 * NGHOST;
+    
+  const int jbeg   = NGHOST;
+  const int jend   = jbeg + NY_GLOB - 1;
+  const int ny     = jend - jbeg + 1;
+  const int ny_tot = ny + 2 * NGHOST;
+
+  printf("\n\t Grid indices:");
+  printf("\n\t\t ibeg, iend = %d, %d; nx_tot = %d"    ,ibeg, iend, nx_tot);
+  printf("\n\t\t jbeg, jend = %d, %d; ny_tot = %d\n\n",jbeg, jend, ny_tot);
+    
+  /* --------------------------------------------------------
+     2. Generate grid, allocate memory
+     -------------------------------------------------------- */
+
+  /* memory allocation */
+  MyData *xg = (MyData *) malloc((NX_GLOB + 2*NGHOST) * sizeof(MyData));
+  MyData *yg = (MyData *) malloc((NY_GLOB + 2*NGHOST) * sizeof(MyData));
+  assert((xg != NULL) && (yg != NULL));
+
+  /* initial conditions */
+  for (int i=0 ; i<(NX_GLOB + 2*NGHOST) ; i++) xg[i] = xbeg + (i - ibeg + 1) * delta[X];
+  for (int j=0 ; j<(NY_GLOB + 2*NGHOST) ; j++) yg[j] = ybeg + (j - jbeg + 1) * delta[Y];
+  MyData *x = xg; /* Global and local grids are the same  */
+  MyData *y = yg; /* for serial version of the code       */
+
+  /* grids memory allocation */
+  MyData **phi  = Allocate_2DdblArray(ny_tot, nx_tot);
+  MyData **phi0 = Allocate_2DdblArray(ny_tot, nx_tot);
+    
+  /* --------------------------------------------------------
+     3. Initialize solution array to 0
+     -------------------------------------------------------- */
+    
+  for (int j=jbeg ; j<=jend ; j++)
+    for (int i=ibeg ; i<=iend ; i++)
+      phi0[j][i] = 0.0;
+  
+  /* --------------------------------------------------------
+     4. Main iteration cycle
+     -------------------------------------------------------- */
+
+  const double time_start = seconds();
+
+  /* -- 4a. Set boundary conditions first -- */
+  
+  BoundaryConditions(phi0, x, y, nx, ny);
+  BoundaryConditions(phi, x, y, nx, ny);
+
+  MyData err = 1.0;
+  /* iterations */
+  int k = 0;
+  while (1)
+    {        
+      /* -- 4b. Jacobi's method and residual (interior points) -- */
+      /*       core algorithm                                     */
+      
+      err = 0.0;
+      JacobiAlgorithm(phi, phi0, &err, delta,
+		      ibeg, iend, jbeg, jend, Nthread);
+
+      printf ("\n\t Iteration = %d - err = %lg\n",k, err);
+
+      /* increase the counter of loop iterations */
+      k++;
+
+      /* check convergence */
+      if (err <= TOL)
+	break;
+      
+      /* swap the pointers */
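+      /* (this avoids copying the whole grid at every iteration,       */
+      /*  unlike copy_grids in the not-optimized version of the code)  */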
+      MyData **tmp = phi;
+      phi = phi0;
+      phi0 = tmp;
+    }
+    
+  WriteSolution(phi, nx, ny);
+
+  printf("\n\t NX_GLOB x NY_GLOB = %d x %d\n", NX_GLOB, NY_GLOB);
+  printf("\n\t Time = %lf [s]\n\n", seconds() - time_start);
+
+  // free memory
+  if (phi0)
+    {
+      free(phi0[0]);
+      free(phi0);
+    }
+
+  if (phi)
+    {
+      free(phi[0]);
+      free(phi);
+    }
+
+  if (yg)
+    free(yg);
+  
+  if (xg)
+    free(xg);
+  
+  return 0;
+}
+
+/* ********************************************************************* */
+void BoundaryConditions(MyData **const restrict phi,
+			MyData  *const restrict x,
+			MyData  *const restrict y,
+                        const int               nx,
+			const int               ny)
+/*
+*********************************************************************** */
+{
+  const int ibeg = NGHOST;
+  const int iend = ibeg + nx - 1;
+    
+  const int jbeg = NGHOST;
+  const int jend = jbeg + ny - 1;
+
+  int i,j;
+  
+  /* -- Left -- */
+  i = ibeg - 1;
+  for (int j=jbeg ; j<=jend ; j++)
+    phi[j][i] = (1.0 - y[j]);
+    
+  /* -- Right -- */
+  i = iend + 1;
+  for (int j=jbeg ; j<=jend ; j++)
+    phi[j][i] = (y[j] * y[j]);
+    
+  /* -- Bottom -- */    
+  j = jbeg - 1;
+  for (int i=ibeg ; i<=iend ; i++)
+    phi[j][i] = (1.0 - x[i]);
+    
+  /* -- Top -- */
+  j = jend + 1;
+  for (int i=ibeg ; i<=iend ; i++)
+    phi[j][i] = x[i];
+
+  return;
+}
+
+/* ********************************************************************* */
+
+void JacobiAlgorithm(MyData      **const restrict Phi,
+		     MyData      **const restrict Phi0,
+		     MyData       *const restrict error,
+		     const MyData *const restrict delta,
+		     const int                    ibeg,
+		     const int                    iend,
+		     const int                    jbeg,
+		     const int                    jend,
+		     const int                    Nthreads)
+{
+  double err = 0.0;
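+
+  /* each thread accumulates the residual into a private copy of err; */
+  /* the private copies are summed when the parallel loop ends        */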
+
+#pragma omp parallel for collapse(2) schedule(static) reduction(+: err) num_threads(Nthreads)
+  for (int j=jbeg ; j<=jend ; j++)
+    {
+      for (int i=ibeg ; i<=iend ; i++)
+	{
+	  Phi[j][i] = 0.25 * (Phi0[j][i-1] + Phi0[j][i+1] +
+			      Phi0[j-1][i] + Phi0[j+1][i]);
+                
+	  err += delta[X] * delta[Y] * fabs(Phi[j][i] - Phi0[j][i]);
+	} /* loop over columns */
+    } /* loop over rows */
+
+  *error = err;
+  
+  return;
+}
+
+/* ********************************************************************* */
+
+/* ********************************************************************* */
+void WriteSolution (MyData **const phi,
+		    const int      nx,
+		    const int      ny)
+/*
+*********************************************************************** */
+{
+  const int ibeg = NGHOST;    
+  const int jbeg = NGHOST;
+  const int jend = jbeg + ny - 1;
+    
+  static int nfile = 0;  /* File counter */
+
+  char fname[32];
+#if defined(_OPENMP)
+  sprintf(fname,"jacobi2D_omp_opt_%02d.bin", nfile);
+#else
+  sprintf(fname,"jacobi2D_serial_opt_%02d.bin", nfile);
+#endif /* _OPENMP */
+    
+  FILE *fp;
+  printf ("> Writing %s\n",fname);
+  fp = fopen(fname, "wb");
+    
+  for (int j=jbeg ; j<=jend ; j++)
+    {
+      fwrite (phi[j] + ibeg, sizeof(MyData), nx, fp);
+    }
+    
+  nfile++;
+  fclose(fp);
+}
diff --git a/jacobi/openmp/opt/src/tools.c b/jacobi/openmp/opt/src/tools.c
new file mode 100644
index 0000000000000000000000000000000000000000..be2761d170c23fde4d5d4aeda54af00e56abf3ce
--- /dev/null
+++ b/jacobi/openmp/opt/src/tools.c
@@ -0,0 +1,59 @@
+/* ///////////////////////////////////////////////////////////////////// */
+/* Authors:  A. Mignone (mignone@to.infn.it)                             */
+/*           V. Cesare  (valentina.cesare@inaf.it)                       */
+/*           D. Goz     (david.goz@inaf.it)                              */
+/*                                                                       */
+/* Date   : June 2024                                                    */
+/*                                                                       */
+/* ///////////////////////////////////////////////////////////////////// */
+
+#include "tools.h"
+
+/* ********************************************************************* */
+MyData **Allocate_2DdblArray(const int nx, const int ny)
+/*
+ * Allocate memory for a double precision array with
+ * nx rows and ny columns
+ *********************************************************************** */
+{
+  MyData **buf = malloc(nx * sizeof(MyData *));
+  assert(buf != NULL);
+
+  buf[0] = (MyData *) malloc(nx * ny * sizeof(MyData));
+  assert(buf[0] != NULL);
+  
+  for (int j=1 ; j<nx ; j++)
+    buf[j] = buf[j-1] + ny;
+    
+  return buf;
+}
+
+/* ********************************************************************* */
+void Show_2DdblArray(const MyData **const A,
+		     const int            nx,
+		     const int            ny,
+		     const char *const    string)
+/* *********************************************************************** */
+{
+  printf ("%s\n",string);
+  printf ("------------------------------\n");
+  for (int i=0 ; i<nx ; i++)
+    {
+      for (int j=0 ; j<ny ; j++)
+	{
+	  printf ("%8.2f  ", A[i][j]);
+	}
+      printf ("\n");
+    }
+  printf ("------------------------------\n");
+}
+/* ********************************************************************* */
+
+double seconds()
+{
+  struct timeval tmp;
+  gettimeofday(&tmp, (struct timezone *)0);
+  double sec = tmp.tv_sec + ((double)tmp.tv_usec)/1000000.0;
+
+  return sec;
+}
diff --git a/jacobi/serial/not_opt/Makefile b/jacobi/serial/not_opt/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..01dd641863b57c577e1dcc2de14db298269cbf62
--- /dev/null
+++ b/jacobi/serial/not_opt/Makefile
@@ -0,0 +1,62 @@
+#######################################################################
+# Author: David Goz (david.goz@inaf.it)                               #
+# June  2024                                                          #
+#######################################################################
+#
+# To see all the compilation options
+# $ make info
+#######################################################################
+
+# make.def defines how the application is compiled
+include make.def
+#######################################################################
+
+.PHONY: info serial valgrind_memcheck valgrind_callgrind valgrind_cachegrind debug clean 
+
+info:
+	@echo ' '
+	@echo '-----------------------------------------------------------------------------------------------'
+	@echo '$$ make serial                 ---> compile the serial application                             '
+	@echo '$$ make debug                  ---> debug the serial application                               '
+	@echo '$$ make valgrind_memcheck      ---> run the serial application using Valgrind under Memcheck   '
+	@echo '$$ make valgrind_callgrind     ---> run the serial application using Valgrind under Callgrind  '
+	@echo '$$ make valgrind_cachegrind    ---> run the serial application using Valgrind under Cachegrind '
+	@echo '$$ make clean                  ---> clean up all                                               '
+	@echo '$$ make info                   ---> get make info                                              '
+	@echo '-----------------------------------------------------------------------------------------------'
+	@echo ' '
+
+serial: $(PROG)
+
+debug: $(PROG_DEBUG)
+	@echo 'OOOooo... debugging ...oooOOO'
+	gdb --args ./$<
+	@echo 'OOOooo... debugging ... oooOOO'
+
+valgrind_memcheck: $(PROG_MEMCHECK)
+	@echo 'oooOOO... valgrind_memcheck ...OOOooo'
+	valgrind --tool=memcheck -s --leak-check=full --show-leak-kinds=all --track-origins=yes --read-var-info=yes --log-file=valgrind_memcheck_log_%p.txt ./$< 10 10
+	@echo 'oooOOO... valgrind_memcheck ...OOOooo'
+
+valgrind_callgrind: $(PROG_CALLGRIND)
+	@echo 'oooOOO... valgrind_callgrind ...OOOooo'
+	valgrind --tool=callgrind --dump-instr=yes --collect-jumps=yes --log-file=valgrind_callgrind_log_.%p.txt ./$< 128 128
+	@echo ' '
+	@echo 'To generate a function-by-function summary from the profile data file:'
+	@echo '$$ callgrind_annotate --auto=yes callgrind.out.<pid> | less'
+	@echo '(kcachegrind is required in order to visualize the output using the GUI)'
+
+valgrind_cachegrind: $(PROG_CACHEGRIND)
+	@echo 'oooOOO... valgrind_cachegrind ...OOOooo'
+	valgrind --tool=cachegrind --log-file=valgrind_cachegrind_log_.%p.txt ./$< 128 128
+	@echo '$$ cg_annotate --auto=yes cachegrind.out.<pid> | less'
+	@echo '(kcachegrind is required in order to visualize the output using the GUI)'
+	@echo 'oooOOO... valgrind_cachegrind ...OOOooo'
+
+clean:
+	rm -f *~ .*~ ./src/*~ ./src/*# ./include/*~ ./include/*# *~
+	rm -f $(PROG) $(PROG_DEBUG) $(PROG_MEMCHECK) $(PROG_CALLGRIND) $(PROG_CACHEGRIND)
+	rm -f valgrind_*.txt
+	rm -f cachegrind.out.*
+	rm -f callgrind.*
+	rm -f *bin
diff --git a/jacobi/serial/not_opt/include/allvars.h b/jacobi/serial/not_opt/include/allvars.h
new file mode 100644
index 0000000000000000000000000000000000000000..468de4a19bebf2c0d5c59cd24ad3aab94b8df0f6
--- /dev/null
+++ b/jacobi/serial/not_opt/include/allvars.h
@@ -0,0 +1,9 @@
+#pragma once
+
+#define NGHOST   1
+#define NDIM     2 /* 2D */
+#define X        0
+#define Y        1
+#define TOL      1.e-5
+
+typedef double MyData;
diff --git a/jacobi/serial/not_opt/include/tools.h b/jacobi/serial/not_opt/include/tools.h
new file mode 100644
index 0000000000000000000000000000000000000000..82859ec511710474547355ede3fc38cf33a92154
--- /dev/null
+++ b/jacobi/serial/not_opt/include/tools.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "allvars.h"
+#include <assert.h>
+#include <sys/time.h>
+#include <time.h>
+#include <sys/stat.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/* function prototypes */
+MyData **Allocate_2DdblArray(const int nx, const int ny);
+void Show_2DdblArray(const MyData **const A,
+                     const int            nx,
+                     const int            ny,
+                     const char *const    string);
+
+double seconds(void);
diff --git a/jacobi/serial/not_opt/make.def b/jacobi/serial/not_opt/make.def
new file mode 100644
index 0000000000000000000000000000000000000000..86d495638cbbd34c831b616944ed89375e47a43a
--- /dev/null
+++ b/jacobi/serial/not_opt/make.def
@@ -0,0 +1,45 @@
+CC     ?= gcc
+CFLAGS ?= -Wall -Wextra -march=native
+LIBS   ?= -lm
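+# the variables above can be overridden from the command line,
+# e.g. (hypothetical): $ make serial CC=clang CFLAGS="-Wall -O2"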
+
+SYSTYPE = $(strip $(shell uname -n))
+
+PROG           ?= jacobi_serial_not_opt_$(SYSTYPE)
+PROG_DEBUG      = $(PROG)_DEBUG
+PROG_MEMCHECK   = $(PROG)_MEMCHECK
+PROG_CALLGRIND  = $(PROG)_CALLGRIND
+PROG_CACHEGRIND = $(PROG)_CACHEGRIND
+
+HEADERS       = $(wildcard ./include/*.h)
+SOURCES       = $(wildcard ./src/*.c)
+DEPENDENCIES  = $(SOURCES) $(HEADERS) Makefile
+
+$(PROG): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -O3 -I./include $(SOURCES) -o $@ $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_DEBUG): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -Og -ggdb3 -fno-omit-frame-pointer -I./include $(SOURCES) -o $@ $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_DEBUG) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_MEMCHECK): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -Og -I./include $(SOURCES) -o $@ $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_MEMCHECK) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_CALLGRIND): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -g -O3 -I./include $(SOURCES) -o $@ $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_CALLGRIND) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_CACHEGRIND): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -g -O3 -I./include $(SOURCES) -o $@ $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_CACHEGRIND) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
diff --git a/jacobi/serial/not_opt/script/README b/jacobi/serial/not_opt/script/README
new file mode 100644
index 0000000000000000000000000000000000000000..af3694a9dffe54c3a06ea8f1b574e09613071cf3
--- /dev/null
+++ b/jacobi/serial/not_opt/script/README
@@ -0,0 +1,7 @@
+This folder contains bash scripts used to test
+the C application while varying the number of
+MPI tasks, OMP threads, and particles.
+
+- parameters are set in 'input_parameters'
+- the executable of the application should reside in the parent directory
+- run with: $ ./run.sh
\ No newline at end of file
diff --git a/jacobi/serial/not_opt/script/input_parameters b/jacobi/serial/not_opt/script/input_parameters
new file mode 100644
index 0000000000000000000000000000000000000000..8d6804aece28848578979767e3f3247e669e6095
--- /dev/null
+++ b/jacobi/serial/not_opt/script/input_parameters
@@ -0,0 +1,23 @@
+##########################################################################
+####################### paramfile ########################################
+
+# set the template of the paramfile
+TEMPLATE="paramfile_template.txt"
+# set the name of the paramfile used for each run
+PARAMFILE="paramfile_JUWELS.txt"
+
+##########################################################################
+####################### run parameters ###################################
+
+# Delete output files (Yes if 1)
+DELETE_OUTPUT_FILES=0
+# Output directory
+OUTPUTDIR="./TEST/"
+# number of particles
+N=(65536)
+# MPI tasks
+MPI=(1 2)
+# OMP threads
+THREADS=(1 2 4)
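+# (each variable above is a bash array; run.sh loops over every combination
+#  of N, MPI and THREADS values)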
+
+##########################################################################
diff --git a/jacobi/serial/not_opt/script/run.sh b/jacobi/serial/not_opt/script/run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..d88cce02f2694fd9a6dcc487090e9cba366a5ac0
--- /dev/null
+++ b/jacobi/serial/not_opt/script/run.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+
+# compile the application
+# select flags inside the Makefile (parent directory)
+cd .. && make serial -f Makefile
+
+if [[ "$?" != "0" ]]
+then
+    echo "Cannot compile the application ...aborting..."
+    exit 1
+fi
+
+# the executable
+EXEC=$(find . -name "*$(hostname)*" -executable -type f)
+if [[ -z "${EXEC}" ]]
+then
+    echo "Cannot find the executable ...aborting..."
+    exit 2
+fi
+
+# return to ./script and source input parameters and function
+cd - && source input_parameters && source write_paramfile.sh
+
+# loop over particles
+for PART in "${N[@]}"
+do
+    # loop over MPI tasks
+    for TASK in "${MPI[@]}"
+    do
+	# loop over OMP threads
+	for THR in "${THREADS[@]}"
+	do
+	    # write the paramfile
+	    write_paramfile ${TEMPLATE}      ${PARAMFILE}       \
+			    ${DELETE_OUTPUT_FILES} ${OUTPUTDIR} \
+			    ${THR} ${PART}
+
+	    # check the status
+	    if [[ "$?" != "0" ]]
+	    then
+		echo "Error while writing ${PARAMFILE} ...aborting..."
+		exit 3
+	    fi
+
+	    # move the paramfile and move to the parent directory where the executable resides
+	    mv ${PARAMFILE} ../ && cd ..
+	    
+	    # run the application
+	    mpirun -np ${TASK} ${EXEC} ${PARAMFILE}
+	    # check exit status
+	    if [[ "$?" != "0" ]]
+	    then
+		echo "####################################################################"
+		echo "ERROR while running the app using: ${TASK} MPI tasks and ${THR} OMP threads"
+		echo "####################################################################"
+	    fi
+
+	    # delete the paramfile and return to the previous directory
+	    rm -f ${PARAMFILE} && cd -
+	done # loop over THR
+    done # loop over TASKS
+done # loop over particles
+
+# Everything is OK!
+exit 0
diff --git a/jacobi/serial/not_opt/script/run_JUWELS.sh b/jacobi/serial/not_opt/script/run_JUWELS.sh
new file mode 100755
index 0000000000000000000000000000000000000000..b3908e9bb94c04dc637ca947c1958bda95e8c538
--- /dev/null
+++ b/jacobi/serial/not_opt/script/run_JUWELS.sh
@@ -0,0 +1,95 @@
+#!/bin/bash -x
+
+######################### RESOURCE ALLOCATION #####################################
+# ACCOUNT         = prpb93
+# NODES           = 1                (1 node)
+# PARTITION       = batch            (use Cluster Module)
+# ERROR           = Nbody-err.%j     (error handle)
+# TIME            = 00:03:00         (enough for tests)
+# EXCLUSIVE       = node should not be shared with other jobs
+
+#SBATCH --account=prpb93
+#SBATCH --nodes=1
+#SBATCH --partition=batch
+#SBATCH --error=Nbody-err.%j
+#SBATCH --time=00:03:00
+#SBATCH --job-name="Nbody"
+#SBATCH --exclusive
+###################################################################################
+
+################################## MODULES ########################################
+# module purge
+module purge
+# load GCC
+module load GCC/9.3.0
+# load OpenMPI
+module load OpenMPI
+###################################################################################
+
+# compile the application
+# select flags inside the Makefile (parent directory)
+cd .. && make serial -f Makefile
+
+if [[ "$?" != "0" ]]
+then
+    echo "Cannot compile the application ...aborting..."
+    exit 1
+fi
+
+# the executable
+EXEC=$(find . -name "*$(hostname)*" -executable -type f)
+if [[ -z "${EXEC}" ]]
+then
+    echo "Cannot find the executable ...aborting..."
+    exit 2
+fi
+
+# return to ./script and source input parameters and function
+cd - && source input_parameters && source write_paramfile.sh
+
+# loop over particles
+for PART in "${N[@]}"
+do
+    # loop over MPI tasks
+    for TASK in "${MPI[@]}"
+    do
+	# loop over OMP threads
+	for THR in "${THREADS[@]}"
+	do
+	    # write the paramfile
+	    write_paramfile ${TEMPLATE}      ${PARAMFILE}       \
+			    ${DELETE_OUTPUT_FILES} ${OUTPUTDIR} \
+			    ${THR} ${PART}
+
+	    # check the status
+	    if [[ "$?" != "0" ]]
+	    then
+		echo "Error while writing ${PARAMFILE} ...aborting..."
+		exit 3
+	    fi
+
+	    # move the paramfile and move to the parent directory where the executable resides
+	    mv ${PARAMFILE} ../ && cd ..
+	    
+	    # run the application
+	    srun --ntasks ${TASK} ${EXEC} ${PARAMFILE} >& $SLURM_JOB_ID.log
+
+	    # check exit status
+	    if [[ "$?" != "0" ]]
+	    then
+		echo "####################################################################"
+		echo "ERROR while running the app using: ${TASK} MPI tasks and ${THR} OMP threads"
+		echo "####################################################################"
+	    fi
+
+	    # delete the paramfile and return to the previous directory
+	    rm -f ${PARAMFILE} && cd -
+	done # loop over THR
+    done # loop over TASKS
+done # loop over particles
+
+# make clean
+cd .. && make clean
+
+# Everything is OK!
+exit 0
diff --git a/jacobi/serial/not_opt/script/write_paramfile.sh b/jacobi/serial/not_opt/script/write_paramfile.sh
new file mode 100755
index 0000000000000000000000000000000000000000..6af6591767b2574370a67b652024d7da4065c9c3
--- /dev/null
+++ b/jacobi/serial/not_opt/script/write_paramfile.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# NAME: write_paramfile
+# PURPOSE: to write a user defined paramfile for the X NBody app
+# USAGE: write_paramfile [template_file] [paramfile_to_write]      \
+#                        [DeleteOutputFiles] [OutputDir]           \
+#                        [OMP threads] [N]
+
+function write_paramfile()
+{
+    local TEMPLATE=$1
+
+    # check if TEMPLATE is a regular file
+    if [ ! -f ${TEMPLATE} ]
+    then
+	echo "${TEMPLATE} is not a regular file.. aborting..."
+	return 1
+    fi
+
+    # copy template file
+    local PARAMFILE=$2
+    cp -v ${TEMPLATE} ${PARAMFILE}
+
+    local DELETE_OUTPUT_FILES=$3
+    local OUTPUTDIR=$4
+    local THREADS=$5
+    local N=$6
+    
+    # add parameters
+    echo "DeleteOutputFiles ${DELETE_OUTPUT_FILES}" >> ${PARAMFILE}
+    echo "OutputDir         ${OUTPUTDIR}"           >> ${PARAMFILE}
+    echo "RequestedNThreads ${THREADS}"             >> ${PARAMFILE}
+    echo "N                 ${N}"                   >> ${PARAMFILE}
+
+    echo "${PARAMFILE} written"
+    
+    return 0
+}
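+
+# Example invocation (hypothetical values, matching the defaults in input_parameters):
+# $ write_paramfile paramfile_template.txt paramfile_JUWELS.txt 0 ./TEST/ 4 65536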
diff --git a/jacobi/serial/not_opt/src/jacobi_2D_serial_not_opt.c b/jacobi/serial/not_opt/src/jacobi_2D_serial_not_opt.c
new file mode 100644
index 0000000000000000000000000000000000000000..ba53f321242432290dca329fae94271189fadf82
--- /dev/null
+++ b/jacobi/serial/not_opt/src/jacobi_2D_serial_not_opt.c
@@ -0,0 +1,274 @@
+/* ///////////////////////////////////////////////////////////////////// */
+/* Authors:  A. Mignone (mignone@to.infn.it)                             */
+/*           V. Cesare  (valentina.cesare@inaf.it)                       */
+/*           D. Goz     (david.goz@inaf.it)                              */
+/*                                                                       */
+/* Date   : June 2024                                                    */
+/*                                                                       */
+/* ///////////////////////////////////////////////////////////////////// */
+
+#include "allvars.h"
+#include "tools.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+
+/* function prototypes */
+void BoundaryConditions(MyData **const restrict,
+			MyData  *const restrict,
+			MyData  *const restrict,
+			const int,
+			const int);
+
+void JacobiAlgorithm(MyData **const restrict, MyData **const restrict, MyData *const restrict,
+		     const MyData *const restrict, const int, const int, const int, const int);
+
+void WriteSolution(MyData **const phi, const int nx, const int ny);
+
+void copy_grids(MyData **const restrict A,
+                MyData **const restrict B,
+                const int               xbeg,
+                const int               xend,
+                const int               ybeg,
+		const int               yend);
+
+int main(int argc, char **argv)
+{
+  if (argc <= 2)
+    {
+      printf("\n\t Usage: <executable> <x_grid_size> <y_grid_size> \n\n");
+      exit(EXIT_FAILURE);
+    }
+
+  /* global X and Y grid size */
+  const int NX_GLOB = (int) strtol(argv[1], NULL, 10);
+  const int NY_GLOB = (int) strtol(argv[2], NULL, 10);
+
+  const MyData xbeg = 0.0;
+  const MyData xend = 1.0;
+  const MyData ybeg = 0.0;
+  const MyData yend = 1.0;
+
+  const MyData delta[NDIM] = {(xend - xbeg)/(NX_GLOB + 1),
+			      (yend - ybeg)/(NY_GLOB + 1)};
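+  /* uniform grid spacing: the NX_GLOB (NY_GLOB) interior points lie between
+     the domain boundaries, which coincide with the ghost points, hence the
+     division by NX_GLOB + 1 (NY_GLOB + 1) */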
+
+  /* --------------------------------------------------------
+     1. Set grid indices
+     -------------------------------------------------------- */
+    
+  const int ibeg   = NGHOST;
+  const int iend   = ibeg + NX_GLOB - 1;
+  const int nx     = iend - ibeg + 1;
+  const int nx_tot = nx + 2 * NGHOST;
+    
+  const int jbeg   = NGHOST;
+  const int jend   = jbeg + NY_GLOB - 1;
+  const int ny     = jend - jbeg + 1;
+  const int ny_tot = ny + 2 * NGHOST;
+
+  printf("\n\t Grid indices:");
+  printf("\n\t\t ibeg, iend = %d, %d; nx_tot = %d"    ,ibeg, iend, nx_tot);
+  printf("\n\t\t jbeg, jend = %d, %d; ny_tot = %d\n\n",jbeg, jend, ny_tot);
+    
+  /* --------------------------------------------------------
+     2. Generate grid, allocate memory
+     -------------------------------------------------------- */
+
+  /* memory allocation */
+  MyData *xg = (MyData *) malloc((NX_GLOB + 2*NGHOST) * sizeof(MyData));
+  MyData *yg = (MyData *) malloc((NY_GLOB + 2*NGHOST) * sizeof(MyData));
+  assert((xg != NULL) && (yg != NULL));
+
+  /* initial conditions */
+  for (int i=0 ; i<(NX_GLOB + 2*NGHOST) ; i++) xg[i] = xbeg + (i - ibeg + 1) * delta[X];
+  for (int j=0 ; j<(NY_GLOB + 2*NGHOST) ; j++) yg[j] = ybeg + (j - jbeg + 1) * delta[Y];
+  MyData *x = xg; /* Global and local grids are the same  */
+  MyData *y = yg; /* for serial version of the code       */
+
+  /* grids memory allocation */
+  MyData **phi  = Allocate_2DdblArray(ny_tot, nx_tot);
+  MyData **phi0 = Allocate_2DdblArray(ny_tot, nx_tot);
+    
+  /* --------------------------------------------------------
+     3. Initialize solution array to 0
+     -------------------------------------------------------- */
+    
+  for (int j=jbeg ; j<=jend ; j++)
+    for (int i=ibeg ; i<=iend ; i++)
+      phi0[j][i] = 0.0;
+  
+  /* --------------------------------------------------------
+     4. Main iteration cycle
+     -------------------------------------------------------- */
+
+  const double time_start = seconds();
+
+  MyData err = 1.0;
+  /* iterations */
+  int k = 0;
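+  /* iterate until the area-weighted residual returned by JacobiAlgorithm
+     drops below TOL (err starts above TOL so the loop body runs at least once) */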
+  while (err > TOL)
+    {        
+      /* -- 4a. Set boundary conditions first -- */
+        
+      BoundaryConditions(phi0, x, y, nx, ny);
+        
+      /* -- 4b. Jacobi's method and residual (interior points) -- */
+      /*       core algorithm                                     */
+      
+      err = 0.0;
+      JacobiAlgorithm(phi, phi0, &err, delta,
+		      ibeg, iend, jbeg, jend);
+      
+      /* -- 4c. Copy grids --*/
+      copy_grids(phi0, phi,
+		 jbeg, jend,
+		 ibeg, iend);
+
+      printf ("\n\t Iteration = %d - err = %lg\n",k, err);
+
+      /* increase the counter of loop iterations */
+      k++;
+    }
+    
+  WriteSolution(phi, nx, ny);
+
+  printf("\n\t NX_GLOB x NY_GLOB = %d x %d\n", NX_GLOB, NY_GLOB);
+  printf("\n\t Time = %lf [s]\n\n", seconds() - time_start);
+
+  // free memory
+  if (phi0)
+    {
+      free(phi0[0]);
+      free(phi0);
+    }
+
+  if (phi)
+    {
+      free(phi[0]);
+      free(phi);
+    }
+
+  if (yg)
+    free(yg);
+  
+  if (xg)
+    free(xg);
+  
+  return 0;
+}
+
+/* ********************************************************************* */
+void BoundaryConditions(MyData **const restrict phi,
+			MyData  *const restrict x,
+			MyData  *const restrict y,
+                        const int               nx,
+			const int               ny)
+/*
+*********************************************************************** */
+{
+  const int ibeg = NGHOST;
+  const int iend = ibeg + nx - 1;
+    
+  const int jbeg = NGHOST;
+  const int jend = jbeg + ny - 1;
+
+  int i,j;
+  
+  /* -- Left -- */
+  i = ibeg - 1;
+  for (int j=jbeg ; j<=jend ; j++)
+    phi[j][i] = (1.0 - y[j]);
+    
+  /* -- Right -- */
+  i = iend + 1;
+  for (int j=jbeg ; j<=jend ; j++)
+    phi[j][i] = (y[j] * y[j]);
+    
+  /* -- Bottom -- */    
+  j = jbeg - 1;
+  for (int i=ibeg ; i<=iend ; i++)
+    phi[j][i] = (1.0 - x[i]);
+    
+  /* -- Top -- */
+  j = jend + 1;
+  for (int i=ibeg ; i<=iend ; i++)
+    phi[j][i] = x[i];
+
+  return;
+}
+
+/* ********************************************************************* */
+
+void JacobiAlgorithm(MyData      **const restrict Phi,
+		     MyData      **const restrict Phi0,
+		     MyData       *const restrict error,
+		     const MyData *const restrict delta,
+		     const int                    ibeg,
+		     const int                    iend,
+		     const int                    jbeg,
+		     const int                    jend)
+{
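+  /* five-point Jacobi update: every interior point becomes the average of
+     its four neighbours taken from the previous iterate Phi0, and the
+     residual is the area-weighted (delta[X]*delta[Y]) sum of the absolute
+     point-wise changes */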
+  *error = 0.0;
+  for (int j=jbeg ; j<=jend ; j++)
+    {
+      for (int i=ibeg ; i<=iend ; i++)
+	{
+	  Phi[j][i] = 0.25 * (Phi0[j][i-1] + Phi0[j][i+1] +
+			      Phi0[j-1][i] + Phi0[j+1][i]);
+                
+	  *error += delta[X] * delta[Y] * fabs(Phi[j][i] - Phi0[j][i]);
+	} /* loop over columns */
+    } /* loop over rows */
+  
+  return;
+}
+
+/* ********************************************************************* */
+
+/* ********************************************************************* */
+void copy_grids(MyData **const restrict A,
+		MyData **const restrict B,
+		const int               xbeg,
+		const int               xend,
+	        const int               ybeg,
+		const int               yend)
+{
+  for (int i=xbeg ; i<=xend ; i++)
+    for (int j=ybeg ; j<=yend ; j++)
+      A[i][j] = B[i][j];
+  
+  return;
+}
+/*
+************************************************************************ */
+
+
+/* ********************************************************************* */
+void WriteSolution (MyData **const phi,
+		    const int      nx,
+		    const int      ny)
+/*
+*********************************************************************** */
+{
+  const int ibeg = NGHOST;    
+  const int jbeg = NGHOST;
+  const int jend = jbeg + ny - 1;
+    
+  static int nfile = 0;  /* File counter */
+
+  char fname[32];
+  sprintf(fname,"jacobi2D_serial_not_opt_%02d.bin", nfile);
+    
+  FILE *fp;
+  printf ("> Writing %s\n",fname);
+  fp = fopen(fname, "wb");
+    
+  for (int j=jbeg ; j<=jend ; j++)
+    {
+      fwrite (phi[j] + ibeg, sizeof(MyData), nx, fp);
+    }
+    
+  nfile++;
+  fclose(fp);
+}
diff --git a/jacobi/serial/not_opt/src/tools.c b/jacobi/serial/not_opt/src/tools.c
new file mode 100644
index 0000000000000000000000000000000000000000..be2761d170c23fde4d5d4aeda54af00e56abf3ce
--- /dev/null
+++ b/jacobi/serial/not_opt/src/tools.c
@@ -0,0 +1,59 @@
+/* ///////////////////////////////////////////////////////////////////// */
+/* Authors:  A. Mignone (mignone@to.infn.it)                             */
+/*           V. Cesare  (valentina.cesare@inaf.it)                       */
+/*           D. Goz     (david.goz@inaf.it)                              */
+/*                                                                       */
+/* Date   : June 2024                                                    */
+/*                                                                       */
+/* ///////////////////////////////////////////////////////////////////// */
+
+#include "tools.h"
+
+/* ********************************************************************* */
+MyData **Allocate_2DdblArray(const int nx, const int ny)
+/*
+ * Allocate memory for a double precision array with
+ * nx rows and ny columns
+ *********************************************************************** */
+{
+  MyData **buf = malloc(nx * sizeof(MyData *));
+  assert(buf != NULL);
+
+  buf[0] = (MyData *) malloc(nx * ny * sizeof(MyData));
+  assert(buf[0] != NULL);
+  
+  for (int j=1 ; j<nx ; j++)
+    buf[j] = buf[j-1] + ny;
+    
+  return buf;
+}
+
+/* ********************************************************************* */
+void Show_2DdblArray(const MyData **const A,
+		     const int            nx,
+		     const int            ny,
+		     const char *const    string)
+/* *********************************************************************** */
+{
+  printf ("%s\n",string);
+  printf ("------------------------------\n");
+  for (int i=0 ; i<nx ; i++)
+    {
+      for (int j=0 ; j<ny ; j++)
+	{
+	  printf ("%8.2f  ", A[i][j]);
+	}
+      printf ("\n");
+    }
+  printf ("------------------------------\n");
+}
+/* ********************************************************************* */
+
+double seconds()
+{
+  struct timeval tmp;
+  gettimeofday(&tmp, (struct timezone *)0);
+  double sec = tmp.tv_sec + ((double)tmp.tv_usec)/1000000.0;
+
+  return sec;
+}
diff --git a/jacobi/serial/opt/Makefile b/jacobi/serial/opt/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..01dd641863b57c577e1dcc2de14db298269cbf62
--- /dev/null
+++ b/jacobi/serial/opt/Makefile
@@ -0,0 +1,62 @@
+#######################################################################
+# Author: David Goz (david.goz@inaf.it)                               #
+# June  2024                                                          #
+#######################################################################
+#
+# To see all the compilation options
+# $ make info
+#######################################################################
+
+# make.def defines how the application is compiled
+include make.def
+#######################################################################
+
+.PHONY: info serial valgrind_memcheck valgrind_callgrind valgrind_cachegrind debug clean 
+
+info:
+	@echo ' '
+	@echo '-----------------------------------------------------------------------------------------------'
+	@echo '$$ make serial                 ---> compile the serial application                             '
+	@echo '$$ make debug                  ---> debug the serial application                               '
+	@echo '$$ make valgrind_memcheck      ---> run the serial application using Valgrind under Memcheck   '
+	@echo '$$ make valgrind_callgrind     ---> run the serial application using Valgrind under Callgrind  '
+	@echo '$$ make valgrind_cachegrind    ---> run the serial application using Valgrind under Cachegrind '
+	@echo '$$ make clean                  ---> clean up all                                               '
+	@echo '$$ make info                   ---> get make info                                              '
+	@echo '-----------------------------------------------------------------------------------------------'
+	@echo ' '
+
+serial: $(PROG)
+
+debug: $(PROG_DEBUG)
+	@echo 'OOOooo... debugging ...oooOOO'
+	gdb --args ./$<
+	@echo 'OOOooo... debugging ... oooOOO'
+
+valgrind_memcheck: $(PROG_MEMCHECK)
+	@echo 'oooOOO... valgrind_memcheck ...OOOooo'
+	valgrind --tool=memcheck -s --leak-check=full --show-leak-kinds=all --track-origins=yes --read-var-info=yes --log-file=valgrind_memcheck_log_%p.txt ./$< 10 10
+	@echo 'oooOOO... valgrind_memcheck ...OOOooo'
+
+valgrind_callgrind: $(PROG_CALLGRIND)
+	@echo 'oooOOO... valgrind_callgrind ...OOOooo'
+	valgrind --tool=callgrind --dump-instr=yes --collect-jumps=yes --log-file=valgrind_callgrind_log_.%p.txt ./$< 128 128
+	@echo ' '
+	@echo 'To generate a function-by-function summary from the profile data file:'
+	@echo '$$ callgrind_annotate --auto=yes callgrind.out.<pid> | less'
+	@echo '(kcachegrind is required in order to visualize the output using the GUI)'
+
+valgrind_cachegrind: $(PROG_CACHEGRIND)
+	@echo 'oooOOO... valgrind_cachegrind ...OOOooo'
+	valgrind --tool=cachegrind --log-file=valgrind_cachegrind_log_.%p.txt ./$< 128 128
+	@echo '$$ cg_annotate --auto=yes cachegrind.out.<pid> | less'
+	@echo '(kcachegrind is required in order to visualize the output using the GUI)'
+	@echo 'oooOOO... valgrind_cachegrind ...OOOooo'
+
+clean:
+	rm -f *~ .*~ ./src/*~ ./src/*# ./include/*~ ./include/*# *~
+	rm -f $(PROG) $(PROG_DEBUG) $(PROG_MEMCHECK) $(PROG_CALLGRIND) $(PROG_CACHEGRIND)
+	rm -f valgrind_*.txt
+	rm -f cachegrind.out.*
+	rm -f callgrind.*
+	rm -f *bin
diff --git a/jacobi/serial/opt/include/allvars.h b/jacobi/serial/opt/include/allvars.h
new file mode 100644
index 0000000000000000000000000000000000000000..468de4a19bebf2c0d5c59cd24ad3aab94b8df0f6
--- /dev/null
+++ b/jacobi/serial/opt/include/allvars.h
@@ -0,0 +1,9 @@
+#pragma once
+
+#define NGHOST   1
+#define NDIM     2 /* 2D */
+#define X        0
+#define Y        1
+#define TOL      1.e-5
+
+typedef double MyData;
diff --git a/jacobi/serial/opt/include/tools.h b/jacobi/serial/opt/include/tools.h
new file mode 100644
index 0000000000000000000000000000000000000000..82859ec511710474547355ede3fc38cf33a92154
--- /dev/null
+++ b/jacobi/serial/opt/include/tools.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "allvars.h"
+#include <assert.h>
+#include <sys/time.h>
+#include <time.h>
+#include <sys/stat.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/* function prototypes */
+MyData **Allocate_2DdblArray(const int nx, const int ny);
+void Show_2DdblArray(const MyData **const A,
+                     const int            nx,
+                     const int            ny,
+                     const char *const    string);
+
+double seconds(void);
diff --git a/jacobi/serial/opt/make.def b/jacobi/serial/opt/make.def
new file mode 100644
index 0000000000000000000000000000000000000000..55e3b7d2673ab472796fc226043b98e57802d656
--- /dev/null
+++ b/jacobi/serial/opt/make.def
@@ -0,0 +1,45 @@
+CC     ?= gcc
+CFLAGS ?= -Wall -Wextra -march=native
+LIBS   ?= -lm
+
+SYSTYPE = $(strip $(shell uname -n))
+
+PROG           ?= jacobi_serial_opt_$(SYSTYPE)
+PROG_DEBUG      = $(PROG)_DEBUG
+PROG_MEMCHECK   = $(PROG)_MEMCHECK
+PROG_CALLGRIND  = $(PROG)_CALLGRIND
+PROG_CACHEGRIND = $(PROG)_CACHEGRIND
+
+HEADERS       = $(wildcard ./include/*.h)
+SOURCES       = $(wildcard ./src/*.c)
+DEPENDENCIES  = $(SOURCES) $(HEADERS) Makefile
+
+$(PROG): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -O3 -I./include $(SOURCES) -o $@ $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_DEBUG): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -Og -ggdb3 -fno-omit-frame-pointer -I./include $(SOURCES) -o $@ $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_DEBUG) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_MEMCHECK): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -Og -I./include $(SOURCES) -o $@ $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_MEMCHECK) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_CALLGRIND): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -g -O3 -I./include $(SOURCES) -o $@ $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_CALLGRIND) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
+
+$(PROG_CACHEGRIND): $(DEPENDENCIES)
+	$(CC) $(CFLAGS) -g -O3 -I./include $(SOURCES) -o $@ $(LIBS)
+	@echo ' '
+	@echo 'Program' $(PROG_CACHEGRIND) 'compiled for' $(SYSTYPE) 'machine'
+	@echo ' '
diff --git a/jacobi/serial/opt/src/jacobi_2D_serial_opt.c b/jacobi/serial/opt/src/jacobi_2D_serial_opt.c
new file mode 100644
index 0000000000000000000000000000000000000000..a14c810679a0d4087e1df3159ec7f1a8d98f238f
--- /dev/null
+++ b/jacobi/serial/opt/src/jacobi_2D_serial_opt.c
@@ -0,0 +1,254 @@
+/* ///////////////////////////////////////////////////////////////////// */
+/* Authors:  A. Mignone (mignone@to.infn.it)                             */
+/*           V. Cesare  (valentina.cesare@inaf.it)                       */
+/*           D. Goz     (david.goz@inaf.it)                              */
+/*                                                                       */
+/* Date   : June 2024                                                    */
+/*                                                                       */
+/* ///////////////////////////////////////////////////////////////////// */
+
+#include "allvars.h"
+#include "tools.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+
+/* function prototypes */
+void BoundaryConditions(MyData **const restrict,
+			MyData  *const restrict,
+			MyData  *const restrict,
+			const int,
+			const int);
+
+void JacobiAlgorithm(MyData **const restrict, MyData **const restrict, MyData *const restrict,
+		     const MyData *const restrict, const int, const int, const int, const int);
+
+void WriteSolution(MyData **const phi, const int nx, const int ny);
+
+int main(int argc, char **argv)
+{
+  if (argc <= 2)
+    {
+      printf("\n\t Usage: <executable> <x_grid_size> <y_grid_size> \n\n");
+      exit(EXIT_FAILURE);
+    }
+
+  /* global X and Y grid size */
+  const int NX_GLOB = (int) strtol(argv[1], NULL, 10);
+  const int NY_GLOB = (int) strtol(argv[2], NULL, 10);
+
+  const MyData xbeg = 0.0;
+  const MyData xend = 1.0;
+  const MyData ybeg = 0.0;
+  const MyData yend = 1.0;
+
+  const MyData delta[NDIM] = {(xend - xbeg)/(NX_GLOB + 1),
+			      (yend - ybeg)/(NY_GLOB + 1)};
+
+  /* --------------------------------------------------------
+     1. Set grid indices
+     -------------------------------------------------------- */
+    
+  const int ibeg   = NGHOST;
+  const int iend   = ibeg + NX_GLOB - 1;
+  const int nx     = iend - ibeg + 1;
+  const int nx_tot = nx + 2 * NGHOST;
+    
+  const int jbeg   = NGHOST;
+  const int jend   = jbeg + NY_GLOB - 1;
+  const int ny     = jend - jbeg + 1;
+  const int ny_tot = ny + 2 * NGHOST;
+
+  printf("\n\t Grid indices:");
+  printf("\n\t\t ibeg, iend = %d, %d; nx_tot = %d"    ,ibeg, iend, nx_tot);
+  printf("\n\t\t jbeg, jend = %d, %d; ny_tot = %d\n\n",jbeg, jend, ny_tot);
+    
+  /* --------------------------------------------------------
+     2. Generate grid, allocate memory
+     -------------------------------------------------------- */
+
+  /* memory allocation */
+  MyData *xg = (MyData *) malloc((NX_GLOB + 2*NGHOST) * sizeof(MyData));
+  MyData *yg = (MyData *) malloc((NY_GLOB + 2*NGHOST) * sizeof(MyData));
+  assert((xg != NULL) && (yg != NULL));
+
+  /* initial conditions */
+  for (int i=0 ; i<(NX_GLOB + 2*NGHOST) ; i++) xg[i] = xbeg + (i - ibeg + 1) * delta[X];
+  for (int j=0 ; j<(NY_GLOB + 2*NGHOST) ; j++) yg[j] = ybeg + (j - jbeg + 1) * delta[Y];
+  MyData *x = xg; /* Global and local grids are the same  */
+  MyData *y = yg; /* for serial version of the code       */
+
+  /* grids memory allocation */
+  MyData **phi  = Allocate_2DdblArray(ny_tot, nx_tot);
+  MyData **phi0 = Allocate_2DdblArray(ny_tot, nx_tot);
+    
+  /* --------------------------------------------------------
+     3. Initialize solution array to 0
+     -------------------------------------------------------- */
+    
+  for (int j=jbeg ; j<=jend ; j++)
+    for (int i=ibeg ; i<=iend ; i++)
+      phi0[j][i] = 0.0;
+  
+  /* --------------------------------------------------------
+     4. Main iteration cycle
+     -------------------------------------------------------- */
+
+  const double time_start = seconds();
+
+  /* -- 4a. Set boundary conditions first -- */
+  
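+  /* boundary values are written into the ghost zones of both buffers:
+     phi and phi0 are swapped at every iteration, so each must carry the
+     fixed Dirichlet data; this replaces the full-grid copy performed by
+     the not-optimized version */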
+  BoundaryConditions(phi0, x, y, nx, ny);
+  BoundaryConditions(phi, x, y, nx, ny);
+
+  MyData err = 1.0;
+  /* iterations */
+  int k = 0;
+  while (1)
+    {        
+      /* -- 4b. Jacobi's method and residual (interior points) -- */
+      /*       core algorithm                                     */
+      
+      err = 0.0;
+      JacobiAlgorithm(phi, phi0, &err, delta,
+		      ibeg, iend, jbeg, jend);
+
+      printf ("\n\t Iteration = %d - err = %lg\n",k, err);
+
+      /* increase the counter of loop iterations */
+      k++;
+
+      /* check convergence */
+      if (err <= TOL)
+	break;
+      
+      /* swap the pointers */
+      MyData **tmp = phi;
+      phi = phi0;
+      phi0 = tmp;
+    }
+    
+  WriteSolution(phi, nx, ny);
+
+  printf("\n\t NX_GLOB x NY_GLOB = %d x %d\n", NX_GLOB, NY_GLOB);
+  printf("\n\t Time = %lf [s]\n\n", seconds() - time_start);
+
+  // free memory
+  if (phi0)
+    {
+      free(phi0[0]);
+      free(phi0);
+    }
+
+  if (phi)
+    {
+      free(phi[0]);
+      free(phi);
+    }
+
+  if (yg)
+    free(yg);
+  
+  if (xg)
+    free(xg);
+  
+  return 0;
+}
+
+/* ********************************************************************* */
+void BoundaryConditions(MyData **const restrict phi,
+			MyData  *const restrict x,
+			MyData  *const restrict y,
+                        const int               nx,
+			const int               ny)
+/*
+*********************************************************************** */
+{
+  const int ibeg = NGHOST;
+  const int iend = ibeg + nx - 1;
+    
+  const int jbeg = NGHOST;
+  const int jend = jbeg + ny - 1;
+
+  int i,j;
+  
+  /* -- Left -- */
+  i = ibeg - 1;
+  for (int j=jbeg ; j<=jend ; j++)
+    phi[j][i] = (1.0 - y[j]);
+    
+  /* -- Right -- */
+  i = iend + 1;
+  for (int j=jbeg ; j<=jend ; j++)
+    phi[j][i] = (y[j] * y[j]);
+    
+  /* -- Bottom -- */    
+  j = jbeg - 1;
+  for (int i=ibeg ; i<=iend ; i++)
+    phi[j][i] = (1.0 - x[i]);
+    
+  /* -- Top -- */
+  j = jend + 1;
+  for (int i=ibeg ; i<=iend ; i++)
+    phi[j][i] = x[i];
+
+  return;
+}
+
+/* ********************************************************************* */
+
+void JacobiAlgorithm(MyData      **const restrict Phi,
+		     MyData      **const restrict Phi0,
+		     MyData       *const restrict error,
+		     const MyData *const restrict delta,
+		     const int                    ibeg,
+		     const int                    iend,
+		     const int                    jbeg,
+		     const int                    jend)
+{
+  *error = 0.0;
+  for (int j=jbeg ; j<=jend ; j++)
+    {
+      for (int i=ibeg ; i<=iend ; i++)
+	{
+	  Phi[j][i] = 0.25 * (Phi0[j][i-1] + Phi0[j][i+1] +
+			      Phi0[j-1][i] + Phi0[j+1][i]);
+                
+	  *error += delta[X] * delta[Y] * fabs(Phi[j][i] - Phi0[j][i]);
+	} /* loop over columns */
+    } /* loop over rows */
+  
+  return;
+}
+
+/* ********************************************************************* */
+
+/* ********************************************************************* */
+void WriteSolution (MyData **const phi,
+		    const int      nx,
+		    const int      ny)
+/*
+*********************************************************************** */
+{
+  const int ibeg = NGHOST;    
+  const int jbeg = NGHOST;
+  const int jend = jbeg + ny - 1;
+    
+  static int nfile = 0;  /* File counter */
+
+  char fname[32];
+  sprintf(fname,"jacobi2D_serial_opt_%02d.bin", nfile);
+    
+  FILE *fp;
+  printf ("> Writing %s\n",fname);
+  fp = fopen(fname, "wb");
+    
+  for (int j=jbeg ; j<=jend ; j++)
+    {
+      fwrite (phi[j] + ibeg, sizeof(MyData), nx, fp);
+    }
+    
+  nfile++;
+  fclose(fp);
+}
diff --git a/jacobi/serial/opt/src/tools.c b/jacobi/serial/opt/src/tools.c
new file mode 100644
index 0000000000000000000000000000000000000000..be2761d170c23fde4d5d4aeda54af00e56abf3ce
--- /dev/null
+++ b/jacobi/serial/opt/src/tools.c
@@ -0,0 +1,59 @@
+/* ///////////////////////////////////////////////////////////////////// */
+/* Authors:  A. Mignone (mignone@to.infn.it)                             */
+/*           V. Cesare  (valentina.cesare@inaf.it)                       */
+/*           D. Goz     (david.goz@inaf.it)                              */
+/*                                                                       */
+/* Date   : June 2024                                                    */
+/*                                                                       */
+/* ///////////////////////////////////////////////////////////////////// */
+
+#include "tools.h"
+
+/* ********************************************************************* */
+MyData **Allocate_2DdblArray(const int nx, const int ny)
+/*
+ * Allocate memory for a double precision array with
+ * nx rows and ny columns
+ *********************************************************************** */
+{
+  MyData **buf = malloc(nx * sizeof(MyData *));
+  assert(buf != NULL);
+
+  buf[0] = (MyData *) malloc(nx * ny * sizeof(MyData));
+  assert(buf[0] != NULL);
+  
+  for (int j=1 ; j<nx ; j++)
+    buf[j] = buf[j-1] + ny;
+    
+  return buf;
+}
+
+/* ********************************************************************* */
+void Show_2DdblArray(const MyData **const A,
+		     const int            nx,
+		     const int            ny,
+		     const char *const    string)
+/* *********************************************************************** */
+{
+  printf ("%s\n",string);
+  printf ("------------------------------\n");
+  for (int i=0 ; i<nx ; i++)
+    {
+      for (int j=0 ; j<ny ; j++)
+	{
+	  printf ("%8.2f  ", A[i][j]);
+	}
+      printf ("\n");
+    }
+  printf ("------------------------------\n");
+}
+/* ********************************************************************* */
+
+double seconds()
+{
+  struct timeval tmp;
+  gettimeofday(&tmp, (struct timezone *)0);
+  double sec = tmp.tv_sec + ((double)tmp.tv_usec)/1000000.0;
+
+  return sec;
+}