diff --git a/cuda-omp/omp/7/mat_mult.c b/cuda-omp/omp/7/mat_mult.c new file mode 100644 index 0000000000000000000000000000000000000000..8ad7fcd4c5ca812149749ffdfe12cbb98ab7ab32 --- /dev/null +++ b/cuda-omp/omp/7/mat_mult.c @@ -0,0 +1,185 @@ +//////////////////////////////////////////////////////////////////////////////////////////////// +// - Naive matrix multiplication algorithm +// for (size_t i=0 ; i<N ; i++) +// for (size_t j=0 ; j<N ; j++) +// for (size_t k=0 ; k<_N ; k++) +// C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j]; +// +//////////////////////////////////////////////////////////////////////////////////////////////// + +////////////////////////////////////////////////////////////////////////////////////////////////// +// Author: David Goz +// mail : david.goz@inaf.it +// date : 31.07.2024 +// code tested using nvhpc +// +// - Compile the code: +// $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork.c -o classwork_omp +// - Run the code: +// $ ./classwork_omp +////////////////////////////////////////////////////////////////////////////////////////////////// + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <time.h> +#include <assert.h> +#include <omp.h> +#include <string.h> + +#define N 512 +#define SIZE (N * N) // matrix size +typedef double MyData; // do not change + +#define LOOP 100 +#define NDEBUG + +double wall_time() +{ + struct timespec ts; + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); + const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9; + + return ret; +} + +void CPU_mat_mult(const MyData *const restrict A, + const MyData *const restrict B, + MyData *const restrict C, + const size_t size) +{ + for (size_t i=0 ; i<size ; i++) + for (size_t j=0 ; j<size ; j++) + for (size_t k=0 ; k<size ; k++) + C[(i * size) + j] += (A[(i * size) + k] * B[(k * size) + j]); + + return; +} + +void GPU_mat_mult(const MyData *const restrict A, + const MyData *const restrict B, + MyData *const 
restrict C, + const size_t size) +{ + #pragma omp target + { + #pragma omp teams distribute num_teams(size) + for (size_t i=0 ; i<size ; i++) + { + #pragma omp parallel for num_threads(size) + for (size_t j=0 ; j<size ; j++) + { + MyData value = (MyData)0; + for (size_t k=0 ; k<size ; k++) + value += (A[(i * size) + k] * B[(k * size) + j]); + + C[(i * size) + j] = value; + } // omp thread + } // omp teams + } // omp target + + return; +} + +void GPU_mat_mult_no_loops(const MyData *const restrict A, + const MyData *const restrict B, + MyData *const restrict C, + const size_t size) +{ + #pragma omp target + { + #pragma omp teams num_teams(size) + { + const size_t team_size = (size * omp_get_team_num()); + + #pragma omp parallel firstprivate(team_size) num_threads(size) + { + const size_t tid = omp_get_thread_num(); + MyData value = (MyData)0; + for (size_t k=0 ; k<size ; k++) + value += (A[team_size + k] * B[(k * size) + tid]); + + C[team_size + tid] = value; + } // omp threads + } // omp teams + } // omp target + + return; +} + +void check(const MyData *const __restrict__ cpu_matrix, + const MyData *const __restrict__ gpu_matrix) +{ + int flag; + for (size_t i=0 ; i<SIZE ; i++) + flag = ((cpu_matrix[i] != gpu_matrix[i]) ? 
1 : 0); + + if (!flag) + printf("\n\t Result OK"); + else + printf("\n\t Result wrong"); + + return; +} + +int main() +{ + double time; + MyData *buffer = (MyData *)calloc(4 * SIZE, sizeof(MyData)); + assert(buffer != NULL); + + // host reference matrix A + MyData *const restrict A = buffer; + MyData *const restrict B = A + SIZE; + MyData *const restrict C_CPU = B + SIZE; + MyData *const restrict C_GPU = C_CPU + SIZE; + for (size_t i=0 ; i<SIZE ; i++) + { + A[i] = drand48(); + B[i] = drand48(); + } + + ////////////////////////// CPU naive algorithm ////////////////////////////////////////// + CPU_mat_mult(A, B, C_CPU, N); + ///////////////////////////////////////////////////////////////////////////////////////// + + // copy/alloc data to the GPU + #pragma omp target enter data map(to: A[0:SIZE], B[0:SIZE]) map(alloc: C_GPU[0:SIZE]) + + /////////////////////////// GPU naive algorithm //////////////////////////////////////// + time = 0.0; + for (unsigned short int loop=0 ; loop<LOOP ; loop++) + { + const double start = wall_time(); + GPU_mat_mult(A, B, C_GPU, N); + time += (wall_time() - start); + } + + #pragma omp target update from(C_GPU[0:SIZE]) + check(C_CPU, C_GPU); + printf("\n\t GPU naive time %lg [s]\n", (time / LOOP)); + //////////////////////////////////////////////////////////////////////////////// + + /////////////////////////// GPU naive no loops algorithm //////////////////////////// + time = 0.0; + for (unsigned short int loop=0 ; loop<LOOP ; loop++) + { + const double start = wall_time(); + GPU_mat_mult_no_loops(A, B, C_GPU, N); + time += (wall_time() - start); + } + + #pragma omp target update from(C_GPU[0:SIZE]) + check(C_CPU, C_GPU); + printf("\n\t GPU naive no loops time %lg [s]\n", (time / LOOP)); + //////////////////////////////////////////////////////////////////////////////// + + // free CPU memory + free(buffer); + // free GPU memory + #pragma omp target exit data map(delete: A[0:SIZE], B[0:SIZE], C_CPU[0:SIZE]) + + printf("\n"); + + return 
EXIT_SUCCESS; +} diff --git a/cuda-omp/omp/7/mat_mult_block.c b/cuda-omp/omp/7/mat_mult_block.c new file mode 100644 index 0000000000000000000000000000000000000000..cd84ad1ee00f8f85afc523f058ceb3b6f29cb922 --- /dev/null +++ b/cuda-omp/omp/7/mat_mult_block.c @@ -0,0 +1,208 @@ +//////////////////////////////////////////////////////////////////////////////////////////////// +// - Block matrix multiplication algorithm +// +// const size_t Nblocks = (N / Bsize); +// +// // loop over blocks of matrix C +// for (size_t ib=0 ; ib<Nblocks ; ib++) +// { +// for (size_t jb=0 ; jb<Nblocks ; jb++) +// { +// +// // loop over blocks of rows of A +// for (size_t kb=0 ; kb<Nblocks ; kb++) +// +// +// +// for (size_t i=0 ; i<N ; i++) +// for (size_t j=0 ; j<N ; j++) +// for (size_t k=0 ; k<_N ; k++) +// C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j]; +// +// } // jb +// } // ib +// - Exploit shared-memory. +//////////////////////////////////////////////////////////////////////////////////////////////// + +////////////////////////////////////////////////////////////////////////////////////////////////// +// Author: David Goz +// mail : david.goz@inaf.it +// date : 31.07.2024 +// code tested using nvhpc +// +// - Compile the code: +// $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork.c -o classwork_omp +// - Run the code: +// $ ./classwork_omp +////////////////////////////////////////////////////////////////////////////////////////////////// + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <time.h> +#include <assert.h> +#include <omp.h> +#include <string.h> + +#define N 512 +#define SIZE (N * N) // matrix size +typedef double MyData; // do not change +#define BLOCKSIZE 32 // number of threads per block + +// sanity check +#if BLOCKSIZE > 1024 +#error BLOCKSIZE cannot be larger than 1024 +#endif + +#define LOOP 100 +#define NDEBUG + +double wall_time() +{ + struct timespec ts; + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); + 
const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9; + + return ret; +} + +void CPU_mat_mult(const MyData *const restrict A, + const MyData *const restrict B, + MyData *const restrict C, + const size_t size) +{ + for (size_t i=0 ; i<size ; i++) + for (size_t j=0 ; j<size ; j++) + for (size_t k=0 ; k<size ; k++) + C[(i * size) + j] += (A[(i * size) + k] * B[(k * size) + j]); + + return; +} + +void GPU_mat_mult(const MyData *const restrict A, + const MyData *const restrict B, + MyData *const restrict C, + const size_t size) +{ + #pragma omp target + { + #pragma omp teams distribute num_teams(size) + for (size_t i=0 ; i<size ; i++) + { + #pragma omp parallel for num_threads(size) + for (size_t j=0 ; j<size ; j++) + { + MyData value = (MyData)0; + for (size_t k=0 ; k<size ; k++) + value += (A[(i * size) + k] * B[(k * size) + j]); + + C[(i * size) + j] = value; + } // omp thread + } // omp teams + } // omp target + + return; +} + +void GPU_mat_mult_no_loops(const MyData *const restrict A, + const MyData *const restrict B, + MyData *const restrict C, + const size_t size) +{ + #pragma omp target + { + #pragma omp teams num_teams(size) + { + const size_t team_size = (size * omp_get_team_num()); + + #pragma omp parallel firstprivate(team_size) num_threads(size) + { + const size_t tid = omp_get_thread_num(); + MyData value = (MyData)0; + for (size_t k=0 ; k<size ; k++) + value += (A[team_size + k] * B[(k * size) + tid]); + + C[team_size + tid] = value; + } // omp threads + } // omp teams + } // omp target + + return; +} + +void check(const MyData *const __restrict__ cpu_matrix, + const MyData *const __restrict__ gpu_matrix) +{ + int flag; + for (size_t i=0 ; i<SIZE ; i++) + flag = ((cpu_matrix[i] != gpu_matrix[i]) ? 
1 : 0); + + if (!flag) + printf("\n\t Result OK"); + else + printf("\n\t Result wrong"); + + return; +} + +int main() +{ + double time; + MyData *buffer = (MyData *)calloc(4 * SIZE, sizeof(MyData)); + assert(buffer != NULL); + + // host reference matrix A + MyData *const restrict A = buffer; + MyData *const restrict B = A + SIZE; + MyData *const restrict C_CPU = B + SIZE; + MyData *const restrict C_GPU = C_CPU + SIZE; + for (size_t i=0 ; i<SIZE ; i++) + { + A[i] = drand48(); + B[i] = drand48(); + } + + ////////////////////////// CPU naive algorithm ////////////////////////////////////////// + CPU_mat_mult(A, B, C_CPU, N); + ///////////////////////////////////////////////////////////////////////////////////////// + + // copy/alloc data to the GPU + #pragma omp target enter data map(to: A[0:SIZE], B[0:SIZE]) map(alloc: C_GPU[0:SIZE]) + + /////////////////////////// GPU naive algorithm //////////////////////////////////////// + time = 0.0; + for (unsigned short int loop=0 ; loop<LOOP ; loop++) + { + const double start = wall_time(); + GPU_mat_mult(A, B, C_GPU, N); + time += (wall_time() - start); + } + + #pragma omp target update from(C_GPU[0:SIZE]) + check(C_CPU, C_GPU); + printf("\n\t GPU naive time %lg [s]\n", (time / LOOP)); + //////////////////////////////////////////////////////////////////////////////// + + /////////////////////////// GPU naive no loops algorithm //////////////////////////// + time = 0.0; + for (unsigned short int loop=0 ; loop<LOOP ; loop++) + { + const double start = wall_time(); + GPU_mat_mult_no_loops(A, B, C_GPU, N); + time += (wall_time() - start); + } + + #pragma omp target update from(C_GPU[0:SIZE]) + check(C_CPU, C_GPU); + printf("\n\t GPU naive no loops time %lg [s]\n", (time / LOOP)); + //////////////////////////////////////////////////////////////////////////////// + + // free CPU memory + free(buffer); + // free GPU memory + #pragma omp target exit data map(delete: A[0:SIZE], B[0:SIZE], C_CPU[0:SIZE]) + + printf("\n"); + + return 
EXIT_SUCCESS; +} diff --git a/cuda-omp/omp/miscellaneous/structure.c b/cuda-omp/omp/miscellaneous/structure.c index c12b212f98c7a4d84a3c80829dcd2c305b47295e..fc0041c02e55eb438685ab113ca2da41b7b658a0 100644 --- a/cuda-omp/omp/miscellaneous/structure.c +++ b/cuda-omp/omp/miscellaneous/structure.c @@ -17,7 +17,8 @@ #include <assert.h> #include <omp.h> -#define N 8 +#define SIZE 8 +#define SIZE_2 (SIZE / 2) typedef double MyData; typedef struct my_span @@ -29,26 +30,20 @@ typedef struct my_span void allocate( span *my_struct, const size_t size) { - span tmp; /* allocate the buffer on the host memory */ - tmp.ptr = (MyData *)malloc(size * sizeof(MyData)); - assert(tmp.ptr != NULL); - tmp.N = size; - - /* declare how the object 'span' has to be mapped on the device */ - #pragma omp declare mapper(span tmp) map(from: tmp, tmp.ptr[0: tmp.N]) - - my_struct->ptr = tmp.ptr; - my_struct->N = tmp.N; + my_struct->ptr = (MyData *)calloc(size, sizeof(MyData)); + assert(my_struct->ptr != NULL); + my_struct->N = size; return; } -void print(const span *const A) +void print(const span *const A, + const char *const string) { printf("\n"); - for (size_t i=0 ; i<A->N ; i++) - printf("\n\t array[%i] = %lg", i, A->ptr[i]); + for (int i=0 ; i<A->N ; i++) + printf("\n\t %s[%d] = %lg", string, i, A->ptr[i]); printf("\n\n"); return; @@ -56,27 +51,41 @@ void print(const span *const A) int main() { - span A, B; + span A, B, C; - allocate(&A, N); - allocate(&B, N); + allocate(&A, SIZE); + allocate(&B, SIZE); + allocate(&C, SIZE); + + /* declare how the object 'span' has to be mapped on the device */ + #pragma omp declare mapper(all : span S) map(to: S) map(from: S.ptr[0 : S.N]) + #pragma omp declare mapper(left : span S) map(to: S) map(from: S.ptr[0 : S.N/2]) + #pragma omp declare mapper(right: span S) map(to: S) map(from: S.ptr[S.N/2: S.N/2]) /* init on the GPU */ - #pragma omp target +#pragma omp target map(mapper(all): A) map(mapper(left): B) map(mapper(right): C) { - for (size_t i=0 ; i<N ; 
i++) - { - A.ptr[i] = (MyData)(i); - B.ptr[i] = (MyData)(2 * i); - } + #pragma omp loop + for (size_t i=0 ; i<SIZE ; i++) + A.ptr[i] = (MyData)(i); + + #pragma omp loop + for (size_t ii=0 ; ii<SIZE_2 ; ii++) + B.ptr[ii] = (MyData)(ii); + + #pragma omp loop + for (size_t iii=SIZE_2 ; iii<SIZE ; iii++) + C.ptr[iii] = (MyData)(iii); } - print(&A); - print(&B); + print(&A, "A"); + print(&B, "B"); + print(&C, "C"); /* free the host's memory */ free(A.ptr); free(B.ptr); + free(C.ptr); return 0; } diff --git a/jacobi/serial/not_opt/jacobi_serial_not_opt_len14 b/jacobi/serial/not_opt/jacobi_serial_not_opt_len14 new file mode 100755 index 0000000000000000000000000000000000000000..f546329f4a6d492f2c2a4495e5a65dbeb9ea452d Binary files /dev/null and b/jacobi/serial/not_opt/jacobi_serial_not_opt_len14 differ
// NOTE(review): the structure.c hunk above is a patch against a pre-existing
// file (context lines only partially visible), so it is left byte-identical.
// Review observations on its post-image:
// - print() now iterates with 'int i' but compares against 'A->N' (a size_t
//   field) -- harmless for SIZE 8, but a signed/unsigned comparison warning;
//   presumably 'size_t' was intended. TODO confirm.
// - The three named 'declare mapper' directives (all/left/right) are placed
//   inside main(); confirm the OpenMP version targeted by nvc accepts named
//   mappers declared at block scope before relying on this.
// - The 'right' mapper transfers only S.ptr[S.N/2 : S.N/2] (upper half) back
//   from the device; the lower half of C stays at its calloc'd zeros on the
//   host -- looks intentional, verify against the expected output.
// NOTE(review): the final entry commits a compiled executable
// (jacobi/serial/not_opt/jacobi_serial_not_opt_len14) into version control;
// build artifacts should be removed and .gitignore'd, not committed.