diff --git a/cuda-omp/cuda/7/mat_mult.cu b/cuda-omp/cuda/7/mat_mult.cu
new file mode 100644
index 0000000000000000000000000000000000000000..302f3e5bee56438b02cd37672bff62f641c5d247
--- /dev/null
+++ b/cuda-omp/cuda/7/mat_mult.cu
@@ -0,0 +1,176 @@
+////////////////////////////////////////////////////////////////////////////////////////////////
+// - Naive matrix multiplication algorithm
+//   for (size_t i=0 ; i<N ; i++)
+//      for (size_t j=0 ; j<N ; j++)
+//         for (size_t k=0 ; k<N ; k++)
+//            C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];
+//
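+// - GPU mapping used by GPU_mat_mult below (a sketch of the intent, not extra code):
+//   one CUDA thread per element of C, launched as a 1D grid of 1D blocks:
+//
+//     globalID = threadIdx.x + (blockIdx.x * blockDim.x);  // 0 .. N*N-1
+//     i        = globalID / N;                             // row of C
+//     j        = globalID % N;                             // column of C
+//     C[(i * N) + j] = sum over k of A[(i * N) + k] * B[(k * N) + j];
+//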
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 20.08.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//   $ nvc++ mat_mult.cu -o mat_mult
+// - Run the code:
+//   $ ./mat_mult
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <assert.h>
+#include <cuda.h>
+#include <string.h>
+#include <math.h>
+#include <float.h>
+
+#define N                     1024
+#define SIZE                  (N * N) // matrix size
+typedef double MyData;                // do not change
+#define BLOCKSIZE             1024
+
+// sanity check
+#if BLOCKSIZE > 1024
+#error BLOCKSIZE must be <= 1024
+#endif
+
+#define LOOP 100
+#define NDEBUG
+
+double wall_time()
+{
+  struct timespec ts;
+  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
+  const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;
+
+  return ret;
+}
+
+void CPU_mat_mult(const MyData *const restrict A,
+		  const MyData *const restrict B,
+	                MyData *const restrict C,
+		  const size_t                 size)
+{
+  for (size_t i=0 ; i<size ; i++)
+    for (size_t j=0 ; j<size ; j++)
+      for (size_t k=0 ; k<size ; k++)
+	C[(i * size) + j] += (A[(i * size) + k] * B[(k * size) + j]);
+
+  return;
+}
+
+__global__ void GPU_mat_mult(const MyData *const restrict A,
+			     const MyData *const restrict B,
+			           MyData *const restrict C,
+			     const size_t                 size)
+{
+  const size_t globalID = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (globalID >= (size * size))
+    return;
+  
+  const size_t i = globalID / size;
+  const size_t j = globalID % size;
+
+  MyData value = (MyData)0;
+  for (size_t k=0 ; k<size ; k++)
+    value += A[(i * size) + k] * B[(k * size) + j];
+
+  C[(i * size) + j] = value;
+  
+  return;
+}
+
+void check(const MyData *const __restrict__ cpu_matrix,
+	   const MyData *const __restrict__ gpu_matrix)
+{
+  int flag = 0;
+  for (size_t i=0 ; i<SIZE ; i++)
+    flag = ((fabs(cpu_matrix[i] - gpu_matrix[i]) > FLT_EPSILON) ? 1 : flag);
+
+  if (!flag)
+    printf("\n\t Result OK");
+  else
+    printf("\n\t Result wrong");
+
+#if !defined(NDEBUG)
+
+  for (size_t i=0 ; i<N ; i++)
+    {
+      printf("\n");
+      for (size_t j=0 ; j<N ; j++)
+	{
+	  const size_t index = ((i * N) + j);
+	  printf("\n\t cpu_matrix[%zu] = %lg - gpu_matrix[%zu] = %lg - diff = %lg",
+		 index, cpu_matrix[index], index, gpu_matrix[index], fabs(cpu_matrix[index] - gpu_matrix[index]));
+	}
+    }
+  printf("\n");
+
+#endif // NDEBUG  
+  
+  return;
+}
+
+int main()
+{
+  double time;
+  MyData *buffer_cpu = (MyData *)calloc(4 * SIZE, sizeof(MyData));
+  assert(buffer_cpu != NULL);
+
+  // host buffers: A, B, reference C (CPU result) and a copy of the GPU result
+  MyData *const restrict A_CPU = buffer_cpu;
+  MyData *const restrict B_CPU = A_CPU + SIZE;
+  MyData *const restrict C_CPU = B_CPU + SIZE;
+  MyData *const restrict C_GPU_host = C_CPU + SIZE;
+  for (size_t i=0 ; i<SIZE ; i++)
+    {
+      A_CPU[i] = drand48();
+      B_CPU[i] = drand48();
+    }
+
+  ////////////////////////// CPU naive algorithm //////////////////////////////////////////
+  CPU_mat_mult(A_CPU, B_CPU, C_CPU, N);
+  /////////////////////////////////////////////////////////////////////////////////////////
+
+  // copy/alloc data to the GPU
+  MyData *buffer_gpu = NULL;
+  cudaMalloc((void **)&buffer_gpu, (3 * SIZE * sizeof(MyData)));
+  MyData *const restrict A_GPU = buffer_gpu;
+  MyData *const restrict B_GPU = A_GPU + SIZE;
+  MyData *const restrict C_GPU = B_GPU + SIZE;
+  cudaMemcpy(A_GPU, A_CPU, (2 * SIZE * sizeof(MyData)), cudaMemcpyHostToDevice);
+
+  const dim3 block   = {BLOCKSIZE, 1, 1};
+  const dim3 nblocks = {(SIZE + BLOCKSIZE  -1)/BLOCKSIZE, 1, 1};
+  
+  /////////////////////////// GPU naive algorithm ////////////////////////////////////////
+  time = 0.0;
+  GPU_mat_mult<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up
+  cudaDeviceSynchronize();
+  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
+    {
+      const double start = wall_time();
+      GPU_mat_mult<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N);
+      cudaDeviceSynchronize();
+      time += (wall_time() - start);
+    }
+  cudaMemcpy(C_GPU_host, C_GPU, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);
+  
+  check(C_CPU, C_GPU_host);
+  printf("\n\t GPU naive time %lg [s]\n", (time / LOOP));
+  /////////////////////////////////////////////////////////////////////////////////////
+
+  // free CPU memory
+  free(buffer_cpu);
+  // free GPU memory
+  cudaFree(buffer_gpu);
+  
+  printf("\n");
+
+  return EXIT_SUCCESS;
+}
diff --git a/cuda-omp/cuda/7/mat_mult_block.cu b/cuda-omp/cuda/7/mat_mult_block.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7808f021a87df0329a4784f72fd1966aa133e737
--- /dev/null
+++ b/cuda-omp/cuda/7/mat_mult_block.cu
@@ -0,0 +1,301 @@
+////////////////////////////////////////////////////////////////////////////////////////////////
+// - Block matrix multiplication algorithm
+//
+//   const size_t Nblocks = (N / Bsize);
+//
+//   // loop over blocks of matrix C
+//   for (size_t ib=0 ; ib<Nblocks ; ib++)
+//   {
+//      for (size_t jb=0 ; jb<Nblocks ; jb++)
+//      {
+//        
+//         // loop over blocks of rows of A       
+//         for (size_t kb=0 ; kb<Nblocks ; kb++)
+//         {
+//
+//           // Matrix multiplication within a block
+//           for (size_t i=(ib * Bsize) ; i<((ib + 1) * Bsize) ; i++)
+//             for (size_t j=(jb * Bsize) ; j<((jb + 1) * Bsize) ; j++)
+//               for (size_t k=(kb * Bsize) ; k<((kb + 1) * Bsize) ; k++)
+//                  C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];
+//
+//         } // kb
+//      } // jb
+//   } // ib
+//
+// - Exploit GPU shared-memory.
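+//   A sketch of the mapping used by the kernels below (the exact index arithmetic is in the code):
+//
+//     - each CUDA block of BLOCK*BLOCK threads owns one BLOCK x BLOCK tile of C (ib, jb);
+//     - the kernel walks the Nblocks tiles along row-block ib of A and column-block jb of B;
+//     - in the shared-memory version each pair of tiles is first staged into __shared__
+//       arrays (with __syncthreads() around the copy), and every thread accumulates its
+//       own C element from the staged tiles.
+//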
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 22.08.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//   $ nvc++ mat_mult_block.cu -o mat_mult_block
+// - Run the code:
+//   $ ./mat_mult_block
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <assert.h>
+#include <cuda.h>
+#include <string.h>
+#include <math.h>
+#include <float.h>
+
+#define N                    1024
+#define SIZE                 (N * N) // matrix size
+typedef double MyData;               // do not change
+#define BLOCK                16
+
+// sanity check
+#if BLOCK * BLOCK > 1024
+#error BLOCK * BLOCK must be <= 1024
+#endif
+
+#if BLOCK * BLOCK > SIZE
+#error BLOCK * BLOCK must be <= SIZE
+#endif
+
+#define LOOP 100
+#define NDEBUG
+
+double wall_time()
+{
+  struct timespec ts;
+  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
+  const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;
+
+  return ret;
+}
+
+void CPU_mat_mult(const MyData *const restrict A,
+		  const MyData *const restrict B,
+	                MyData *const restrict C,
+		  const size_t                 size)
+{
+  for (size_t i=0 ; i<size ; i++)
+    for (size_t j=0 ; j<size ; j++)
+      for (size_t k=0 ; k<size ; k++)
+	C[(i * size) + j] += (A[(i * size) + k] * B[(k * size) + j]);
+
+  return;
+}
+
+void CPU_mat_mult_block(const MyData *const restrict A,
+			const MyData *const restrict B,
+	                      MyData *const restrict C,
+			const size_t                 size)
+{
+  const size_t Nblocks = (size / BLOCK);
+
+  // loop over blocks of matrix C
+  for (size_t ib=0 ; ib<Nblocks ; ib++)
+    {
+      for (size_t jb=0 ; jb<Nblocks ; jb++)
+	{       
+	  // loop over blocks of rows of A       
+	  for (size_t kb=0 ; kb<Nblocks ; kb++)
+	    {
+	      // Matrix multiplication within a block
+	      for (size_t i=(ib * BLOCK) ; i<((ib + 1) * BLOCK) ; i++)
+		for (size_t j=(jb * BLOCK) ; j<((jb + 1) * BLOCK) ; j++)
+		  for (size_t k=(kb * BLOCK) ; k<((kb + 1) * BLOCK) ; k++)
+		    C[(i * size) + j] += A[(i * size) + k] * B[(k * size) + j];
+	    } // kb
+	} // jb
+    } // ib
+  
+  return;
+}
+
+__global__ void GPU_mat_mult_block(const MyData *const restrict A,
+				   const MyData *const restrict B,
+			                 MyData *const restrict C,
+				   const size_t                 size)
+{
+  const size_t size2 = (size * size);
+  
+  const size_t globalID = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (globalID >= size2)
+    return;
+  
+  const size_t Nblocks  = size / BLOCK;           // number of blocks to loop over A and B
+  const size_t ib       = blockIdx.x / Nblocks;   // block-row and block-column of the C tile
+  const size_t jb       = blockIdx.x % Nblocks;   // assigned to this CUDA block
+  const size_t i_local  = threadIdx.x / BLOCK;    // local indices of this thread
+  const size_t j_local  = threadIdx.x % BLOCK;    // within its own tile
+  const size_t i_global = i_local + (ib * BLOCK); // global indices of the C element computed by this thread
+  const size_t j_global = j_local + (jb * BLOCK); // N.B. uncoalesced memory accesses to A and B matrices
+  
+  C[(i_global * size) + j_global] = (MyData)0;
+
+  // loop over blocks
+  for (size_t block=0 ; block<Nblocks ; block++)
+    {
+      const size_t j_A = (block * BLOCK);
+      const size_t i_B = (block * BLOCK);
+
+      // perform the matrix multiplication within the block
+      MyData value = (MyData)0;
+      for (size_t k=0 ; k<BLOCK ; k++)
+	value += A[(i_global * size) + k + j_A] * B[((k + i_B) * size) + j_global];
+
+      C[(i_global * size) + j_global] += value;
+    }
+
+  return;
+}
+
+__global__ void GPU_mat_mult_block_shared(const MyData *const restrict A,
+					  const MyData *const restrict B,
+					        MyData *const restrict C,
+					  const size_t                 size)
+{
+  __shared__ MyData Ablock[BLOCK * BLOCK];
+  __shared__ MyData Bblock[BLOCK * BLOCK];
+  __shared__ MyData Cblock[BLOCK * BLOCK];
+  
+  const size_t globalID = threadIdx.x + (blockIdx.x * blockDim.x);
+  const size_t size2 = (size * size);
+  if (globalID >= size2)
+    return;
+  
+  const size_t Nblocks  = size / BLOCK;           // number of blocks to loop over
+  const size_t ib       = blockIdx.x / Nblocks;   // block-row and block-column of the C tile
+  const size_t jb       = blockIdx.x % Nblocks;   // assigned to this CUDA block
+  const size_t i_local  = threadIdx.x / BLOCK;    // local indices of this thread
+  const size_t j_local  = threadIdx.x % BLOCK;    // within its own tile
+  const size_t i_global = i_local + (ib * BLOCK); // global indices of the C element computed by this thread
+  const size_t j_global = j_local + (jb * BLOCK); // N.B. uncoalesced memory accesses to A and B matrices
+
+  // Init Cblock
+  Cblock[(i_local * BLOCK) + j_local] = (MyData)0;
+
+  // loop over blocks
+  for (size_t block=0 ; block<Nblocks ; block++)
+    {
+      const size_t j_A = (block * BLOCK);
+      const size_t i_B = (block * BLOCK);
+
+      // waits until all threads in the thread block have reached this point and shared memory accesses
+      // made by these threads prior to __syncthreads() are visible to all threads in the block.
+      __syncthreads();
+      
+      // copy block of A into shared memory
+      Ablock[(i_local * BLOCK) + j_local] = A[(i_global * size) + j_local + j_A];
+      // copy block of B into shared memory
+      Bblock[(i_local * BLOCK) + j_local] = B[((i_local + i_B) * size) + j_global];
+      
+      // waits until all threads in the thread block have reached this point and shared memory accesses
+      // made by these threads prior to __syncthreads() are visible to all threads in the block.
+      __syncthreads();
+
+      // perform the matrix multiplication within the block
+      MyData value = (MyData)0;
+      for (size_t k=0 ; k<BLOCK ; k++)
+	value += Ablock[(i_local * BLOCK) + k] * Bblock[(k * BLOCK) + j_local];
+
+      // store the partial result in shared memory
+      Cblock[(i_local * BLOCK) + j_local] += value;
+    }
+
+  C[(i_global * size) + j_global] = Cblock[(i_local * BLOCK) + j_local];
+  
+  return;
+}
+
+void check(const MyData *const restrict cpu_matrix,
+	   const MyData *const restrict gpu_matrix)
+{
+  int flag = 0;
+  for (size_t i=0 ; i<SIZE ; i++)
+    flag = ((fabs(cpu_matrix[i] - gpu_matrix[i]) > FLT_EPSILON) ? 1 : flag);
+
+  if (!flag)
+    printf("\n\t Result OK");
+  else
+    printf("\n\t Result wrong");
+  
+  return;
+}
+
+int main()
+{
+  double time;
+  MyData *buffer_cpu = (MyData *)calloc(4 * SIZE, sizeof(MyData));
+  assert(buffer_cpu != NULL);
+
+  // host buffers: A, B, reference C (CPU result) and a copy of the GPU result
+  MyData *const restrict A_CPU = buffer_cpu;
+  MyData *const restrict B_CPU = A_CPU + SIZE;
+  MyData *const restrict C_CPU = B_CPU + SIZE;
+  MyData *const restrict C_GPU_host = C_CPU + SIZE;
+  for (size_t i=0 ; i<SIZE ; i++)
+    {
+      A_CPU[i] = drand48();
+      B_CPU[i] = drand48();
+    }
+
+  ////////////////////////// CPU block algorithm //////////////////////////////////////////
+  CPU_mat_mult_block(A_CPU, B_CPU, C_CPU, N);
+  /////////////////////////////////////////////////////////////////////////////////////////
+
+  // copy/alloc data to the GPU
+  MyData *buffer_gpu = NULL;
+  cudaMalloc((void **)&buffer_gpu, (3 * SIZE * sizeof(MyData)));
+  MyData *const A_GPU = buffer_gpu;
+  MyData *const B_GPU = A_GPU + SIZE;
+  MyData *const C_GPU = B_GPU + SIZE;
+  cudaMemcpy(A_GPU, A_CPU, (2 * SIZE * sizeof(MyData)), cudaMemcpyHostToDevice);
+
+  const dim3 nblocks = {(SIZE + (BLOCK * BLOCK)  -1)/(BLOCK * BLOCK), 1, 1};
+  const dim3 block   = {(BLOCK * BLOCK), 1, 1};
+  
+  /////////////////////////// GPU naive block algorithm ////////////////////////////////////////
+  time = 0.0;
+  GPU_mat_mult_block<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up
+  cudaDeviceSynchronize();
+  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
+    {
+      const double start = wall_time();
+      GPU_mat_mult_block<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N);
+      cudaDeviceSynchronize();
+      time += (wall_time() - start);
+    }
+  cudaMemcpy(C_GPU_host, C_GPU, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);
+  
+  check(C_CPU, C_GPU_host);
+  printf("\n\t GPU naive block time %lg [s]\n", (time / LOOP));
+  /////////////////////////////////////////////////////////////////////////////////////
+
+  /////////////////////////// GPU block shared memory algorithm ///////////////////////
+  time = 0.0;
+  GPU_mat_mult_block_shared<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up
+  cudaDeviceSynchronize();
+  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
+    {
+      const double start = wall_time();
+      GPU_mat_mult_block_shared<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N);
+      cudaDeviceSynchronize();
+      time += (wall_time() - start);
+    }
+  cudaMemcpy(C_GPU_host, C_GPU, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);
+  
+  check(C_CPU, C_GPU_host);
+  printf("\n\t GPU block shared memory time %lg [s]\n", (time / LOOP));
+  /////////////////////////////////////////////////////////////////////////////////////
+
+  // free CPU memory
+  free(buffer_cpu);
+  // free GPU memory
+  cudaFree(buffer_gpu);
+  
+  printf("\n");
+
+  return EXIT_SUCCESS;
+}
diff --git a/cuda-omp/omp/6/classwork_omp b/cuda-omp/omp/6/classwork_omp
deleted file mode 100755
index 6766c7bdf174daf8fcd104c4bc67d02ba4885384..0000000000000000000000000000000000000000
Binary files a/cuda-omp/omp/6/classwork_omp and /dev/null differ
diff --git a/cuda-omp/omp/7/mat_mult.c b/cuda-omp/omp/7/mat_mult.c
index 8ad7fcd4c5ca812149749ffdfe12cbb98ab7ab32..789e7f2a73299e8fa3fcc7039efdd033a0da999e 100644
--- a/cuda-omp/omp/7/mat_mult.c
+++ b/cuda-omp/omp/7/mat_mult.c
@@ -14,9 +14,10 @@
 // code tested using nvhpc
 //
 // - Compile the code:
-//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork.c -o classwork_omp
+//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v mat_mult.c -o mat_mult_omp
 // - Run the code:
-//   $ ./classwork_omp
+//   $ export OMP_TARGET_OFFLOAD=MANDATORY
+//   $ ./mat_mult_omp
 //////////////////////////////////////////////////////////////////////////////////////////////////
 
 #include <stdio.h>
@@ -26,8 +27,10 @@
 #include <assert.h>
 #include <omp.h>
 #include <string.h>
+#include <math.h>
+#include <float.h>
 
-#define N                     512
+#define N                     1024
 #define SIZE                  (N * N) // matrix size
 typedef double MyData;                // do not change
 
@@ -61,22 +64,30 @@ void GPU_mat_mult(const MyData *const restrict A,
 	                MyData *const restrict C,
 		  const size_t                 size)
 {
-  #pragma omp target
-  {
-    #pragma omp teams distribute num_teams(size)
-    for (size_t i=0 ; i<size ; i++)
-      {
-        #pragma omp parallel for num_threads(size)
-	for (size_t j=0 ; j<size ; j++)
-	  {
-	    MyData value = (MyData)0;
-	    for (size_t k=0 ; k<size ; k++)
-	      value += (A[(i * size) + k] * B[(k * size) + j]);
-
-	    C[(i * size) + j] = value;
-	  } // omp thread
-      } // omp teams
-   } // omp target
+  #pragma omp target teams distribute num_teams(size)
+  for (size_t i=0 ; i<size ; i++)
+    {
+      #pragma omp parallel for firstprivate(i) num_threads(size)
+      for (size_t j=0 ; j<size ; j++)
+	{
+#if !defined(NDEBUG)
+
+	  const int num_teams   = omp_get_num_teams();
+	  const int num_threads = omp_get_num_threads();
+
+	  if ((omp_get_team_num() == 0) && (omp_get_thread_num() == 0))
+	    printf("\n\t Kernel GPU_mat_mult: teams: %d - threads: %d\n",
+		   num_teams, num_threads);
+
+#endif // NDEBUG
+	  
+	  MyData value = (MyData)0;
+	  for (size_t k=0 ; k<size ; k++)
+	    value += (A[(i * size) + k] * B[(k * size) + j]);
+
+	  C[(i * size) + j] = value;
+	} // omp threads
+    } // omp target teams
 
   return;
 }
@@ -86,23 +97,31 @@ void GPU_mat_mult_no_loops(const MyData *const restrict A,
 			         MyData *const restrict C,
 			   const size_t                 size)
 {
- #pragma omp target
+ #pragma omp target teams num_teams(size)
   {
-   #pragma omp teams num_teams(size)
+    const size_t team_size = (size * omp_get_team_num());
+      
+   #pragma omp parallel firstprivate(team_size) num_threads(size)
     {
-      const size_t team_size = (size * omp_get_team_num());
+#if !defined(NDEBUG)
+
+      const int num_teams   = omp_get_num_teams();
+      const int num_threads = omp_get_num_threads();
       
-     #pragma omp parallel firstprivate(team_size) num_threads(size)
-      {
-         const size_t tid = omp_get_thread_num();
-	 MyData value = (MyData)0;
-	 for (size_t k=0 ; k<size ; k++)
-	   value += (A[team_size + k] * B[(k * size) + tid]);
-
-	 C[team_size + tid] = value;
-      } // omp threads      
-    } // omp teams
-  } // omp target
+      if ((omp_get_team_num() == 0) && (omp_get_thread_num() == 0))
+	printf("\n\t Kernel GPU_mat_mult_no_loops: teams: %d - threads: %d\n",
+	       num_teams, num_threads);
+
+#endif // NDEBUG
+      
+      const size_t tid = omp_get_thread_num();
+      MyData value = (MyData)0;
+      for (size_t k=0 ; k<size ; k++)
+	value += (A[team_size + k] * B[(k * size) + tid]);
+
+      C[team_size + tid] = value;
+    } // omp threads      
+  } // omp target teams
 
   return;
 }
@@ -110,14 +129,30 @@ void GPU_mat_mult_no_loops(const MyData *const restrict A,
 void check(const MyData *const __restrict__ cpu_matrix,
 	   const MyData *const __restrict__ gpu_matrix)
 {
-  int flag;
+  int flag = 0;
   for (size_t i=0 ; i<SIZE ; i++)
-    flag = ((cpu_matrix[i] != gpu_matrix[i]) ? 1 : 0);
+    flag = ((fabs(cpu_matrix[i] - gpu_matrix[i]) > FLT_EPSILON) ? 1 : flag);
 
   if (!flag)
     printf("\n\t Result OK");
   else
     printf("\n\t Result wrong");
+
+#if !defined(NDEBUG)
+
+  for (size_t i=0 ; i<N ; i++)
+    {
+      printf("\n");
+      for (size_t j=0 ; j<N ; j++)
+	{
+	  const size_t index = ((i * N) + j);
+		 printf("\n\t cpu_matrix[%zu] = %lg - gpu_matrix[%zu] = %lg - diff = %lg",
+		 index, cpu_matrix[index], index, gpu_matrix[index], fabs(cpu_matrix[index] - gpu_matrix[index]));
+	}
+    }
+  printf("\n");
+
+#endif // NDEBUG  
   
   return;
 }
@@ -148,6 +183,7 @@ int main()
 
   /////////////////////////// GPU naive algorithm ////////////////////////////////////////
   time = 0.0;
+  GPU_mat_mult(A, B, C_GPU, N); // warm-up
   for (unsigned short int loop=0 ; loop<LOOP ; loop++)
     {
       const double start = wall_time();
@@ -162,6 +198,7 @@ int main()
 
   /////////////////////////// GPU naive no loops algorithm ////////////////////////////
   time = 0.0;
+  GPU_mat_mult_no_loops(A, B, C_GPU, N); // warm-up
   for (unsigned short int loop=0 ; loop<LOOP ; loop++)
     {
       const double start = wall_time();
diff --git a/cuda-omp/omp/7/mat_mult_block.c b/cuda-omp/omp/7/mat_mult_block.c
index cd84ad1ee00f8f85afc523f058ceb3b6f29cb922..9c3d81e7352e7db3c5a330eb66a98934e1252824 100644
--- a/cuda-omp/omp/7/mat_mult_block.c
+++ b/cuda-omp/omp/7/mat_mult_block.c
@@ -11,29 +11,32 @@
 //        
 //         // loop over blocks of rows of A       
 //         for (size_t kb=0 ; kb<Nblocks ; kb++)
+//         {
 //
-//
-//
-//         for (size_t i=0 ; i<N ; i++)
-//            for (size_t j=0 ; j<N ; j++)
-//               for (size_t k=0 ; k<_N ; k++)
+//           // Matrix multiplication within a block
+//           for (size_t i=(ib * Bsize) ; i<((ib + 1) * Bsize) ; i++)
+//             for (size_t j=(jb * Bsize) ; j<((jb + 1) * Bsize) ; j++)
+//               for (size_t k=(kb * Bsize) ; k<((kb + 1) * Bsize) ; k++)
 //                  C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];
 //
+//         } // kb
 //      } // jb
 //   } // ib
-// - Exploit shared-memory.
+//
+// - Exploit GPU shared-memory.
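+//
+// - Offload mapping used below (a sketch; see GPU_mat_mult_block_shared_no_loops):
+//     one OpenMP team per _BLOCK_ x _BLOCK_ tile of C   -> num_teams(Nblocks * Nblocks)
+//     one thread per element of the tile                -> num_threads(_BLOCK_ * _BLOCK_)
+//     Ablock/Bblock/Cblock are team-private tiles, and "#pragma omp barrier" plays the
+//     role of CUDA's __syncthreads() between the copy and compute phases.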
 ////////////////////////////////////////////////////////////////////////////////////////////////
 
 //////////////////////////////////////////////////////////////////////////////////////////////////
 // Author: David Goz
 // mail  : david.goz@inaf.it
-// date  : 31.07.2024
+// date  : 20.08.2024
 // code tested using nvhpc
 //
 // - Compile the code:
-//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork.c -o classwork_omp
+//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v mat_mult_block.c -o mat_mult_block_omp
 // - Run the code:
-//   $ ./classwork_omp
+//   $ export OMP_TARGET_OFFLOAD=MANDATORY
+//   $ ./mat_mult_block_omp
 //////////////////////////////////////////////////////////////////////////////////////////////////
 
 #include <stdio.h>
@@ -43,15 +46,18 @@
 #include <assert.h>
 #include <omp.h>
 #include <string.h>
+#include <math.h>
+#include <float.h>
 
-#define N                     512
-#define SIZE                  (N * N) // matrix size
+#define N                   1024
+#define SIZE                (N * N) // matrix size
 typedef double MyData;                // do not change
-#define BLOCKSIZE             32      // number of threads per block
+#define _BLOCK_             16      // tile edge: each team works on a _BLOCK_ x _BLOCK_ tile of C
+#define BLOCKSIZE           ((_BLOCK_) * (_BLOCK_))  // number of threads per team
 
 // sanity check
-#if BLOCKSIZE > 1024
-#error BLOCKSIZE cannot be larger than 1024
+#if _BLOCK_*_BLOCK_ > 1024
+#error _BLOCK_*_BLOCK_ cannot be larger than 1024
 #endif
 
 #define LOOP 100
@@ -66,81 +72,244 @@ double wall_time()
   return ret;
 }
 
-void CPU_mat_mult(const MyData *const restrict A,
-		  const MyData *const restrict B,
-	                MyData *const restrict C,
-		  const size_t                 size)
+void CPU_mat_mult_block(const MyData *const restrict A,
+			const MyData *const restrict B,
+	                      MyData *const restrict C,
+			const size_t                 size)
 {
-  for (size_t i=0 ; i<size ; i++)
-    for (size_t j=0 ; j<size ; j++)
-      for (size_t k=0 ; k<size ; k++)
-	C[(i * size) + j] += (A[(i * size) + k] * B[(k * size) + j]);
+  const size_t Nblocks = (size / _BLOCK_);
 
+  // loop over blocks of matrix C
+  for (size_t ib=0 ; ib<Nblocks ; ib++)
+    {
+      for (size_t jb=0 ; jb<Nblocks ; jb++)
+	{       
+	  // loop over blocks of rows of A       
+	  for (size_t kb=0 ; kb<Nblocks ; kb++)
+	    {
+	      // Matrix multiplication within a block
+	      for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++)
+		for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++)
+		  for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
+		    C[(i * size) + j] += A[(i * size) + k] * B[(k * size) + j];
+	    } // kb
+	} // jb
+    } // ib
+  
   return;
 }
 
-void GPU_mat_mult(const MyData *const restrict A,
-		  const MyData *const restrict B,
-	                MyData *const restrict C,
-		  const size_t                 size)
+void GPU_mat_mult_block(const MyData *const restrict A,
+			const MyData *const restrict B,
+	                      MyData *const restrict C,
+			const size_t                 size)
 {
-  #pragma omp target
-  {
-    #pragma omp teams distribute num_teams(size)
-    for (size_t i=0 ; i<size ; i++)
-      {
-        #pragma omp parallel for num_threads(size)
-	for (size_t j=0 ; j<size ; j++)
-	  {
-	    MyData value = (MyData)0;
-	    for (size_t k=0 ; k<size ; k++)
-	      value += (A[(i * size) + k] * B[(k * size) + j]);
-
-	    C[(i * size) + j] = value;
-	  } // omp thread
-      } // omp teams
-   } // omp target
+  const size_t Nblocks = (size / _BLOCK_);
+
+  #pragma omp target teams distribute collapse(2) num_teams(Nblocks * Nblocks) thread_limit(BLOCKSIZE)
+  for (size_t ib=0 ; ib<Nblocks ; ib++)
+    {
+      for (size_t jb=0 ; jb<Nblocks ; jb++)
+	{
+	  for (size_t kb=0 ; kb<Nblocks ; kb++)
+	    {
+              #pragma omp parallel for collapse(2) num_threads(BLOCKSIZE)
+	      for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++)
+		{
+		  for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++)
+		    {
+#if !defined(NDEBUG)
+
+		      const int num_teams   = omp_get_num_teams();
+		      const int num_threads = omp_get_num_threads();
+		      
+		      if ((omp_get_team_num() == 0) && (omp_get_thread_num() == 0) && (kb == 0))
+			printf("\n\t Kernel GPU_mat_mult_block: teams: %d - threads: %d\n",
+			       num_teams, num_threads);
+
+#endif // NDEBUG
+   
+		      MyData value = ((kb == 0) ? (MyData)0 : C[(i * size) + j]);
+		      for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
+			value += A[(i * size) + k] * B[(k * size) + j];
+
+		      C[(i * size) + j] = value;
+		    } // j
+		} // i
+	    } // kb
+	} // jb
+    } // ib
+
+  return;
+}
+
+void GPU_mat_mult_block_shared(const MyData *const restrict A,
+			       const MyData *const restrict B,
+			             MyData *const restrict C,
+			       const size_t                 size)
+{
+  const size_t Nblocks = (size / _BLOCK_);
+  // Team local copies of tiles
+  MyData Ablock[BLOCKSIZE];
+  MyData Bblock[BLOCKSIZE];  
+  MyData Cblock[BLOCKSIZE];
+  
+  //  #pragma omp target uses_allocators(omp_pteam_mem_alloc) allocate(omp_pteam_mem_alloc: Ablock, Bblock)
+  #pragma omp target teams distribute collapse(2) num_teams(Nblocks * Nblocks)	\
+                     private(Ablock, Bblock, Cblock)				\
+                     thread_limit(BLOCKSIZE)
+  for (size_t ib=0 ; ib<Nblocks ; ib++)
+    {
+      for (size_t jb=0 ; jb<Nblocks ; jb++)
+	{ 
+	  for (size_t kb=0 ; kb<Nblocks ; kb++)
+	    {
+              #pragma omp parallel num_threads(BLOCKSIZE)
+	      {
+#if !defined(NDEBUG)
+
+		const int num_teams   = omp_get_num_teams();
+		const int num_threads = omp_get_num_threads();
+		
+		if ((omp_get_team_num() == 0) && (omp_get_thread_num() == 0) && (kb == 0))
+		  printf("\n\t Kernel GPU_mat_mult_block_shared: teams: %d - threads: %d\n",
+			 num_teams, num_threads);
+
+#endif // NDEBUG
+
+		if (kb == 0) // Cblock initialization
+		  {
+                    #pragma omp for collapse(2) nowait
+		    for (size_t i=0 ; i<_BLOCK_ ; i++)
+		      for (size_t j=0 ; j<_BLOCK_ ; j++)
+			Cblock[(i * _BLOCK_) + j] = (MyData)0;
+		  }
+		
+		// copy block of A into pteam memory Ablock
+                #pragma omp for collapse(2) nowait // implicit barrier is removed 	
+		for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++)
+		  for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
+		    Ablock[(i % _BLOCK_) * _BLOCK_ + (k % _BLOCK_)] = A[(i * size) + k];
+
+		// copy block of B into pteam memory Bblock
+                #pragma omp for collapse(2) // implicit barrier to ensure that shared Ablock and Bblock can be accessed by all threads within the block
+		for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++)
+		  for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
+		    Bblock[(k % _BLOCK_) * _BLOCK_ + (j % _BLOCK_)] = B[(k * size) + j];
+
+		// matrix multiply within the block
+                #pragma omp for collapse(2) nowait
+		for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++)
+		  for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++)
+		    {
+		      MyData value = (MyData)0;
+		      for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
+			value += Ablock[(i % _BLOCK_) * _BLOCK_ + (k % _BLOCK_)] *
+			         Bblock[(k % _BLOCK_) * _BLOCK_ + (j % _BLOCK_)];
+
+		      Cblock[((i % _BLOCK_) * _BLOCK_) + (j % _BLOCK_)] += value;
+
+		      if (kb == (Nblocks - 1))
+			C[(i * size) + j] = Cblock[((i % _BLOCK_) * _BLOCK_) + (j % _BLOCK_)];
+		    }
+	      } // omp parallel
+	    } // kb
+	} // jb
+    } // ib
 
   return;
 }
 
-void GPU_mat_mult_no_loops(const MyData *const restrict A,
-			   const MyData *const restrict B,
-			         MyData *const restrict C,
-			   const size_t                 size)
+void GPU_mat_mult_block_shared_no_loops(const MyData *const restrict A,
+					const MyData *const restrict B,
+					      MyData *const restrict C,
+					const size_t                 size)
 {
- #pragma omp target
+  const size_t Nblocks = (size / _BLOCK_);
+  // Team local copies of tiles
+  MyData Ablock[BLOCKSIZE];
+  MyData Bblock[BLOCKSIZE];  
+  MyData Cblock[BLOCKSIZE];
+  
+  #pragma omp target teams num_teams(Nblocks * Nblocks)	   \
+                           private(Ablock, Bblock, Cblock) \
+                           thread_limit(BLOCKSIZE)
   {
-   #pragma omp teams num_teams(size)
+    const size_t ib = omp_get_team_num() / Nblocks;
+    const size_t jb = omp_get_team_num() % Nblocks;
+
+    #pragma omp parallel num_threads(BLOCKSIZE)
     {
-      const size_t team_size = (size * omp_get_team_num());
+      // local tile indices of this OpenMP thread within its own team
+      const size_t i_local = omp_get_thread_num() / _BLOCK_;
+      const size_t j_local = omp_get_thread_num() % _BLOCK_;
+
+      // global matrix indices of the C element computed by this thread
+      const size_t i_global = i_local + (ib * _BLOCK_);
+      const size_t j_global = j_local + (jb * _BLOCK_);
       
-     #pragma omp parallel firstprivate(team_size) num_threads(size)
-      {
-         const size_t tid = omp_get_thread_num();
-	 MyData value = (MyData)0;
-	 for (size_t k=0 ; k<size ; k++)
-	   value += (A[team_size + k] * B[(k * size) + tid]);
-
-	 C[team_size + tid] = value;
-      } // omp threads      
-    } // omp teams
-  } // omp target
+      // Cblock initialization
+      Cblock[(i_local * _BLOCK_) + j_local] = (MyData)0;
+
+      // loop over blocks
+      for (size_t block=0 ; block<Nblocks ; block++)
+	{
+	  const size_t j_A = j_local + (block * _BLOCK_);
+	  const size_t i_B = i_local + (block * _BLOCK_);
+
+	  // wait until all threads in the team reach this point, so the previous iteration's Ablock/Bblock are no longer in use
+	  #pragma omp barrier
+
+	  Ablock[(i_local * _BLOCK_) + j_local] = A[(i_global * size) + j_A];
+	  Bblock[(i_local * _BLOCK_) + j_local] = B[(i_B * size)      + j_global];
+
+	  // wait until all threads in the team have filled Ablock and Bblock before they are read
+	  #pragma omp barrier
 
+	  // perform the matrix multiplication within the block
+	  MyData value = (MyData)0;
+	  for (size_t k=0 ; k<_BLOCK_ ; k++)
+	    value += Ablock[(i_local * _BLOCK_) + k] * Bblock[(k * _BLOCK_) + j_local];
+
+	  // store the partial result in shared memory
+	  Cblock[(i_local * _BLOCK_) + j_local] += value;
+	} // block loop
+
+      // store the result into global memory
+      C[(i_global * size) + j_global] = Cblock[(i_local * _BLOCK_) + j_local];
+    } // omp parallel
+  } // target teams
+	  
   return;
 }
 
 void check(const MyData *const __restrict__ cpu_matrix,
 	   const MyData *const __restrict__ gpu_matrix)
 {
-  int flag;
+  int flag = 0;
   for (size_t i=0 ; i<SIZE ; i++)
-    flag = ((cpu_matrix[i] != gpu_matrix[i]) ? 1 : 0);
+    flag = ((fabs(cpu_matrix[i] - gpu_matrix[i]) > FLT_EPSILON) ? 1 : flag);
 
   if (!flag)
     printf("\n\t Result OK");
   else
     printf("\n\t Result wrong");
+
+#if !defined(NDEBUG)
+
+  for (size_t i=0 ; i<N ; i++)
+    {
+      printf("\n");
+      for (size_t j=0 ; j<N ; j++)
+	{
+	  const size_t index = ((i * N) + j);
+	  printf("\n\t cpu_matrix[%zu] = %lg - gpu_matrix[%zu] = %lg - diff = %lg",
+		 index, cpu_matrix[index], index, gpu_matrix[index], fabs(cpu_matrix[index] - gpu_matrix[index]));
+	}
+    }
+  printf("\n");
+
+#endif // NDEBUG  
   
   return;
 }
@@ -162,40 +331,57 @@ int main()
       B[i] = drand48();
     }
 
-  ////////////////////////// CPU naive algorithm //////////////////////////////////////////
-  CPU_mat_mult(A, B, C_CPU, N);
-  /////////////////////////////////////////////////////////////////////////////////////////
+  ////////////////////////// CPU block algorithm ///////////////////////////////////////////////////
+  CPU_mat_mult_block(A, B, C_CPU, N);
+  //////////////////////////////////////////////////////////////////////////////////////////////////
 
   // copy/alloc data to the GPU
   #pragma omp target enter data map(to: A[0:SIZE], B[0:SIZE]) map(alloc: C_GPU[0:SIZE])
 
-  /////////////////////////// GPU naive algorithm ////////////////////////////////////////
+  /////////////////////////// GPU naive block algorithm ////////////////////////////////////////////
+  time = 0.0;
+  GPU_mat_mult_block(A, B, C_GPU, N); // warm-up
+  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
+    {
+      const double start = wall_time();
+      GPU_mat_mult_block(A, B, C_GPU, N);
+      time += (wall_time() - start);
+    }
+  
+  #pragma omp target update from(C_GPU[0:SIZE])
+  check(C_CPU, C_GPU);
+  printf("\n\t GPU block time %lg [s]\n", (time / LOOP));
+  //////////////////////////////////////////////////////////////////////////////////////////////////
+
+  /////////////////////////// GPU naive block shared memory algorithm //////////////////////////////
   time = 0.0;
+  GPU_mat_mult_block_shared(A, B, C_GPU, N); // warm-up
   for (unsigned short int loop=0 ; loop<LOOP ; loop++)
     {
       const double start = wall_time();
-      GPU_mat_mult(A, B, C_GPU, N);
+      GPU_mat_mult_block_shared(A, B, C_GPU, N);
       time += (wall_time() - start);
     }
   
   #pragma omp target update from(C_GPU[0:SIZE])
   check(C_CPU, C_GPU);
-  printf("\n\t GPU naive time %lg [s]\n", (time / LOOP));
-  ////////////////////////////////////////////////////////////////////////////////
+  printf("\n\t GPU block pteam memory time %lg [s]\n", (time / LOOP));
+  //////////////////////////////////////////////////////////////////////////////////////////////////
 
-  /////////////////////////// GPU naive no loops algorithm ////////////////////////////
+  /////////////////////////// GPU naive block shared memory no loops algorithm /////////////////////
   time = 0.0;
+  GPU_mat_mult_block_shared_no_loops(A, B, C_GPU, N); // warm-up
   for (unsigned short int loop=0 ; loop<LOOP ; loop++)
     {
       const double start = wall_time();
-      GPU_mat_mult_no_loops(A, B, C_GPU, N);
+      GPU_mat_mult_block_shared_no_loops(A, B, C_GPU, N);
       time += (wall_time() - start);
     }
   
   #pragma omp target update from(C_GPU[0:SIZE])
   check(C_CPU, C_GPU);
-  printf("\n\t GPU naive no loops time %lg [s]\n", (time / LOOP));
-  ////////////////////////////////////////////////////////////////////////////////
+  printf("\n\t GPU block no loops pteam memory time %lg [s]\n", (time / LOOP));
+  //////////////////////////////////////////////////////////////////////////////////////////////////
 
   // free CPU memory
   free(buffer);