diff --git a/cuda-omp/cuda/1/classwork_1.cu b/cuda-omp/cuda/1/classwork_1.cu
index 446b30002a646dcef0603b77b6bb81d2a17ca83c..5fe7a232349caa913fc4b590c025e54778b6af5d 100644
--- a/cuda-omp/cuda/1/classwork_1.cu
+++ b/cuda-omp/cuda/1/classwork_1.cu
@@ -17,9 +17,10 @@
 // Author: David Goz
 // mail  : david.goz@inaf.it
 // date  : 06.07.2024
+// code tested using nvhpc
 //
 // - Compile the code:
-//   $ nvcc classwork_1.cu -o classwork_1_cuda
+//   $ nvc++ classwork_1.cu -o classwork_1_cuda
 // - Run the code:
 //   $ ./classwork_1_cuda
 // - Check the result:
@@ -33,10 +34,23 @@
 #define N        100
 #define NThreads 1024
 
-__global__ void GPUkernel(const int size)
+__global__ void GPUkernelSerial(const int size)
 {
   const int myID = threadIdx.x + (blockIdx.x * blockDim.x);
-  
+
+  // C printf is supported in CUDA device code;
+  // the C++ std::cout stream is not
+  for (int i=0 ; i<size ; i++)
+    printf("Hello from CUDA thread: %d - result %d\n", myID, (i * i));
+
+  return;
+}
+
+__global__ void GPUkernelParallel(const int size)
+{
+  const int myID = threadIdx.x + (blockIdx.x * blockDim.x);
+
+  /* guard: the number of launched threads may exceed size */
   if (myID >= size)
     return;
 
@@ -49,14 +63,17 @@ __global__ void GPUkernel(const int size)
 
 int main()
 {
-  printf("\n\t The host issues the kernel on the GPU \n");
-  
+  printf("\n\t The host issues the kernel on the GPU in serial \n");  
   // kernel launch
-  GPUkernel<<<1, NThreads>>>(N);
-
-  printf("\n\t cudaDeviceSynchronize \n");
+  GPUkernelSerial<<<1, 1>>>(N);
   // GPU synchronization
   cudaDeviceSynchronize();
 
+  printf("\n\t The host issues the kernel on the GPU in parallel \n");
+  // kernel launch
+  GPUkernelParallel<<<1, NThreads>>>(N);
+  // GPU synchronization
+  cudaDeviceSynchronize();
+  
   return 0;
 }
diff --git a/cuda-omp/cuda/1/classwork_2.cu b/cuda-omp/cuda/1/classwork_2.cu
index cae80988170dbcdbf3468ad0b93d5777397d8ba2..c086bad2bf994f0b24861156cb28e801a75bcb44 100644
--- a/cuda-omp/cuda/1/classwork_2.cu
+++ b/cuda-omp/cuda/1/classwork_2.cu
@@ -18,10 +18,11 @@
 //////////////////////////////////////////////////////////////////////////////////////////////////
 // Author: David Goz
 // mail  : david.goz@inaf.it
-// date  : 17.11.2022
+// date  : 06.07.2024
+// code tested using nvhpc
 //
 // - Compile the code:
-//   $ nvcc classwork_2.cu -o classwork_2
+//   $ nvc++ classwork_2.cu -o classwork_2
 // - Run the code:
 //   $ ./classwork_2
 // - Check the result:
@@ -29,9 +30,10 @@
 
 //////////////////////////////////////////////////////////////////////////////////////////////////
 
-#include <iostream>
 #include <stdlib.h>
+#include <stdio.h>
 #include <cuda.h>
+#include <assert.h>
 
 #define N        100
 #define NThreads 1024
@@ -49,31 +51,35 @@ __global__ void GPUkernel(      int *A,
 
 int main()
 {
-  // allocate array that allows direct access of both host and device
-  //  CUDA is responsible 
-  int *A;
-  const size_t size = (N * sizeof(int));
-  cudaError error = cudaMallocManaged(&A, size);
-  if (!error)
-    std::cout << "Memory allocated for the host/device" << std::endl;
-  else
-    {
-      std::cout << "Cannot allocate memory for the host/device. CUDA error : " << error << " ... aborting" << std::endl;
-      exit(EXIT_FAILURE);
-    }
+  /* host array */
+  int *h_A = (int *)malloc(N * sizeof(*h_A));
+  assert(h_A != NULL);
+
+  /* device array */
+  int *d_A = NULL;
+  cudaMalloc((void **)&d_A, (N * sizeof(*d_A)));
+  assert(d_A != NULL);
   
   // kernel launch
-  GPUkernel<<<1, NThreads>>>(A, N);
+  GPUkernel<<<1, NThreads>>>(d_A, N);
   
   // device synchronization
   cudaDeviceSynchronize();
+
+  // fetch device memory
+  cudaMemcpy(h_A, d_A, (sizeof(*h_A) * N), cudaMemcpyDeviceToHost);
+
+  // free GPU memory
+  cudaFree(d_A);
   
   // check the result
+  printf("\n");
   for (size_t i=0 ; i<N ; i++)
-    std::cout << "A[" << i << "] - Result: " << A[i] << std::endl;
+    printf("\t A[%d] = %d", i, h_A[i]);
+  printf("\n\n");
 
-  // free the memory
-  cudaFree(A);
+  // free host memory
+  free(h_A);
   
   return 0;
 }
diff --git a/cuda-omp/cuda/2/classwork.cu b/cuda-omp/cuda/2/classwork.cu
new file mode 100644
index 0000000000000000000000000000000000000000..352a730f6cc7f206155c206fc8e97faf22bf8105
--- /dev/null
+++ b/cuda-omp/cuda/2/classwork.cu
@@ -0,0 +1,107 @@
+//////////////////////////////////////////////////////////////////////////////////////////////////
+// Assignment : write a CUDA code corresponding to the
+// following sequential C code
+//
+// #include <stdio.h>
+// #define ROW 4
+// #define COL 8
+// int main()
+// {
+//   int Matrix[ROW * COL];
+//
+//   for (int row=0 ; row<ROW ; row++)
+//      for (int col=0 ; col<COL ; col++)
+//         Matrix[(row * COL) + col] = ((row * COL) + col);
+//
+//   return 0;
+// }
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 06.07.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//   $ nvc++ classwork.cu -o classwork
+// - Run the code:
+//   $ ./classwork
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+#include <assert.h>
+
+#define BLOCKSIZE 32
+#define ROW 1089
+#define COL 1111
+
+typedef int MyData;
+
+__global__ void GPUMatrix(MyData *Matrix,
+			  const int size)
+{
+  /* global thread index */
+  const int index = threadIdx.x + (blockIdx.x * blockDim.x);
+
+  if (index < size)
+    Matrix[index] = index;
+
+  return;
+}
+
+int main()
+{
+  /* host matrix: 1D mapping matrix[i][j] ---> matrix[(i * COL) + j] */
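+  /* e.g. with COL = 1111, element (2, 3) maps to h_matrix[(2 * 1111) + 3] = h_matrix[2225] */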
+  MyData *h_matrix = (MyData *)malloc(ROW * COL * sizeof(*h_matrix));
+  assert(h_matrix != NULL);
+
+  /* device matrix */
+  MyData *d_matrix = NULL;
+  cudaMalloc((void **)&d_matrix, ROW * COL * sizeof(*d_matrix));
+  assert(d_matrix != NULL);
+  
+  /* 1D grid  */
+  const dim3 grid = {(((ROW * COL) + BLOCKSIZE - 1) / BLOCKSIZE), // number of blocks along X
+		     1,                                           // number of blocks along Y
+		     1};                                          // number of blocks along Z
+  const dim3 block = {BLOCKSIZE, // number of threads per block along X
+		      1,         // number of threads per block along Y
+		      1};        // number of threads per block along Z
+  
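+  // with ROW = 1089 and COL = 1111 the matrix has 1,209,879 elements, so the
+  // grid above contains ceil(1209879 / 32) = 37809 blocks; the 9 surplus
+  // threads (37809 * 32 - 1209879) are masked out by the kernel's guard
+  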
+  // kernel
+  GPUMatrix<<<grid, block>>>(d_matrix, (ROW * COL));
+  
+  // device synchronization
+  cudaDeviceSynchronize();
+
+  /* fetch matrix from the device */
+  cudaMemcpy(h_matrix, d_matrix, (ROW * COL * sizeof(*d_matrix)), cudaMemcpyDeviceToHost);
+
+  /* free device memory */
+  cudaFree(d_matrix);
+  
+  // check the result
+  int flag = 0;
+  for (size_t row=0 ; row<ROW ; row++)
+    {
+      const size_t i = (row * COL);
+      
+      for (size_t col=0 ; col<COL ; col++)
+      {
+	flag = ((h_matrix[i + col] != (i + col)) ? -1 : flag);
+      } /* col loop */
+    } /* row loop */
+
+  // free host memory
+  free(h_matrix);
+
+  if (flag)
+    printf("\n\t Result wrong! \n\n");
+  else
+    printf("\n\t Result OK! \n\n");
+  
+  return 0;
+}
diff --git a/cuda-omp/cuda/3/classwork.cu b/cuda-omp/cuda/3/classwork.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4cc27287f31d730cd89e2988b84aa7ad90384d9e
--- /dev/null
+++ b/cuda-omp/cuda/3/classwork.cu
@@ -0,0 +1,145 @@
+//////////////////////////////////////////////////////////////////////////////////////////////////
+// Assignment : write a CUDA code that does:
+//             - each thread generates a pair of points as (x, y) coordinates randomly distributed;
+//             - each thread computes the euclidean distance d = sqrt((x1 - x2)^2 + (y1 - y2)^2);
+//             - the kernel prints the maximum distance;
+//             - use one CUDA block and shared memory within the block
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 06.07.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//   $ nvc++ classwork.cu -o classwork -lm
+// - Run the code:
+//   $ ./classwork
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <cuda.h>
+#include <assert.h>
+#include <time.h>   // time(), used to seed drand48
+
+#define NUMPOINTS  1024
+#define BLOCKSIZE  NUMPOINTS
+
+#if NUMPOINTS != BLOCKSIZE
+#error NUMPOINTS must be equal to BLOCKSIZE
+#endif
+
+#if BLOCKSIZE > 1024
+#error BLOCKSIZE cannot be larger than 1024
+#endif
+
+#define SQUARE(A)      ((A) * (A))
+
+typedef double MyData;
+
+typedef struct PairPoints
+{
+  MyData x[2];
+  MyData y[2];
+} PairPoints;
+
+__global__ void GPUDistance(const PairPoints *const points,
+			    const int               size)
+{
+  /* global thread's ID */
+  const int globalID = (threadIdx.x + (blockIdx.x * blockDim.x));
+  /* local (i.e. within the block) thread's ID */
+  const int localID  = threadIdx.x;
+
+  if ((globalID >= size) || (globalID != localID))
+    return;
+
+  /* coalesced memory accesses */
+  const PairPoints myPoints = points[localID];
+  const MyData pair_distance_X2 = SQUARE(myPoints.x[0] - myPoints.x[1]);
+  const MyData pair_distance_Y2 = SQUARE(myPoints.y[0] - myPoints.y[1]);  
+  const MyData pair_distance    = sqrt(pair_distance_X2 + pair_distance_Y2);
+
+  // shared-block memory statically allocated
+  __shared__ MyData distance[BLOCKSIZE];
+  
+  /* store the distance in shared memory */
+  distance[localID] = pair_distance;
+  // block level synchronization barrier
+  __syncthreads();
+
+  /* the master thread within the block gets the max */
+  if (localID == 0)
+    {
+      MyData max_dis = -1.0;
+      for (size_t i=0 ; i<size ; i++)
+	max_dis = ((max_dis < distance[i]) ? distance[i] : max_dis);
+
+      printf("\t GPU maximum distance: %lg\n", max_dis);
+    }
+  
+  return;
+}
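+
+// N.B. a common alternative to the serial scan by thread 0 above is a
+// shared-memory tree reduction; a minimal sketch, assuming BLOCKSIZE is a
+// power of two (true here, BLOCKSIZE = 1024):
+//
+//   for (int stride = BLOCKSIZE / 2 ; stride > 0 ; stride >>= 1)
+//     {
+//       if (localID < stride)
+//         distance[localID] = fmax(distance[localID], distance[localID + stride]);
+//       __syncthreads();
+//     }
+//
+//   /* distance[0] now holds the block-wide maximum */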
+
+void CPUMaxDistance(const PairPoints *const points,
+		    const        int        size)
+{
+  MyData distance = -1.0;
+  for (size_t i=0 ; i<size ; i++)
+    {
+      const MyData pair_distance_X2 = SQUARE(points[i].x[0] - points[i].x[1]);
+      const MyData pair_distance_Y2 = SQUARE(points[i].y[0] - points[i].y[1]);
+      const MyData pair_distance    = sqrt(pair_distance_X2 + pair_distance_Y2);
+
+      distance = ((distance < pair_distance) ? pair_distance : distance);
+    }
+
+  printf("\n\t CPU maximum distance: %lg\n", distance);
+  
+  return;
+}
+
+int main()
+{
+  /* host allocation */
+  PairPoints *h_points = (PairPoints *)malloc(NUMPOINTS * sizeof(*h_points));
+  assert(h_points != NULL);
+  
+  /* device allocation */
+  PairPoints *d_points = NULL;
+  cudaMalloc((void **)&d_points, (NUMPOINTS * sizeof(*d_points)));
+
+  /* initialization */
+  srand48(time(NULL));
+  for(size_t i=0 ; i<NUMPOINTS ; i++)
+    {
+      h_points[i].x[0] = drand48();
+      h_points[i].x[1] = drand48();
+      h_points[i].y[0] = drand48();
+      h_points[i].y[1] = drand48();
+    }
+
+  /* copy data to the device's memory */
+  cudaMemcpy(d_points, h_points, (NUMPOINTS * sizeof(*d_points)), cudaMemcpyHostToDevice);
+  
+  const dim3 grid  = {1, 1, 1};
+  const dim3 block = {BLOCKSIZE, 1, 1};
+  
+  // launch kernel
+  GPUDistance<<< grid, block >>>(d_points, NUMPOINTS);
+
+  // check the result on the host while the kernel is executing on the GPU
+  CPUMaxDistance(h_points, NUMPOINTS);
+
+  free(h_points);
+  
+  // device synchronization
+  cudaDeviceSynchronize();
+
+  // free memory
+  cudaFree(d_points);
+  
+  return 0;
+}
diff --git a/cuda-omp/cuda/4/classwork.cu b/cuda-omp/cuda/4/classwork.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ac70f359fe8f45ac930c8c295e636140dd6aa949
--- /dev/null
+++ b/cuda-omp/cuda/4/classwork.cu
@@ -0,0 +1,178 @@
+//////////////////////////////////////////////////////////////////////////////////////////////////
+// Assignment : write a CUDA code corresponding to the
+// following sequential C code
+//
+// #include <stdio.h>
+// int main()
+// {
+//   /* Matrix multiplication: C = A X B */
+//
+//   int A[N][N], B[N][N], C[N][N];
+//
+//   for (int ii=0 ; ii<N ; ii++)
+//      for (int jj=0 ; jj<N ; jj++)
+//         for (int kk=0 ; kk<N ; kk++)
+//            C[ii][jj] += A[ii][kk] * B[kk][jj];
+//
+//   return 0;
+// }
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 08.07.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//   $ nvc++ classwork.cu -o classwork
+// - Run the code:
+//   $ ./classwork <N>
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+#include <assert.h>
+
+#define BLOCKSIZE 32
+#define SQUARE(SIZE) ((SIZE) * (SIZE))
+
+typedef long int MyData;
+
+void matrix_init(      MyData *const AB,
+		 const MyData        N)
+{
+  MyData *const restrict A = AB;
+  MyData *const restrict B = AB + SQUARE(N);
+  
+  for (MyData ii=0 ; ii<N ; ii++)
+    for (MyData jj=0 ; jj<N ; jj++)
+      {
+	A[(ii * N) + jj] = (ii - jj);
+	B[(ii * N) + jj] = 1;
+      }
+  
+  return;
+}
+
+__global__ void GPU_MM(const MyData *const __restrict__ A,
+		       const MyData *const __restrict__ B,
+		             MyData *const __restrict__ C,
+		       const MyData                     N)
+{
+  const MyData IDx = threadIdx.x + (blockIdx.x * blockDim.x);
+  const MyData IDy = threadIdx.y + (blockIdx.y * blockDim.y);
+
+  /* check out of boundaries */
+  if ((IDx >= N) || (IDy >= N))
+    return;
+
+  /* each thread performs the calculation of one element */
+  /* of the matrix, i.e. C[IDx][IDy]                     */
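+  /* as the dot product of row IDx of A with column IDy of B           */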
+  
+  MyData sum = 0;
+  for (MyData kk=0 ; kk<N ; kk++)
+    sum += (A[(IDx * N) + kk] * B[(kk * N) + IDy]);
+
+  C[(IDx * N) + IDy] = sum;
+  
+  return;
+}
+
+void check(const MyData *const __restrict__ GPU_MM,
+	   const MyData *const __restrict__ A,
+	   const MyData                     N)
+{
+  int flag = 0;
+
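+  /* matrix_init fills B with ones, so every element of row ii of the */
+  /* product C must equal the plain sum of row ii of A (row_a below)  */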
+  for (MyData ii=0 ; ii<N ; ii++)
+    {
+      MyData row_a = 0;      
+      for (MyData kk=0 ; kk<N ; kk++)
+	row_a += A[(ii * N) + kk];
+
+      for (MyData jj=0 ; jj<N ; jj++)
+	if (GPU_MM[(ii * N) + jj] != row_a)
+	  {
+	    flag = 1;
+	    break;
+	  }
+    }
+
+  if (flag)
+    printf("\n\t Result WRONG \n\n");
+  else
+    printf("\n\t Result OK \n\n");
+  
+  return;
+}
+
+int main(int argc, char *argv[])
+{
+  int N;
+  
+  if (argc < 2)
+    {
+      printf("\n\t Usage: $ ./classwork <matrix_size> \n");
+      exit(EXIT_FAILURE);
+    }
+  else
+    {
+      N = atoi(argv[1]);
+      if (N <= 0)
+	{
+	  printf("\n\t Number of matrix elements must be greater than zero \n");
+	  exit(EXIT_FAILURE);
+	}
+      else
+	{
+	  printf("\n\t Matrix size    : %d x %d", N, N);
+	  printf("\n\t CUDA block size: %d x %d", BLOCKSIZE, BLOCKSIZE);
+	}
+    }
+
+  /* host memory allocation */
+  MyData *h_buffer = (MyData *)malloc(3 * SQUARE(N) * sizeof(MyData));
+  assert(h_buffer != NULL);
+  /* set-up host pointers */
+  MyData *const restrict h_A = h_buffer;
+  MyData *const restrict h_B = h_A + SQUARE(N);
+  MyData *const restrict h_C = h_B + SQUARE(N);
+
+  /* device memory allocation */
+  MyData *d_buffer = NULL;
+  cudaMalloc((void **)&d_buffer, (3 * SQUARE(N) * sizeof(MyData)));
+  assert(d_buffer != NULL);
+  /* set-up device pointers */
+  MyData *const restrict d_A = d_buffer;
+  MyData *const restrict d_B = d_A + SQUARE(N);
+  MyData *const restrict d_C = d_B + SQUARE(N);
+
+  // matrix initialization
+  matrix_init(h_buffer, N);
+
+  /* copy host data to device */
+  cudaMemcpy(d_A, h_A, (2 * SQUARE(N) * sizeof(MyData)), cudaMemcpyHostToDevice);
+  
+  // kernel launch on GPU
+  const size_t nblocks = ((N + BLOCKSIZE - 1) / BLOCKSIZE);
+  const dim3 grid      = {nblocks, nblocks, 1};
+  const dim3 block     = {BLOCKSIZE, BLOCKSIZE, 1};
+  GPU_MM<<< grid, block >>>(d_A, d_B, d_C, N);
+
+  /* host-device data synchronization         */
+  /* N.B. cudaDeviceSynchronize() is not      */
+  /* necessary since cudaMemcpy() is blocking */
+  cudaMemcpy(h_C, d_C, (SQUARE(N) * sizeof(MyData)), cudaMemcpyDeviceToHost);
+
+  // free the GPU memory
+  cudaFree(d_buffer);
+
+  // check the result
+  check(h_C, h_A, N);
+
+  free(h_buffer);
+  
+  return 0;
+}
diff --git a/cuda-omp/cuda/5/aos_soa.cu b/cuda-omp/cuda/5/aos_soa.cu
new file mode 100644
index 0000000000000000000000000000000000000000..74c024b74d5101b63547b877f75cc46e380dbaba
--- /dev/null
+++ b/cuda-omp/cuda/5/aos_soa.cu
@@ -0,0 +1,239 @@
+////////////////////////////////////////////////////////////////////////////////////////////////
+// The program tests the SoA vs AoS approaches on the GPU
+// In computing, an array of structures (AoS), structure of arrays (SoA) or
+// array of structures of arrays (AoSoA) are contrasting ways to arrange a sequence
+// of records in memory, with regard to interleaving, and are of interest in SIMD
+// and SIMT programming.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 12.07.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//   $ nvc++ aos_soa.cu -o aos_soa
+// - Run the code:
+//   $ ./aos_soa
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <iostream>
+#include <vector>
+#include <unistd.h>
+#include <time.h>
+#include <cuda.h>
+
+#define Byte_to_MB (1.0 / (1024 * 1024))
+#define LOOP       100
+#define BLOCKSIZE  1024
+#if BLOCKSIZE > 1024
+#error BLOCKSIZE cannot be larger than 1024
+#endif
+
+// datatypes
+struct nodeAoS
+{
+  double   a;
+  float    b;
+  long int c;
+  int      d;
+  char     e;
+};
+
+struct nodeSoA
+{
+  double   *a;
+  float    *b;
+  long int *c;
+  int      *d;
+  char     *e;
+};
+
+void GPU_alloc_check(const cudaError error)
+{
+  using namespace std;
+  
+  if (error)
+    {
+      cout << "\t cudaMalloc fails! ... aborting ..." << endl;
+      exit(EXIT_FAILURE);
+    }
+  
+  return;
+}
+
+double wall_time()
+{
+  struct timespec ts;
+  clock_gettime(CLOCK_MONOTONIC, &ts); // wall-clock time, as the function name promises
+  const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;
+
+  return ret;
+}
+
+__global__ void gpu_AoS(struct nodeAoS *const allnodeAoS,
+			const size_t          size)
+{
+  const size_t ID = threadIdx.x + (blockIdx.x * blockDim.x);
+
+  if (ID >= size)
+    return;
+
+  // warp non-coalesced access ---> stride = sizeof(struct nodeAoS)
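+  // (on a typical 64-bit ABI, sizeof(struct nodeAoS) is presumably 32 bytes:
+  // 25 bytes of fields padded out to the 8-byte alignment of 'a' and 'c')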
+  allnodeAoS[ID].a = (double)ID;
+  allnodeAoS[ID].b = (float)ID;
+  allnodeAoS[ID].c = (long int)ID;
+  allnodeAoS[ID].d = (int)ID;
+  allnodeAoS[ID].e = 'a';
+  
+  return;
+}
+
+__global__ void gpu_nodeAoS(struct nodeAoS *const allnodeAoS,
+			    const size_t          size)
+{
+  const size_t ID = threadIdx.x + (blockIdx.x * blockDim.x);
+
+  const struct nodeAoS node = {(double)ID, (float)ID, (long int)ID, (int)ID, 'a'};
+
+  if (ID >= size)
+    return;
+
+  // warp coalesced access ---> stride = 0
+  allnodeAoS[ID] = node;
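+  // building the node in registers and storing the whole struct at once can
+  // let the compiler merge the five field stores into fewer, wider memory
+  // transactions (an assumption about code generation; verify with a profiler)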
+
+  return;
+}
+
+__global__ void gpu_SoA(double   *const __restrict__ a,
+			float    *const __restrict__ b,
+			long int *const __restrict__ c,
+			int      *const __restrict__ d,
+			char     *const __restrict__ e,
+			const size_t                 size)
+{
+  const size_t ID = threadIdx.x + (blockIdx.x * blockDim.x);
+
+  if (ID >= size)
+    return;
+
+  // warp coalesced access ---> stride = 0
+  a[ID] = (double)ID;
+  b[ID] = (float)ID;
+  c[ID] = (long int)ID;
+  d[ID] = (int)ID;
+  e[ID] = 'a';
+  
+  return;
+}
+
+size_t get_gpu_free_memory()
+{
+  using namespace std;
+  
+  int num_gpus{0};
+  // get the number of available gpus on the system
+  cudaGetDeviceCount(&num_gpus);
+
+  vector<size_t>free_memory(num_gpus);
+  vector<size_t>total_memory(num_gpus);
+  for (int gpu_id=0 ; gpu_id<num_gpus ; gpu_id++)
+    {
+      cudaSetDevice(gpu_id);
+      cudaMemGetInfo(&free_memory[gpu_id], &total_memory[gpu_id]);
+      cout << "\n\t GPU " << gpu_id << " - free memory: " << free_memory[gpu_id]*Byte_to_MB << "[MB] - total memory: " << total_memory[gpu_id]*Byte_to_MB << " [MB]" << endl;
+    }
+
+  // return the free memory of the first GPU
+  return free_memory[0];
+}
+    
+int main()
+{
+  using namespace std;
+
+  // free GPU memory (in bytes) of the first device
+  const size_t gpu_free_mem = get_gpu_free_memory();
+  // number of elements: half of the free memory is budgeted for the AoS,
+  // leaving room for the SoA arrays allocated next to it
+  const size_t N = (gpu_free_mem / 2 / sizeof(nodeAoS));
+  
+  double start_time{0};
+  double end_time{0};
+
+  const dim3 block{BLOCKSIZE, 1, 1};
+  const dim3 nblocks{((N + BLOCKSIZE - 1) / BLOCKSIZE), 1, 1};
+  
+  // allocate buffer for the GPU to store AoS and SoA
+  nodeAoS *gpu_allnodeAoS{nullptr};
+  GPU_alloc_check(cudaMalloc((void **)&gpu_allnodeAoS, (N * sizeof(*gpu_allnodeAoS))));
+
+  nodeSoA gpu_allnodeSoA = {nullptr, nullptr, nullptr, nullptr, nullptr};
+  GPU_alloc_check(cudaMalloc((void **)&gpu_allnodeSoA.a, (N * sizeof(*gpu_allnodeSoA.a))));
+  GPU_alloc_check(cudaMalloc((void **)&gpu_allnodeSoA.b, (N * sizeof(*gpu_allnodeSoA.b))));
+  GPU_alloc_check(cudaMalloc((void **)&gpu_allnodeSoA.c, (N * sizeof(*gpu_allnodeSoA.c))));
+  GPU_alloc_check(cudaMalloc((void **)&gpu_allnodeSoA.d, (N * sizeof(*gpu_allnodeSoA.d))));
+  GPU_alloc_check(cudaMalloc((void **)&gpu_allnodeSoA.e, (N * sizeof(*gpu_allnodeSoA.e))));
+
+  // gpu warm-up
+  gpu_AoS<<< nblocks, block >>>(gpu_allnodeAoS, N);
+  cudaDeviceSynchronize();
+
+  ////////////////////////// GPU AoS /////////////////////////////////////////////
+  start_time = wall_time();
+  for (int loop=0 ; loop<LOOP ; loop++)
+    gpu_AoS<<< nblocks, block >>>(gpu_allnodeAoS, N);
+
+  cudaDeviceSynchronize();
+  end_time = wall_time();
+  cout << "\t GPU AoS     time: " << ((end_time - start_time) / static_cast<double>(LOOP)) << " [s]" << endl;
+  ////////////////////////////////////////////////////////////////////////////////
+
+  // gpu warm-up
+  gpu_nodeAoS<<< nblocks, block >>>(gpu_allnodeAoS, N);
+  cudaDeviceSynchronize();
+  
+  ////////////////////////// GPU nodeAoS /////////////////////////////////////////
+  start_time = wall_time();
+  for (int loop=0 ; loop<LOOP ; loop++)
+      gpu_nodeAoS<<< nblocks, block >>>(gpu_allnodeAoS, N);
+
+  cudaDeviceSynchronize();
+  end_time = wall_time();
+  cout << "\t GPU nodeAoS time: " << ((end_time - start_time) / static_cast<double>(LOOP)) << " [s]" << endl;
+  ////////////////////////////////////////////////////////////////////////////////
+
+  // gpu warm-up
+  gpu_SoA<<< nblocks, block >>>(gpu_allnodeSoA.a,
+				gpu_allnodeSoA.b,
+				gpu_allnodeSoA.c,
+				gpu_allnodeSoA.d,
+				gpu_allnodeSoA.e,
+				N);
+  cudaDeviceSynchronize();
+  
+  ////////////////////////// GPU SoA /////////////////////////////////////////////
+  start_time = wall_time();
+  for (int loop=0 ; loop<LOOP ; loop++)
+    gpu_SoA<<< nblocks, block >>>(gpu_allnodeSoA.a,
+				  gpu_allnodeSoA.b,
+				  gpu_allnodeSoA.c,
+				  gpu_allnodeSoA.d,
+				  gpu_allnodeSoA.e,
+				  N);
+
+  cudaDeviceSynchronize();
+  end_time = wall_time();
+  cout << "\t GPU SoA     time: " << ((end_time - start_time) / static_cast<double>(LOOP)) << " [s]\n" << endl;
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  // free GPU memory
+  cudaFree(gpu_allnodeAoS);
+  cudaFree(gpu_allnodeSoA.a);
+  cudaFree(gpu_allnodeSoA.b);
+  cudaFree(gpu_allnodeSoA.c);
+  cudaFree(gpu_allnodeSoA.d);
+  cudaFree(gpu_allnodeSoA.e);
+  
+  return EXIT_SUCCESS;
+}
diff --git a/cuda-omp/cuda/5/linked_list.cu b/cuda-omp/cuda/5/linked_list.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f7d277d71cace5b02bd01a832e7c6334b9d2e005
--- /dev/null
+++ b/cuda-omp/cuda/5/linked_list.cu
@@ -0,0 +1,208 @@
+////////////////////////////////////////////////////////////////////////////////////////////////
+// The program shows how to port a linked list, created by the CPU, onto the GPU
+//
+// The nvcc compiler might issue the warning "Stack size for entry function … cannot be statically determined".
+// The reason is the recursive device function printLinkedList.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 12.07.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//   $ nvc++ linked_list.cu -o linked_list
+// - Run the code:
+//   $ ./linked_list
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+#include <iostream>
+#include <unistd.h>
+#include <time.h>
+#include <cuda.h>
+#include <new>
+#include <cassert>
+#include <vector>
+
+#define N          10 // linked list size
+#define BLOCKSIZE  1024
+#define CPU        0
+#define GPU        1
+#if (BLOCKSIZE > 1024)
+#error BLOCKSIZE cannot be larger than 1024
+#endif
+
+// datatypes
+struct node
+{
+  struct node *next;
+  int data;
+};
+
+typedef struct node   node;
+typedef struct node * nodeptr;
+
+// container where to store node pointers on the GPU
+static std::vector<nodeptr> GPUptr;
+
+void GPU_check(const cudaError error,
+	       const char      *cudaFunc)
+{
+  using namespace std;
+  
+  if (error)
+    {
+      cout << "\t " << cudaFunc << " fails! ... aborting ..." << endl;
+      exit(EXIT_FAILURE);
+    }
+  
+  return;
+}
+
+double wall_time()
+{
+  struct timespec ts;
+  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
+  const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;
+
+  return ret;
+}
+
+nodeptr createNode(const int data)
+{
+  /* the nothrow form returns nullptr on failure, so the assert below is meaningful */
+  nodeptr pn = new (std::nothrow) struct node;
+
+  assert(pn != nullptr);
+  
+  pn->data = data;
+  pn->next = nullptr;
+
+  return pn;
+}
+
+nodeptr createLinkedList()
+{
+  nodeptr root{nullptr};
+
+  for (int size=N ; size>0 ; size--)
+    {
+      nodeptr pn = createNode(size);
+      pn->next = root;
+      root = pn;
+    }
+
+  assert(root != nullptr);
+  
+  return root;
+}
+
+void freeCpuLinkedList(nodeptr cpu_node)
+{
+  // free CPU linked list
+  if (cpu_node)
+    {
+      freeCpuLinkedList(cpu_node->next);
+
+      delete cpu_node;
+      cpu_node = nullptr;
+    }
+  
+  return;
+}
+
+void freeGpuLinkedList()
+{
+  // free GPU linked list
+  for (auto &el : GPUptr)
+    {
+      cudaFree(el);
+      el = nullptr;
+    }
+
+  return;
+}
+
+nodeptr copyNode(const nodeptr pn)
+{
+  assert(pn != nullptr);
+  
+  nodeptr gpu_node{nullptr};
+  GPU_check(cudaMalloc((void **)&gpu_node, sizeof(*gpu_node)), "cudaMalloc");
+  GPU_check(cudaMemcpy(gpu_node, pn, sizeof(*pn), cudaMemcpyHostToDevice), "cudaMemcpy");
+
+  GPUptr.push_back(gpu_node);
+  
+  return gpu_node;
+}
+
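+// copyList recurses down to the tail first, so that each node is copied to
+// the device only after the device address of its successor is known;
+// 'buffer' is a host-side staging node whose 'next' already holds that address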
+nodeptr copyList(const nodeptr pn)
+{
+  if (!pn)
+    return nullptr;
+  
+  node buffer;
+  buffer.next = copyList(pn->next);
+  buffer.data = pn->data;
+
+  return copyNode(&buffer);
+}
+
+// the function works on both CPU and GPU
+__device__ __host__ void printLinkedList(const nodeptr root,
+					 const int     device)
+{
+  const char string[2][16] = {"\n\t CPU ---> ", "\n\t GPU ---> "};
+
+  if (root)
+    {
+      printf("%s", string[device]);
+      printf("Node->data %d", root->data);
+      printLinkedList(root->next, device);
+    }
+  else
+    printf("\n");
+
+  return;
+}
+
+__global__ void GpuPrintLinkedList(const nodeptr root)
+{
+  const unsigned int ID = threadIdx.x + (blockIdx.x * blockDim.x);
+
+  // master thread prints the linked list
+  if (!ID)
+    printLinkedList(root, GPU);
+
+  return;
+}
+
+int main()
+{
+  using namespace std;
+
+  // create linked list on the CPU
+  nodeptr head = createLinkedList();
+
+  // print linked list on the CPU
+  printLinkedList(head, CPU);
+
+  // copy linked list onto the GPU
+  nodeptr gpu_head = copyList(head);
+
+  // print linked list on the GPU
+  GpuPrintLinkedList<<< 1, BLOCKSIZE >>>(gpu_head);
+  
+  // GPU synchronization
+  cudaDeviceSynchronize();
+
+  // free CPU memory
+  freeCpuLinkedList(head);
+
+  // free GPU memory
+  freeGpuLinkedList();
+  
+  return EXIT_SUCCESS;
+}
diff --git a/cuda-omp/cuda/6/classwork.cu b/cuda-omp/cuda/6/classwork.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d2559ffc50b4c6d225fa9e3c41f4ad9e6e007aa7
--- /dev/null
+++ b/cuda-omp/cuda/6/classwork.cu
@@ -0,0 +1,273 @@
+////////////////////////////////////////////////////////////////////////////////////////////////
+// - You are given 1024 x 1024 integer matrix M;
+// - Each row is assigned to a thread-block;
+// - Each thread is assigned a matrix element M[i][j];
+// - It changes M[i][j] to M[i][j] + M[i][j+1] (where possible);
+// - Exploit shared-memory.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 12.07.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//   $ nvc++ classwork.cu -o classwork
+// - Run the code:
+//   $ ./classwork
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <math.h>
+#include <assert.h>
+#include <string.h>  // memcpy
+#include <stdint.h>  // int64_t
+#include <cuda.h>
+
+#define N                     512
+#define SIZE                  (N * N) // matrix size
+typedef int64_t MyData;               // do not change
+#define BLOCKSIZE             N       // number of threads per block
+
+// sanity check
+#if BLOCKSIZE > 1024
+#error BLOCKSIZE cannot be larger than 1024
+#endif
+
+#if BLOCKSIZE != N
+#error BLOCKSIZE must be equal to N
+#endif
+
+#define LOOP 100
+
+double wall_time()
+{
+  struct timespec ts;
+  clock_gettime(CLOCK_MONOTONIC, &ts); // wall-clock time, as the function name promises
+  const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;
+
+  return ret;
+}
+
+__global__ void gpu_copy(const MyData *const __restrict__ A,
+			       MyData *const __restrict__ B)
+{
+  const size_t globalID = (threadIdx.x + (blockIdx.x * blockDim.x));
+
+  if (globalID >= SIZE)
+    return;
+
+  B[globalID] = A[globalID];
+
+  return;
+}
+
+__global__ void gpu_matrix(      MyData *const __restrict__ matrix,
+			   const MyData *const __restrict__ buffer)
+{
+  // global thread ID
+  const size_t globalID = (threadIdx.x + (blockIdx.x * blockDim.x));
+  
+  if (globalID >= SIZE)
+    return;
+
+  // matrix row
+  const size_t row = (globalID / N);
+  //matrix column
+  const size_t col = (globalID % N);
+
+  MyData myValue = buffer[(row * N) + col];
+
+  if (col < (N - 1))
+    myValue += buffer[(row * N) + col + 1];
+  
+  matrix[globalID] = myValue;
+  
+  return;
+}
+
+__global__ void gpu_matrix_shared(MyData *const __restrict__ matrix)
+{  
+  // thread-id within the thread-block
+  const size_t localID  = threadIdx.x;
+  // global-thread-id
+  const size_t globalID = (threadIdx.x + (blockIdx.x * blockDim.x));
+
+  if (globalID >= SIZE)
+    return;
+
+  // shared memory buffer
+  __shared__ MyData buffer[BLOCKSIZE + 1];
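+  // the extra (BLOCKSIZE + 1)-th element stays 0, so the last thread of the
+  // block (last column of its row) adds nothing, matching the sequential
+  // loop bound j < N-1 of the original code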
+  // shared memory initialization
+  for (size_t i=localID ; i<(BLOCKSIZE + 1) ; i+=blockDim.x)
+    buffer[i] = 0;
+  
+  const size_t row   = (globalID / N);
+  const size_t col   = (globalID % N);
+  const size_t index = ((row * N) + col);
+
+  // load data into shared memory
+  buffer[localID] = matrix[index];
+
+  // block-thread synchronization
+  __syncthreads();
+
+  // perform the calculation
+  const MyData value = (buffer[localID] + buffer[localID + 1]);
+
+  // store data
+  matrix[index] = value;
+  
+  return;
+}
+
+void GPU_matrix(MyData *const __restrict__ matrix,
+		MyData *const __restrict__ buffer,
+		const dim3                 nblocks,
+		const dim3                 block)
+{
+  // gpu_copy: gpu_A ---> gpu_buffer
+  gpu_copy<<< nblocks, block >>>(matrix, buffer);
+
+  // perform the actual calculation M[i][j] += M[i][j+1]
+  gpu_matrix<<< nblocks, block >>>(matrix, buffer);
+
+  // device synchronization
+  cudaDeviceSynchronize();
+  
+  return;
+}
+
+void GPU_matrix_shared(MyData *const __restrict__ matrix,
+		       const dim3                 nblocks,
+		       const dim3                 block)
+{
+  gpu_matrix_shared<<< nblocks, block >>>(matrix);
+  
+  // device synchronization
+  cudaDeviceSynchronize();
+  
+  return;
+}
+
+void CPU_matrix(MyData *const matrix)
+{
+  for (size_t i=0 ; i<N ; i++)
+    for (size_t j=0 ; j<N-1 ; j++)
+      matrix[(i * N) + j] += matrix[(i * N) + j + 1];
+
+  return;
+}
+
+void check(const MyData *const __restrict__ cpu_matrix,
+	   const MyData *const __restrict__ gpu_matrix,
+	   const char   *const __restrict__ msg)
+{
+  int flag = 0;
+  for (size_t i=0 ; i<SIZE ; i++)
+    flag |= ((cpu_matrix[i] != gpu_matrix[i]) ? 1 : 0);
+
+  if (!flag)
+    printf("\n\t Result OK %s", msg);
+  else
+    printf("\n\t Result wrong %s", msg);
+  
+  return;
+}
+
+int main()
+{
+  double time;
+  // host reference matrix A
+  MyData *cpu_A = (MyData *)malloc(SIZE * sizeof(MyData));
+  assert (cpu_A != NULL);
+  for (size_t i=0 ; i<SIZE ; i++)
+    cpu_A[i] = (lrand48() % SIZE);
+
+  // device reference matrix A
+  MyData *gpu_A = NULL;
+  cudaMalloc((void **)&gpu_A, (SIZE * sizeof(MyData)));
+  assert(gpu_A != NULL);
+  cudaMemcpy(gpu_A, cpu_A, (SIZE * sizeof(MyData)), cudaMemcpyHostToDevice);
+
+  ////////////////////////// CPU MATRIX //////////////////////////////////////////
+  // allocate a second matrix where to perfom the actual computation
+  MyData *cpu_matrix = (MyData *)malloc(SIZE * sizeof(MyData));
+  assert(cpu_matrix != NULL);
+
+  // perform the actual computation
+  time = 0.0;
+  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
+    {
+      memcpy(cpu_matrix, cpu_A, (SIZE * sizeof(*cpu_matrix)));
+      
+      const double start = wall_time();
+      CPU_matrix(cpu_matrix);
+      time += (wall_time() - start);
+    }
+  printf("\n\t CPU time %lg [s]\n", (time / LOOP));
+  ////////////////////////////////////////////////////////////////////////////////
+
+  ///////////////////////////// GPU MATRIX ///////////////////////////////////////
+  // allocate a second matrix where to perform the actual computation
+  MyData *gpu_matrix = NULL;
+  cudaMalloc((void **)&gpu_matrix, (SIZE * sizeof(MyData)));
+
+  // allocate a buffer matrix useful to copy the matrix before the computation
+  // in order to avoid thread race-conditions
+  MyData *gpu_buffer = NULL;
+  cudaMalloc((void **)&gpu_buffer, (SIZE * sizeof(MyData)));
+
+  const dim3 block   = {BLOCKSIZE, 1, 1};
+  const dim3 nblocks = {((SIZE + BLOCKSIZE - 1) / BLOCKSIZE), 1, 1};
+  
+  time = 0.0;
+  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
+    {
+      gpu_copy<<< nblocks, block>>>(gpu_A, gpu_matrix);
+      cudaDeviceSynchronize();
+      
+      double start = wall_time();
+      GPU_matrix(gpu_matrix, gpu_buffer, nblocks, block);
+      time += (wall_time() - start);
+    }
+
+  // copy gpu_matrix into cpu_A
+  cudaMemcpy(cpu_A, gpu_matrix, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);
+
+  check(cpu_matrix, cpu_A, "\t global memory implementation --->");
+  printf("\n\t GPU time  %lg [s]\n", (time / LOOP));
+  //////////////////////////////////////////////////////////////////////////////////////////
+
+  ////////////////////////// GPU MATRIX shared memory //////////////////////////////////////
+  // gpu_buffer is no longer required
+  cudaFree(gpu_buffer);
+  
+  time = 0.0;
+  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
+    {
+      // restore data into gpu_matrix
+      gpu_copy<<< nblocks, block>>>(gpu_A, gpu_matrix);
+      cudaDeviceSynchronize();
+
+      double start = wall_time();
+      GPU_matrix_shared(gpu_matrix, nblocks, block);
+      time += (wall_time() - start);
+    }
+
+  // copy gpu_matrix into cpu_A
+  cudaMemcpy(cpu_A, gpu_matrix, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);
+  check(cpu_matrix, cpu_A, "\t shared memory implementation --->");
+  printf("\n\t GPU time  %lg [s]\n", (time / LOOP));
+  //////////////////////////////////////////////////////////////////////////////////////////
+
+  // free CPU-GPU memory
+  free(cpu_matrix);
+  free(cpu_A);
+  cudaFree(gpu_A);
+  cudaFree(gpu_matrix);
+  
+  return EXIT_SUCCESS;
+}
diff --git a/cuda-omp/cuda/miscellaneous/coalescing.cu b/cuda-omp/cuda/miscellaneous/coalescing.cu
new file mode 100644
index 0000000000000000000000000000000000000000..94b2e9e1b7f1e3b679ae103cd26e7cf30de1ff10
--- /dev/null
+++ b/cuda-omp/cuda/miscellaneous/coalescing.cu
@@ -0,0 +1,148 @@
+//////////////////////////////////////////////////////////////////////////////////////////////////
+// Memory coalescing is a technique which allows optimal usage of the global memory bandwidth.
+// That is, when parallel threads running the same instruction access consecutive locations
+// in the global memory, the most favorable access pattern is achieved.
+//
+// Assignment : write a kernel to vary the degree of coalescing from 1 to 32 based on input 
+//             argument
+//
+// Hint: The Degree of Coalescing (DoC) is 33 minus the number of memory transactions
+//       required for a warp to execute an instruction.
+// Example:
+//          - buffer[threadIdx.x] has DoC of 32;
+//          - buffer[rand()]      has DoC of 1 in the worst case.
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 12.07.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//   $ nvc++ coalescing.cu -o coalescing
+// - Run the code:
+//   $ ./coalescing
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <stdlib.h>
+#include <cuda.h>
+#include <stdio.h>
+#include <time.h>   // time(), used to seed drand48
+
+typedef double MyData;
+#define N      1024
+#define WARP   32
+#define BLOCK  1024
+
+__global__ void GPUDoC(const MyData *const __restrict__ source,
+		             MyData *const __restrict__ sink,
+		       const size_t                     DoC)
+{
+  const size_t myID = (threadIdx.x + (blockIdx.x * blockDim.x));
+  
+  if (myID >= WARP)
+    return;
+  
+  // load data using a given DoC
+  const size_t stride = ((WARP + 1) - DoC);
+
+  // stride 1 (DoC=32): 32 threads access one contiguous block of 32 words, so one memory transaction;
+  // stride 2 (DoC=31): 32 threads access 32 words spread over a span of 64 words, so 2 memory transactions;
+  // stride 3 (DoC=30): 32 threads access 32 words spread over a span of 96 words, so 3 memory transactions;
+  // stride 4 (DoC=29): 32 threads access 32 words spread over a span of 128 words, so 4 memory transactions;
+  // ...
+  // stride 32 (DoC=1): 32 threads access 32 words spread over a span of 1024 words, so 32 memory transactions. 
+  
+  sink[myID] = source[myID * stride];
+  
+  return;
+}
+
+void CPU_GPU_memory_copy(const cudaError error)
+{
+  if (error)
+    {
+      printf("\n\t cudaMemcpy fails! ... aborting ...");
+      exit(EXIT_FAILURE);
+    }
+  
+  return;
+}
+
+void GPU_alloc_check(const cudaError error)
+{
+  if (error)
+    {
+      printf("\n\t cudaMalloc fails! ... aborting ... \n");
+      exit(EXIT_FAILURE);
+    }
+  
+  return;
+}
+
+void GPU_DoC(const MyData *const points,
+	     const size_t        DoC)
+{
+  //****************** GPU memory allocation ***************************************************//
+  MyData *source=NULL, *sink=NULL;
+  GPU_alloc_check(cudaMalloc((void **)&source, ((N + WARP) * sizeof(MyData))));
+  sink = source + N;
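+  // a single allocation of N + WARP words suffices: the kernel reads
+  // source[myID * stride] with myID < 32 and stride <= 32 (max index 992 < N),
+  // and writes the WARP-word sink region that starts at source + N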
+  //********************************************************************************************//
+
+  //***************** Host to GPU memory copy **************************************************//
+  CPU_GPU_memory_copy(cudaMemcpy(source, points, (N * sizeof(MyData)),
+				 cudaMemcpyHostToDevice));
+  //********************************************************************************************//
+
+  //***************** Kernel launch ************************************************************//
+  // kernel is run using WARP threads
+  const size_t nblocks = ((WARP + BLOCK - 1) / BLOCK);
+
+  printf("\n\t DoC: %ld", DoC);
+  printf("\n\t GPU_DoC<<< %ld, %ld >>>(source, sink, %ld)\n\n", nblocks, BLOCK, DoC);
+  
+  GPUDoC<<< nblocks, BLOCK >>>(source, sink, DoC);
+
+  // GPU synchronization
+  cudaDeviceSynchronize();
+
+  // Free GPU memory
+  cudaFree(source);
+
+  //********************************************************************************************//
+  
+  return;
+}
+
+int main(int argc, char *argv[])
+{
+  size_t DoC = 0;
+  
+  if (argc < 2)
+    {
+      printf("\n\t Usage: ./coalescing <degree of coalescing>\n\n");
+      exit(EXIT_FAILURE);
+    }
+  else
+    {
+      DoC = atoi(argv[1]);
+
+      if ((DoC < 1) || (DoC > 32))
+	{
+	  printf("\n\t 1 <= <degree of coalescing> <= 32 ... aborting ... \n\n");
+	  exit(EXIT_FAILURE);
+	}      
+    }
+
+  srand48(time(NULL));
+  MyData *points = (MyData *)malloc(N * sizeof(*points));
+  for (size_t i=0 ; i<N ; i++)
+    points[i] = drand48();
+  
+  // launch kernel
+  GPU_DoC(points, DoC);
+
+  free(points);
+  
+  return EXIT_SUCCESS;
+}
diff --git a/cuda-omp/omp/1/classwork_1.c b/cuda-omp/omp/1/classwork_1.c
index 6ff9cdffa4c67879983bdccb494e89f6e57ca086..f9ece471b3a05ce901ecfae7bf029e5dbe21fdc5 100644
--- a/cuda-omp/omp/1/classwork_1.c
+++ b/cuda-omp/omp/1/classwork_1.c
@@ -2,7 +2,7 @@
 //
 // OpenMP GPU Offload is available only on systems with NVIDIA GPUs with compute capability '>= cc70'
 //
-// Assigment : write a OMP-GPU code corresponding to the
+// Assignment : write an OMP-GPU code corresponding to the
 // following sequential C code
 //
 // #include <stdio.h>
@@ -20,9 +20,10 @@
 // Author: David Goz
 // mail  : david.goz@inaf.it
 // date  : 06.07.2024
+// code tested using nvhpc
 //
 // - Compile the code to run on :
-//   $ nvcc classwork_1.c -o classwork_1_omp
+//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork_1.c -o classwork_1_omp
 // - Run the code:
 //   $ ./classwork_1_omp
 // - Check the result:
@@ -36,27 +37,73 @@
 #define N        100
 #define NThreads 1024
 
+#define NDEBUG
+
 void GPUkernelSerial(const int size)
 {
 #pragma omp target
   {
+#if !defined(NDEBUG)
+
     if (!omp_is_initial_device())
       printf("\n\t GPU is executing GPUkernelSerial\n" );
     else
       printf("\n\t CPU is executing GPUkernelSerial\n" );
+
+#endif /* NDEBUG */
+    
+    const int whoAmI = omp_get_thread_num();
+    
+    for (int i=0 ; i<size ; i++)
+      printf("Hello from OMP-GPU thread: %d - result %d\n", whoAmI, (i * i));
+  } /* omp target region - implicit barrier */
+
+  return;
+}
+
+void GPUkernelParallel(const int size)
+{
+#pragma omp target
+  {
+#if !defined(NDEBUG)
     
+    if (!omp_is_initial_device())
+      printf("\n\t GPU is executing GPUkernelParallel\n" );
+    else
+      printf("\n\t CPU is executing GPUkernelParallel\n" );
+
+#endif /* NDEBUG */
+
+    #pragma omp teams distribute parallel for
     for (int i=0 ; i<size ; i++)
-      printf("Hello from OMP-GPU thread: %d - result %d\n", i, (i * i));
-  }
+      {
+	/* get CUDA blockIdx.x */
+	const int team = omp_get_team_num();
+
+	/* get CUDA threadIdx.x */
+	const int tid = omp_get_thread_num();
+
+	/* get CUDA blockDim.x */
+	const int nthr = omp_get_num_threads();
+	
+	const int whoAmI = tid + (team * nthr);
+	
+	printf("Hello from OMP-GPU thread: %d - result %d\n", whoAmI, (i * i));
+      }
+  } /* omp target region - implicit barrier */
+
   return;
 }
 
 int main()
 {
-  printf("\n\t The host issues the kernel on the GPU \n");
-  
+  printf("\n\t The host issues the kernel on the GPU in serial\n");
   /* kernel launch using one GPU thread */
   GPUkernelSerial(N);
 
+  printf("\n\t The host issues the kernel on the GPU in parallel\n");
+  /* kernel launch using N GPU threads */
+  GPUkernelParallel(N);
+  
   return 0;
 }
diff --git a/cuda-omp/omp/1/classwork_2.c b/cuda-omp/omp/1/classwork_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..9842ebfe062b9fa10e622606a02fdcdd62fa2b86
--- /dev/null
+++ b/cuda-omp/omp/1/classwork_2.c
@@ -0,0 +1,112 @@
+//////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// OpenMP GPU Offload is available only on systems with NVIDIA GPUs with compute capability '>= cc70'
+//
+// Assignment : write an OMP-GPU code corresponding to the
+// following sequential C code
+//
+// #include <stdio.h>
+// #define N 100
+// int main()
+// {
+//   int A[N];
+//
+//   for (int i=0 ; i<N ; i++)
+//     A[i] = (i * i);
+//
+//   return 0;
+// }
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 06.07.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork_2.c -o classwork_2_omp
+// - Run the code:
+//   $ ./classwork_2_omp
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <omp.h>
+
+#define N        100
+#define NDEBUG
+
+void GPUkernel(      int *A,
+	       const int  size)
+{
+  /* map A to the address space of the accelerator */
+#pragma omp target data map(from: A[0:size])
+  {
+    /* kernel to be executed on the accelerator */
+   #pragma omp target
+    {
+      /* create a bunch of teams (CUDA blocks) */
+      #pragma omp teams
+      {
+	/* distribute the teams over index iterations and */
+	/* spawn a bunch of threads within each team */
+	#pragma omp distribute parallel for
+	for (int i=0 ; i<size ; i++)
+	  {
+#if !defined(NDEBUG)
+
+	    const int team  = omp_get_team_num();
+	    const int nteam = omp_get_num_teams();
+	    const int tid   = omp_get_thread_num();
+	    const int nthr  = omp_get_num_threads();
+
+	    const int whoAmI = tid + (team * nthr);
+	    
+	    if (!omp_is_initial_device())
+	      {
+		if (whoAmI == 0)
+		  {
+		    printf("\n\t GPU is executing GPUkernel\n" );
+		    printf("\n\t team : %d",   team);
+		    printf("\n\t nteam: %d",   nteam);
+		    printf("\n\t tid  : %d",   tid);
+		    printf("\n\t nthr : %d\n", nthr);
+		  }
+	      }
+	    else
+	      printf("\n\t CPU is executing GPUkernel\n" );
+      
+#endif /* NDEBUG */	    
+	    
+	    A[i] = (i * i);
+	  } /* omp parallel for */
+      } /* omp teams */
+    } /* omp target */
+  } /* omp target data */
+    
+  return;
+}
+
+int main()
+{
+  /* host array */
+  int *A = (int *)malloc(N * sizeof(*A));
+  assert(A != NULL);
+  
+  // kernel launch
+  GPUkernel(A, N);
+  
+  // check the result
+  printf("\n");
+  for (size_t i=0 ; i<N ; i++)
+    printf("\t A[%d] = %d", i, A[i]);
+  printf("\n\n");
+
+  // free host memory
+  free(A);
+  
+  return 0;
+}
diff --git a/cuda-omp/omp/2/classwork.c b/cuda-omp/omp/2/classwork.c
new file mode 100644
index 0000000000000000000000000000000000000000..e568fee02df39a6e4c9938f3e419c10421381397
--- /dev/null
+++ b/cuda-omp/omp/2/classwork.c
@@ -0,0 +1,114 @@
+//////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// OpenMP GPU Offload is available only on systems with NVIDIA GPUs with compute capability '>= cc70'
+//
+// Assignment : write an OMP-GPU code corresponding to the
+// following sequential C code
+//
+// #include <stdio.h>
+// #define ROW 4
+// #define COL 8
+// int main()
+// {
+//   int Matrix[ROW * COL];
+//
+//   for (int row=0 ; row<ROW ; row++)
+//      for (int col=0 ; col<COL ; col++)
+//         Matrix[(row * COL) + col] = ((row * COL) + col);
+//
+//   return 0;
+// }
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 08.07.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork.c -o classwork_omp
+// - Run the code:
+//   $ ./classwork_omp
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <omp.h>
+
+#define BLOCKSIZE 32
+#define ROW       1089
+#define COL       1111
+
+#define NDEBUG
+
+typedef int MyData;
+
+void GPUMatrix(MyData *Matrix,
+	       const int dim)
+{
+  /* use the GPU synchronously                                                                 */
+  /* allocate a buffer of size 'dim' and copy it back to host memory when the kernel ends      */
+  /* create a given number of teams and distribute them across the iterations                  */
+  /* create a given number of threads per team                                                 */
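+  /* (with nvhpc, teams play the role of CUDA blocks and a team's threads     */
+  /* that of the threads of a block; the actual launch configuration is       */
+  /* ultimately chosen by the compiler/runtime)                               */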
+
+#pragma omp target                                                                \
+            teams distribute num_teams(((ROW * COL) + BLOCKSIZE - 1) / BLOCKSIZE) \
+	    parallel for num_threads(BLOCKSIZE)                                   \
+	    map(from: Matrix[0:dim])
+  for (size_t index=0 ; index<dim ; index++)
+    {
+#if !defined(NDEBUG)
+      
+      const int nteam  = omp_get_num_teams();
+      const int team   = omp_get_team_num();
+      const int tid    = omp_get_thread_num();
+      const int nthr   = omp_get_num_threads();
+      const int whoAmI = tid + (team * nthr);
+
+      if (!omp_is_initial_device() && !whoAmI)
+	{
+	  printf("\n\t GPU is running:");
+	  printf("\n\t\t Number of teams: %d - Number of threads per team: %d\n\n", nteam, nthr);
+	}
+      
+#endif /* NDEBUG */
+
+      Matrix[index] = index;
+    } /* end-target region */
+  
+  return;
+}
+
+int main()
+{
+  /* host matrix: 1D mapping matrix[i][j] ---> matrix[(i * COL) + j] */
+  MyData *matrix = (MyData *)malloc(ROW * COL * sizeof(*matrix));
+  assert(matrix != NULL);
+  
+  // kernel
+  GPUMatrix(matrix, (ROW * COL));
+  
+  // check the result
+  int flag = 0;
+  for (size_t row=0 ; row<ROW ; row++)
+    {
+      const size_t i = (row * COL);
+      
+      for (size_t col=0 ; col<COL ; col++)
+	{
+	  flag = ((matrix[i + col] != (i + col)) ? -1 : flag);
+	} /* col loop */
+    } /* row loop */
+
+  // free host memory
+  free(matrix);
+
+  if (flag)
+    printf("\n\t Result wrong! \n\n");
+  else
+    printf("\n\t Result OK! \n\n");
+  
+  return 0;
+}
diff --git a/cuda-omp/omp/3/classwork.c b/cuda-omp/omp/3/classwork.c
new file mode 100644
index 0000000000000000000000000000000000000000..6f069f560cde7688f54655edccae6a3da81ddf25
--- /dev/null
+++ b/cuda-omp/omp/3/classwork.c
@@ -0,0 +1,131 @@
+//////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// OpenMP GPU Offload is available only on systems with NVIDIA GPUs with compute capability '>= cc70'
+//
+// Assignment : write an OMP-GPU code that does:
+//             - each thread generates a pair of points as (x, y) coordinates randomly distributed;
+//             - each thread computes the euclidean distance d = sqrt((x1 - x2)^2 + (y1 - y2)^2);
+//             - the kernel prints the maximum distance.
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 06.07.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork.c -o classwork_omp -lm
+// - Run the code:
+//   $ ./classwork_omp
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <omp.h>
+#include <time.h>
+#include <assert.h>
+
+#define NUMPOINTS  1024
+#define SQUARE(A)      ((A) * (A))
+
+#define NDEBUG
+
+typedef double MyData;
+
+typedef struct PairPoints
+{
+  MyData x[2];
+  MyData y[2];
+} PairPoints;
+
+void GPUMaxDistance(const PairPoints *const points,
+		    const int               size)
+{
+  /* use the GPU synchronously                                                  */
+  /* - 'distance' must be mapped on the GPU (i.e. allocated in the global       */
+  /*   memory of the GPU); otherwise it is firstprivate by default (i.e.        */
+  /*   private to each thread) and the host cannot access a thread's private    */
+  /*   memory.                                                                  */
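+  /* with reduction(max: distance) every thread updates a private copy, and     */
+  /* the partial maxima are combined into 'distance' when the target region     */
+  /* ends                                                                       */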
+
+  MyData distance = -1.0; /* the initial value also takes part in the max reduction */
+
+#pragma omp target                                \
+        teams distribute                          \
+        parallel for reduction(max: distance)     \
+        map(to: points[0:size]) map(from: distance)
+  for (size_t i=0 ; i<size ; i++)
+    {
+#if !defined(NDEBUG)
+      
+      const int nteam  = omp_get_num_teams();
+      const int team   = omp_get_team_num();
+      const int tid    = omp_get_thread_num();
+      const int nthr   = omp_get_num_threads();
+      const int whoAmI = tid + (team * nthr);
+
+      if (!omp_is_initial_device() && !whoAmI)
+	{
+	  printf("\n\t GPU is running:");
+	  printf("\n\t\t Number of teams: %d - Number of threads per team: %d\n\n", nteam, nthr);
+	}
+      
+#endif /* NDEBUG */
+      
+      const PairPoints myPoint      = points[i];
+      const MyData pair_distance_X2 = SQUARE(myPoint.x[0] - myPoint.x[1]);
+      const MyData pair_distance_Y2 = SQUARE(myPoint.y[0] - myPoint.y[1]);
+      const MyData pair_distance    = sqrt(pair_distance_X2 + pair_distance_Y2);
+
+      distance = ((distance < pair_distance) ? pair_distance : distance);
+    }
+
+  printf("\n\t GPU maximum distance: %lg\n", distance);
+  
+  return;
+}
+
+void CPUMaxDistance(const PairPoints *const points,
+		    const        int        size)
+{
+  MyData distance = -1.0;
+  for (size_t i=0 ; i<size ; i++)
+    {
+      const MyData pair_distance_X2 = SQUARE(points[i].x[0] - points[i].x[1]);
+      const MyData pair_distance_Y2 = SQUARE(points[i].y[0] - points[i].y[1]);
+      const MyData pair_distance    = sqrt(pair_distance_X2 + pair_distance_Y2);
+
+      distance = ((distance < pair_distance) ? pair_distance : distance);
+    }
+
+  printf("\n\t CPU maximum distance: %lg\n", distance);
+  
+  return;
+}
+
+int main()
+{
+  /* host allocation */
+  PairPoints *points = (PairPoints *)malloc(NUMPOINTS * sizeof(*points));
+  assert(points != NULL);
+
+  /* initialization */
+  srand48(time(NULL));
+  for(size_t i=0 ; i<NUMPOINTS ; i++)
+    {
+      points[i].x[0] = drand48();
+      points[i].x[1] = drand48();
+      points[i].y[0] = drand48();
+      points[i].y[1] = drand48();
+    }
+
+  // launch the kernel synchronously
+  GPUMaxDistance(points, NUMPOINTS);
+
+  // check the result on the host
+  CPUMaxDistance(points, NUMPOINTS);
+  
+  free(points);
+  
+  return 0;
+}
diff --git a/cuda-omp/omp/3/classwork_async.c b/cuda-omp/omp/3/classwork_async.c
new file mode 100644
index 0000000000000000000000000000000000000000..61c5f4e17f1a439687f5c7ef9d4d08c92bad2ca2
--- /dev/null
+++ b/cuda-omp/omp/3/classwork_async.c
@@ -0,0 +1,145 @@
+//////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// OpenMP GPU Offload is available only on systems with NVIDIA GPUs with compute capability '>= cc70'
+//
+// Assignment : write an OMP-GPU code that does:
+//             - each thread generates a pair of points as (x, y) coordinates randomly distributed;
+//             - each thread computes the euclidean distance d = sqrt((x1 - x2)^2 + (y1 - y2)^2);
+//             - the kernel prints the maximum distance;
+//             - host manages asynchronously the GPU.
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 06.07.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork_async.c -o classwork_async_omp -lm
+// - Run the code:
+//   $ ./classwork_async_omp
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <omp.h>
+#include <time.h>
+#include <assert.h>
+
+#define NUMPOINTS  65536
+#define SQUARE(A)  ((A) * (A))
+
+#define NDEBUG
+
+typedef double MyData;
+
+typedef struct PairPoints
+{
+  MyData x[2];
+  MyData y[2];
+} PairPoints;
+
+/* global host-device variable (i.e. present in both global memories): */
+/* a copy of 'DeviceMaxDistance' is created in the global memory of    */
+/* the GPU and lives from the beginning of the program until its end.  */
+#pragma omp declare target
+MyData DeviceMaxDistance;
+#pragma omp end declare target
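+
+/* The device copy of 'DeviceMaxDistance' has static storage and is therefore */
+/* zero-initialized; since distances are non-negative, zero is a valid seed   */
+/* for the max reduction. To seed it explicitly one could, for instance, do:  */
+/*                                                                            */
+/*   DeviceMaxDistance = -1.0;                                                */
+/*   #pragma omp target update to(DeviceMaxDistance)                          */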
+
+void GPUMaxDistance(const PairPoints *const points,
+		    const int               size)
+{
+#pragma omp target nowait depend(out: DeviceMaxDistance)  \
+        teams distribute                                  \
+        parallel for reduction(max: DeviceMaxDistance)    \
+        map(to: points[0:size])
+  for (size_t i=0 ; i<size ; i++)
+    {
+#if !defined(NDEBUG)
+      
+      const int nteam  = omp_get_num_teams();
+      const int team   = omp_get_team_num();
+      const int tid    = omp_get_thread_num();
+      const int nthr   = omp_get_num_threads();
+      const int whoAmI = tid + (team * nthr);
+
+      if (!omp_is_initial_device() && !whoAmI)
+	{
+	  printf("\n\t GPU is running:");
+	  printf("\n\t\t Number of teams: %d - Number of threads per team: %d\n\n", nteam, nthr);
+	}
+      
+#endif /* NDEBUG */
+      
+      const PairPoints myPoint      = points[i];
+      const MyData pair_distance_X2 = SQUARE(myPoint.x[0] - myPoint.x[1]);
+      const MyData pair_distance_Y2 = SQUARE(myPoint.y[0] - myPoint.y[1]);
+      const MyData pair_distance    = sqrt(pair_distance_X2 + pair_distance_Y2);
+
+      DeviceMaxDistance = ((DeviceMaxDistance < pair_distance) ? pair_distance : DeviceMaxDistance);
+    } /* end-target region */
+
+  /* Host-device data synchronization: the host fetches data from the GPU.   */
+  /* By default the host would wait for the transfer to complete; the        */
+  /* 'nowait' clause lets the target update construct execute asynchronously */
+  /* with respect to the encountering host thread, while the 'depend' clause */
+  /* orders it after the target task above.                                  */
+  #pragma omp target update from(DeviceMaxDistance) nowait depend(in: DeviceMaxDistance)
+	
+  return;
+}
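+
+/* Without the 'depend' clauses the two 'nowait' target tasks would be        */
+/* unordered, and the update might fetch a stale value before the kernel      */
+/* completes. A simpler synchronous sketch that needs no dependences is:      */
+/*                                                                            */
+/*   #pragma omp target update from(DeviceMaxDistance)  // host blocks here   */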
+
+void CPUMaxDistance(const PairPoints *const points,
+		    const        int        size)
+{
+  MyData distance = -1.0;
+  for (size_t i=0 ; i<size ; i++)
+    {
+      const MyData pair_distance_X2 = SQUARE(points[i].x[0] - points[i].x[1]);
+      const MyData pair_distance_Y2 = SQUARE(points[i].y[0] - points[i].y[1]);
+      const MyData pair_distance    = sqrt(pair_distance_X2 + pair_distance_Y2);
+
+      distance = ((distance < pair_distance) ? pair_distance : distance);
+    }
+
+  printf("\n\t CPU maximum distance: %lg\n", distance);
+  
+  return;
+}
+
+int main()
+{
+  /* host allocation */
+  PairPoints *points = (PairPoints *)malloc(NUMPOINTS * sizeof(*points));
+  assert(points != NULL);
+
+  /* initialization */
+  srand48(time(NULL));
+  for(size_t i=0 ; i<NUMPOINTS ; i++)
+    {
+      points[i].x[0] = drand48();
+      points[i].x[1] = drand48();
+      points[i].y[0] = drand48();
+      points[i].y[1] = drand48();
+    }
+  
+  /* launch the kernel asynchronously (i.e. the host */
+  /* does not wait for the GPU to finish execution)  */
+  GPUMaxDistance(points, NUMPOINTS);
+
+  // check the result on the host
+  CPUMaxDistance(points, NUMPOINTS);
+
+  /* host-device synchronization in order to ensure that the */
+  /* kernel has been executed on the GPU and the global      */
+  /* variable 'DeviceMaxDistance' is synchronized between    */
+  /* host and device data environment.                       */
+  #pragma omp taskwait
+  
+  printf("\n\t GPU maximum distance: %lg\n", DeviceMaxDistance);
+  
+  free(points);
+  
+  return 0;
+}
diff --git a/cuda-omp/omp/4/classwork.c b/cuda-omp/omp/4/classwork.c
new file mode 100644
index 0000000000000000000000000000000000000000..6fc63e79f3e28a0d6c72bbe492ea69b35cbcde7a
--- /dev/null
+++ b/cuda-omp/omp/4/classwork.c
@@ -0,0 +1,183 @@
+//////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// OpenMP GPU Offload is available only on systems with NVIDIA GPUs with compute capability '>= cc70'
+//
+// Assignment : write an OMP-GPU code corresponding to the
+// following sequential C code
+//
+// #include <stdio.h>
+// int main()
+// {
+//   /* Matrix multiplication: C = A X B */
+//
+//   int A[N][N], B[N][N], C[N][N] = {0};
+//
+//   for (int ii=0 ; ii<N ; ii++)
+//      for (int jj=0 ; jj<N ; jj++)
+//         for (int kk=0 ; kk<N ; kk++)
+//            C[ii][jj] += A[ii][kk] * B[kk][jj];
+//
+//   return 0;
+// }
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 08.07.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork.c -o classwork_omp
+// - Run the code:
+//   $ ./classwork_omp <N>
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+#include <assert.h>
+//#define NDEBUG
+
+#define BLOCKSIZE 32
+#define SQUARE(SIZE) ((SIZE) * (SIZE))
+
+typedef long int MyData;
+
+void matrix_init(      MyData *const AB,
+		 const MyData        N)
+{
+  MyData *const restrict A = AB;
+  MyData *const restrict B = AB + SQUARE(N);
+  
+  for (MyData ii=0 ; ii<N ; ii++)
+    for (MyData jj=0 ; jj<N ; jj++)
+      {
+	A[(ii * N) + jj] = (ii - jj);
+	B[(ii * N) + jj] = 1;
+      }
+  
+  return;
+}
+
+void GPU_MM(const MyData *const restrict A,
+	    const MyData *const restrict B,
+	          MyData *const restrict C,
+	    const MyData                 N)
+{
+  /* spawning N^2 threads */
+  /* no map clause is specified here because we rely on the 'omp target enter/exit data' constructs in main() */
+
+#pragma omp target                                                            \
+            teams distribute num_teams((SQUARE(N) + BLOCKSIZE - 1)/BLOCKSIZE) \
+            parallel for collapse(2) num_threads(BLOCKSIZE)
+  for (MyData ii=0 ; ii<N ; ii++)
+    { 
+      for (MyData jj=0 ; jj<N ; jj++)
+	{
+#if !defined(NDEBUG)
+	  
+	  /* ID within the team (CUDA block) */
+	  const MyData localID  = omp_get_thread_num();
+	  /* Team ID (CUDA block ID) */
+	  const MyData team     = omp_get_team_num();
+	  /* Team size (CUDA block size) */
+	  const MyData nthr     = omp_get_num_threads();
+	  /* global thread index */
+	  const MyData globalID = (localID + (team * nthr));
+
+	  printf("\n\t globalID: %ld - ii: %ld - jj: %ld\n", globalID, ii, jj);
+
+#endif /* NDEBUG */	  
+	  
+	  MyData sum = 0;
+	  for (MyData kk=0 ; kk<N ; kk++)
+	    {
+	      sum += (A[(ii * N) + kk] * B[(kk * N) + jj]);
+	    } /* kk loop */
+	  C[(ii * N) + jj] = sum;
+	} /* jj loop */
+    } /* ii loop */
+
+  return;
+}
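+
+/* The num_teams(...) / num_threads(BLOCKSIZE) pair mimics a CUDA launch      */
+/* configuration: ceil(N^2 / BLOCKSIZE) blocks of BLOCKSIZE threads each.     */
+/* A sketch of a more concise alternative, assuming a compiler supporting the */
+/* OpenMP 5.0 'loop' construct, leaves the configuration to the runtime:      */
+/*                                                                            */
+/*   #pragma omp target teams loop collapse(2)                                */
+/*   for (MyData ii=0 ; ii<N ; ii++)                                          */
+/*     for (MyData jj=0 ; jj<N ; jj++)                                        */
+/*       { ... }                                                              */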
+
+void check(const MyData *const __restrict__ GPU_MM,
+	   const MyData *const __restrict__ A,
+	   const MyData                     N)
+{
+  int flag = 0;
+
+  for (MyData ii=0 ; ii<N ; ii++)
+    {
+      MyData row_a = 0;      
+      for (MyData kk=0 ; kk<N ; kk++)
+	row_a += A[(ii * N) + kk];
+
+      for (MyData jj=0 ; jj<N ; jj++)
+	if (GPU_MM[(ii * N) + jj] != row_a)
+	  {
+	    flag = 1;
+	    break;
+	  }
+    }
+
+  if (flag)
+    printf("\n\t Result WRONG \n\n");
+  else
+    printf("\n\t Result OK \n\n");
+  
+  return;
+}
+
+int main(int argc, char *argv[])
+{
+  MyData N;
+  
+  if (argc < 2)
+    {
+      printf("\n\t Usage: $ ./classwork <number_of_matrix_elements> \n");
+      exit(EXIT_FAILURE);
+    }
+  else
+    {
+      N = atol(argv[1]);
+      if (N <= 0)
+	{
+	  printf("\n\t Matrix order must be greater than zero \n");
+	  exit(EXIT_FAILURE);
+	}
+      else
+	{
+	  printf("\n\t Matrix size    : %ld x %ld", N, N);
+	}
+	}
+    }
+
+  /* host memory allocation */
+  MyData *buffer = (MyData *)malloc(3 * SQUARE(N) * sizeof(MyData));
+  assert(buffer != NULL);
+  /* set-up host pointers */
+  MyData *const restrict A = buffer;
+  MyData *const restrict B = A + SQUARE(N);
+  MyData *const restrict C = B + SQUARE(N);
+  
+  // matrix initialization
+  matrix_init(buffer, N);
+
+  /* device memory allocation and copy */
+#pragma omp target enter data map(to   : A[0:SQUARE(N)]) \
+                              map(to   : B[0:SQUARE(N)]) \
+                              map(alloc: C[0:SQUARE(N)])
+
+  GPU_MM(A, B, C, N);
+
+  /* host-device data synchronization */
+#pragma omp target exit data map(from: C[0:SQUARE(N)])
+
+  // check the result
+  check(C, A, N);
+
+  free(buffer);
+  
+  return 0;
+}
diff --git a/cuda-omp/omp/6/classwork.c b/cuda-omp/omp/6/classwork.c
new file mode 100644
index 0000000000000000000000000000000000000000..a7605b6813feb2a1e909bef627c03f58b85aeb30
--- /dev/null
+++ b/cuda-omp/omp/6/classwork.c
@@ -0,0 +1,325 @@
+////////////////////////////////////////////////////////////////////////////////////////////////
+// - You are given a 512 x 512 integer matrix M;
+// - Each row is assigned to a thread-block;
+// - Each thread is assigned a matrix element M[i][j];
+// - It changes M[i][j] to M[i][j] + M[i][j+1] (where possible);
+// - Exploit shared-memory.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 14.07.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork.c -o classwork_omp
+// - Run the code:
+//   $ ./classwork_omp
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <assert.h>
+#include <omp.h>
+#include <string.h>
+
+#define N                     512
+#define SIZE                  (N * N) // matrix size
+typedef int64_t MyData;               // do not change
+#define BLOCKSIZE             N       // number of threads per block
+
+// sanity check
+#if BLOCKSIZE > 1024
+#error BLOCKSIZE cannot be larger than 1024
+#endif
+
+#if BLOCKSIZE != N
+#error BLOCKSIZE must be equal to N
+#endif
+
+#define LOOP 100
+#define NDEBUG
+
+double wall_time()
+{
+  struct timespec ts;
+  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
+  const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;
+
+  return ret;
+}
+
+void CPU_matrix(MyData *const matrix)
+{
+  for (size_t i=0 ; i<N ; i++)
+    for (size_t j=0 ; j<N-1 ; j++)
+      matrix[(i * N) + j] += matrix[(i * N) + j + 1];
+
+  return;
+}
+
+void GPU_matrix_loops(      MyData *const restrict matrix,
+		      const MyData *const restrict buffer)
+{
+#pragma omp target teams distribute num_teams(N)
+  for (size_t i=0 ; i<N ; i++)
+    {
+      #pragma omp parallel for num_threads(N)
+      for (size_t j=0 ; j<N ; j++)
+	{
+#if !defined(NDEBUG)
+
+	  const int nteam  = omp_get_num_teams();
+	  const int team   = omp_get_team_num();
+	  const int tid    = omp_get_thread_num();
+	  const int nthr   = omp_get_num_threads();
+	  const int whoAmI = tid + (team * nthr);
+	    
+	  if (!omp_is_initial_device() && !tid)
+	    printf("\n\t\t Team: %d - nthr: %d - whoAmI: %d - i: %ld - j: %ld\n\n",
+		   team, nthr, whoAmI, i, j);
+
+#endif /* NDEBUG */
+	   
+	  matrix[(i * N) + j] = ((j < N-1)                                       ?
+				 (buffer[(i * N) + j] + buffer[(i * N) + j + 1]) :
+				 buffer[(i * N) + j]);
+	} /* j loop distributed across threads within a team */
+    } /* i loop distributed across teams */
+  
+  return;
+}
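+
+/* Each team (CUDA thread-block) takes one row 'i', and the row's elements    */
+/* are spread over the team's threads by the inner 'parallel for'. The next   */
+/* version rebuilds the same decomposition without loop constructs,           */
+/* computing the global index by hand from the team and thread IDs.           */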
+
+void GPU_matrix_no_loops(      MyData *const restrict matrix,
+			 const MyData *const restrict buffer)
+{
+ #pragma omp target
+  {
+   #pragma omp teams num_teams(N)
+    {
+      const size_t team = omp_get_team_num();
+
+     #pragma omp parallel firstprivate(matrix, buffer, team) num_threads(N)
+      {
+	const size_t tid      = omp_get_thread_num();
+	const size_t globalID = ((team * N) + tid);
+
+	matrix[globalID] = ((tid < (N - 1))                           ?
+			    (buffer[globalID] + buffer[globalID + 1]) :
+			    buffer[globalID]);
+      } /* omp parallel */
+    } /* omp teams */
+  } /* omp target */
+
+  return;
+}
+
+void GPU_matrix_shared_loops(MyData *const restrict matrix)
+{
+  #pragma omp target teams distribute num_teams(N)
+  for (size_t i=0 ; i<N ; i++)
+    {
+      MyData buffer[BLOCKSIZE + 1];
+      
+      #pragma omp parallel for shared(i, buffer) num_threads(N)
+      for (size_t j=0 ; j<N ; j++)
+	{
+#if !defined(NDEBUG)
+
+	  const int nteam  = omp_get_num_teams();
+	  const int team   = omp_get_team_num();
+	  const int tid    = omp_get_thread_num();
+	  const int nthr   = omp_get_num_threads();
+	  const int whoAmI = tid + (team * nthr);
+	    
+	  if (!omp_is_initial_device() && !tid)
+	    printf("\n\t\t Team: %d - nthr: %d - whoAmI: %d - i: %ld - j: %ld\n\n",
+		   team, nthr, whoAmI, i, j);
+
+#endif /* NDEBUG */
+
+	  /* shared memory initialization */
+	  for (size_t item=j ; item<(BLOCKSIZE + 1) ; item+=BLOCKSIZE)
+	    buffer[item] = (MyData)0;
+
+	  /* load data into shared memory */
+	  const size_t globalID = ((i * N) + j);
+
+	  buffer[j] = matrix[globalID];
+	} /* implicit thread-block synchronization */
+
+      #pragma omp parallel for shared(i , buffer) num_threads(N)
+      for (size_t j=0 ; j<N ; j++)
+	{
+	  const size_t globalID = ((i * N) + j);
+	  
+	  matrix[globalID] = (buffer[j] + buffer[j + 1]);
+	} /* implicit thread-block synchronization */
+    } /* i loop distributed across teams */
+  
+  return;
+}
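+
+/* The two back-to-back 'parallel for' regions inside a team play the role of */
+/* CUDA's __syncthreads(): the implicit barrier at the end of the first       */
+/* parallel region guarantees that 'buffer' is fully populated before any     */
+/* thread reads its neighbour's element in the second region.                 */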
+
+void GPU_matrix_shared_no_loops(MyData *const restrict matrix)
+{
+  #pragma omp target
+  {
+    #pragma omp teams num_teams(N)
+    {
+      MyData buffer[BLOCKSIZE + 1];
+      
+      const size_t team_N = (N * omp_get_team_num());
+
+      #pragma omp parallel shared(buffer) firstprivate(team_N, matrix) num_threads(N)
+      {
+	const size_t tid = omp_get_thread_num();
+	
+	for (size_t item=tid ; item<(BLOCKSIZE + 1) ; item+=BLOCKSIZE)
+	  buffer[item] = (MyData)0;
+
+	buffer[tid] = matrix[team_N + tid];
+      }
+
+      #pragma omp parallel shared(buffer) firstprivate(team_N, matrix) num_threads(N)
+      {
+	const size_t tid = omp_get_thread_num();
+
+	matrix[team_N + tid] = (buffer[tid] + buffer[tid + 1]);
+      }
+    } /* omp teams */
+  } /* omp target */
+
+  return;
+}
+
+void check(const MyData *const __restrict__ cpu_matrix,
+	   const MyData *const __restrict__ gpu_matrix,
+	   const char   *const __restrict__ msg)
+{
+  int flag = 0;
+  for (size_t i=0 ; i<SIZE ; i++)
+    flag |= ((cpu_matrix[i] != gpu_matrix[i]) ? 1 : 0);
+
+  if (!flag)
+    printf("\n\t Result OK %s", msg);
+  else
+    printf("\n\t Result wrong %s", msg);
+  
+  return;
+}
+
+int main()
+{
+  double time;
+  // host reference matrix A
+  MyData *A = (MyData *)malloc(SIZE * sizeof(MyData));
+  assert (A != NULL);
+  for (size_t i=0 ; i<SIZE ; i++)
+    A[i] = (lrand48() % SIZE);
+
+  ////////////////////////// CPU MATRIX //////////////////////////////////////////
+  // allocate a second matrix where to perform the actual computation
+  MyData *matrix = (MyData *)malloc(SIZE * sizeof(MyData));
+  assert(matrix != NULL);
+
+  // perform the actual computation
+  time = 0.0;
+  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
+    {
+      memcpy(matrix, A, (SIZE * sizeof(MyData)));
+      
+      const double start = wall_time();
+      CPU_matrix(matrix);
+      time += (wall_time() - start);
+    }
+  printf("\n\t CPU time %lg [s]\n", (time / LOOP));
+  ////////////////////////////////////////////////////////////////////////////////
+
+  ////////////////////////////// GPU MATRIX //////////////////////////////////////
+  MyData *gpu_matrix = (MyData *)malloc(SIZE * sizeof(MyData));
+  assert(gpu_matrix != NULL);  
+
+#pragma omp target data map(to: A[0:SIZE]) map(from: gpu_matrix[0:SIZE])
+  {
+    time = 0.0;
+    for (unsigned short int loop=0 ; loop<LOOP ; loop++)
+      {      
+	double start = wall_time();
+	GPU_matrix_loops(gpu_matrix, A);
+	time += (wall_time() - start);
+      }
+  } /* omp target data */
+
+  check(matrix, gpu_matrix, "\t global memory implementation with loops --->");
+  printf("\n\t GPU time  %lg [s]\n", (time / LOOP));
+  //////////////////////////////////////////////////////////////////////////////////////////
+
+  ////////////////////////////// GPU MATRIX //////////////////////////////////////
+
+#pragma omp target data map(to: A[0:SIZE]) map(from: gpu_matrix[0:SIZE])
+  {
+    time = 0.0;
+    for (unsigned short int loop=0 ; loop<LOOP ; loop++)
+      {      
+	double start = wall_time();
+	GPU_matrix_no_loops(gpu_matrix, A);
+	time += (wall_time() - start);
+      }
+  } /* omp target data */
+
+  check(matrix, gpu_matrix, "\t global memory implementation without loops --->");
+  printf("\n\t GPU time  %lg [s]\n", (time / LOOP));
+  //////////////////////////////////////////////////////////////////////////////////////////
+
+  ////////////////////////// GPU MATRIX shared memory //////////////////////////////////////
+  memcpy(gpu_matrix, A, (SIZE * sizeof(MyData)));
+  /* gpu_matrix is allocated on the GPU */
+  #pragma omp target enter data map(alloc: gpu_matrix[0:SIZE])
+    
+  time = 0.0;
+  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
+    {
+      // restore data into gpu_matrix
+      #pragma omp target update to(gpu_matrix[0:SIZE])
+
+      double start = wall_time();
+      GPU_matrix_shared_loops(gpu_matrix);
+      time += (wall_time() - start);
+    }
+
+  #pragma omp target exit data map(from: gpu_matrix[0:SIZE])  
+  check(matrix, gpu_matrix, "\t shared memory implementation with loops --->");
+  printf("\n\t GPU time  %lg [s]\n", (time / LOOP));
+  //////////////////////////////////////////////////////////////////////////////////////////
+
+  ////////////////////////// GPU MATRIX shared memory //////////////////////////////////////
+  memcpy(gpu_matrix, A, (SIZE * sizeof(MyData)));
+  /* gpu_matrix is allocated on the GPU */
+  #pragma omp target enter data map(alloc: gpu_matrix[0:SIZE])
+    
+  time = 0.0;
+  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
+    {
+      // restore data into gpu_matrix
+      #pragma omp target update to(gpu_matrix[0:SIZE])
+
+      double start = wall_time();
+      GPU_matrix_shared_no_loops(gpu_matrix);
+      time += (wall_time() - start);
+    }
+
+  #pragma omp target exit data map(from: gpu_matrix[0:SIZE])  
+  check(matrix, gpu_matrix, "\t shared memory implementation without loops --->");
+  printf("\n\t GPU time  %lg [s]\n", (time / LOOP));
+  //////////////////////////////////////////////////////////////////////////////////////////
+
+  // free CPU memory
+  free(gpu_matrix);
+  free(matrix);
+  free(A);
+  
+  return EXIT_SUCCESS;
+}
diff --git a/cuda-omp/omp/6/classwork_omp b/cuda-omp/omp/6/classwork_omp
new file mode 100755
index 0000000000000000000000000000000000000000..6766c7bdf174daf8fcd104c4bc67d02ba4885384
Binary files /dev/null and b/cuda-omp/omp/6/classwork_omp differ
diff --git a/cuda-omp/omp/miscellaneous/map_type_modifier.c b/cuda-omp/omp/miscellaneous/map_type_modifier.c
new file mode 100644
index 0000000000000000000000000000000000000000..65396c9473bf51f774493ae8246652f41ccdbe29
--- /dev/null
+++ b/cuda-omp/omp/miscellaneous/map_type_modifier.c
@@ -0,0 +1,69 @@
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 29.07.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v map_type_modifier.c -o map_type_modifier_omp
+// - Run the code:
+//   $ export OMP_TARGET_OFFLOAD=mandatory
+//   $ ./map_type_modifier_omp
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <omp.h>
+
+#define N     8
+typedef float MyData;
+
+void allocate(      MyData **buffer,
+	      const size_t   size)
+{
+  /* allocate the buffer on the host memory */
+  MyData *tmp = malloc(size * sizeof(**buffer));
+  assert(tmp != NULL);
+  *buffer = tmp;
+
+#pragma omp target enter data map(alloc: tmp[0:size])
+  
+  return;
+}
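+
+/* 'map(alloc: ...)' only reserves device storage: no host-to-device copy is  */
+/* performed on entry and no copy back on exit. Any data movement must then   */
+/* be requested explicitly, e.g. (sketch):                                    */
+/*                                                                            */
+/*   #pragma omp target update to(tmp[0:size])     // host   -> device        */
+/*   #pragma omp target update from(tmp[0:size])   // device -> host          */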
+
+void print(const MyData *const A,
+	   const int           size)
+{
+  printf("\n");
+  for (size_t i=0 ; i<N ; i++)
+    printf("\n\t array[%i] = %lg", i, A[i]);
+  printf("\n\n");
+  
+  return;
+}
+
+int main()
+{
+  MyData *array = NULL;
+  allocate(&array, N);
+
+  /* init on the GPU */
+  #pragma omp target
+  {
+    for (size_t i=0 ; i<N ; i++)
+      array[i] = (MyData)(i);
+  }
+
+  #pragma omp target update from(array[0:N])
+  print(array, N);
+  
+  /* free the device's memory */
+  #pragma omp target exit data map(delete: array[0:N])
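+  /* 'delete' drops the device copy without any copy back, regardless of its */
+  /* reference count; 'map(from: array[0:N])' would instead transfer the     */
+  /* data to the host before releasing the device storage.                   */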
+
+  /* free the host's memory */
+  free(array);
+
+  return 0;
+}
diff --git a/cuda-omp/omp/miscellaneous/structure.c b/cuda-omp/omp/miscellaneous/structure.c
new file mode 100644
index 0000000000000000000000000000000000000000..c12b212f98c7a4d84a3c80829dcd2c305b47295e
--- /dev/null
+++ b/cuda-omp/omp/miscellaneous/structure.c
@@ -0,0 +1,82 @@
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 31.07.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v structure.c -o structure_omp
+// - Run the code:
+//   $ export OMP_TARGET_OFFLOAD=mandatory
+//   $ ./structure_omp
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <omp.h>
+
+#define NELEMS  8  /* 'N' would collide with the struct member 'N' below */
+typedef double MyData;
+
+typedef struct my_span
+{
+  size_t N;
+  MyData *ptr;
+} span;
+
+void allocate(      span  *my_struct,
+	      const size_t size)
+{
+  span tmp;
+  /* allocate the buffer on the host memory */
+  tmp.ptr = (MyData *)malloc(size * sizeof(MyData));
+  assert(tmp.ptr != NULL);
+  tmp.N = size;
+
+  /* declare how objects of type 'span' are mapped to the device: the struct */
+  /* itself and the pointed-to buffer are mapped 'from', i.e. device storage */
+  /* is allocated on entry and data is copied back to the host on exit       */
+  #pragma omp declare mapper(span tmp) map(from: tmp, tmp.ptr[0: tmp.N])
+
+  my_struct->ptr = tmp.ptr;
+  my_struct->N   = tmp.N;
+  
+  return;
+}
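+
+/* With the mapper in place, a 'span' can be mapped in a single clause; a     */
+/* sketch of an explicit map that names the (default) mapper, assuming an     */
+/* OpenMP 5.0 compiler, would be:                                             */
+/*                                                                            */
+/*   #pragma omp target map(mapper(default), tofrom: A)                       */
+/*   { ...use A.ptr[]... }                                                    */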
+
+void print(const span *const A)
+{
+  printf("\n");
+  for (size_t i=0 ; i<A->N ; i++)
+    printf("\n\t array[%i] = %lg", i, A->ptr[i]);
+  printf("\n\n");
+  
+  return;
+}
+
+int main()
+{
+  span A, B;
+
+  allocate(&A, N);
+  allocate(&B, N);
+
+  /* init on the GPU */
+  #pragma omp target
+  {
+    for (size_t i=0 ; i<N ; i++)
+      {
+	A.ptr[i] = (MyData)(i);
+	B.ptr[i] = (MyData)(2 * i);
+      }
+  }
+
+  print(&A);
+  print(&B);
+  
+  /* free the host's memory */
+  free(A.ptr);
+  free(B.ptr);
+
+  return 0;
+}