diff --git a/cuda-omp/cuda/7/mat_mult_block.cu b/cuda-omp/cuda/7/mat_mult_block.cu
index 7808f021a87df0329a4784f72fd1966aa133e737..814f21c8f935f7e60198fad20af441cc465f1330 100644
--- a/cuda-omp/cuda/7/mat_mult_block.cu
+++ b/cuda-omp/cuda/7/mat_mult_block.cu
@@ -257,9 +257,9 @@ int main()
   const dim3 block   = {(BLOCK * BLOCK), 1, 1};
   
   /////////////////////////// GPU naive block algorithm ////////////////////////////////////////
-  time = 0.0;
   GPU_mat_mult_block<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up
   cudaDeviceSynchronize();
+  time = 0.0;
   for (unsigned short int loop=0 ; loop<LOOP ; loop++)
     {
       const double start = wall_time();
@@ -274,9 +274,9 @@ int main()
   /////////////////////////////////////////////////////////////////////////////////////
 
   /////////////////////////// GPU block shared memory algorithm ///////////////////////
-  time = 0.0;
   GPU_mat_mult_block_shared<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up
   cudaDeviceSynchronize();
+  time = 0.0;
   for (unsigned short int loop=0 ; loop<LOOP ; loop++)
     {
       const double start = wall_time();
diff --git a/cuda-omp/hybrid/hybrid_cublas_omp.c b/cuda-omp/hybrid/hybrid_cublas_omp.c
new file mode 100644
index 0000000000000000000000000000000000000000..ab6aabe9a9984b5b0f30c2b5e7c91b94293f2336
--- /dev/null
+++ b/cuda-omp/hybrid/hybrid_cublas_omp.c
@@ -0,0 +1,282 @@
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Using CUDA data in OpenMP
+//
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 03.09.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//      - using nvc
+//          $ nvc -O3 -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v
+//                 hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas
+//      - using clang
+//          $ clang -O3 -v -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda 
+//                 hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas
+//
+// - Run the code:
+//   $ export OMP_TARGET_OFFLOAD=mandatory
+//   $ ./hybrid_cuda_omp
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include <assert.h>
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <omp.h>
+
+#define N       2048
+#define SIZE    ((N) * (N))
+#define ALPHA   1.0
+#define BETA    0.0
+#define HOST    0
+#define DEV     1
+#define LOOP    10
+#define INIT    0
+#define KERNEL  1
+#define DATA    2
+
+typedef double MyData;
+
+static double _time[2][3];
+static int thr[2];
+
+double process_time()
+{
+  struct timespec ts;
+  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
+  const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;
+
+  return ret;
+}
+
+double thread_time()
+{
+  struct timespec ts;
+  clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
+  const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;
+
+  return ret;
+}
+
+void InitHost(MyData *const restrict A,
+	      MyData *const restrict B,
+	      MyData *const restrict C,
+	      int    *const restrict thr)
+{
+  double start;
+
+  #pragma omp parallel 
+  {
+    #pragma omp barrier
+    #pragma omp master
+    {
+      *thr = omp_get_num_threads();
+      start = thread_time();
+    }
+   #pragma omp barrier
+
+    #pragma omp for collapse(2)
+    for (int i=0 ; i<N ; i++)
+      for (int j=0 ; j<N ; j++)
+	{
+	  A[(i * N) + j] = 1.0;
+	  B[(i * N) + j] = 2.0;
+	  C[(i * N) + j] = 0.0;
+	}
+
+    #pragma omp master
+    {
+      _time[HOST][INIT] += (thread_time() - start); 
+    }
+  } // omp parallel
+
+  return;
+}
+
+void InitDev(MyData *const restrict A,
+	     MyData *const restrict B,
+	     MyData *const restrict C)
+{
+  const double start = thread_time();
+
+ #pragma omp target teams loop collapse(2)
+  for (int i=0 ; i<N ; i++)
+    for (int j=0 ; j<N ; j++)
+      {
+	A[(i * N) + j] = 1.0;
+	B[(i * N) + j] = 2.0;
+	C[(i * N) + j] = 0.0;
+      }
+
+  _time[DEV][INIT] += (thread_time() - start);
+
+  return;
+}
+
+void HostDgemm(MyData *const restrict A,
+	       MyData *const restrict B,
+	       MyData *const restrict C,
+	       const MyData           alpha,
+	       const MyData           beta,
+	       int    *const restrict thr)
+{
+  // C = alpha * A * B + beta * C;
+
+  double start;
+
+  // naive calculation
+ #pragma omp parallel
+  {
+    #pragma omp barrier
+    #pragma omp master
+    {
+      *thr = omp_get_num_threads();
+      start = thread_time();
+    }
+    #pragma omp barrier
+
+    #pragma omp for collapse(2)
+    for (int i=0 ; i<N ; i++)
+      for (int j=0 ; j<N ; j++)
+	{
+	  MyData sum = 0.0;
+	  for (int k=0 ; k<N ; k++)
+	    sum += A[(i * N) + k] * B[(k * N) + j];
+  
+	  C[(i * N) + j] = (alpha * sum) + (beta * C[(i * N) + j]);
+	}
+
+    #pragma omp master
+    {
+      _time[HOST][KERNEL] += (thread_time() - start);
+    }
+  } // omp parallel
+
+  return;
+}
+
+void check(MyData *const restrict host_array,
+	   MyData *const restrict dev_array)
+{
+  int flag = 0;
+  for (size_t i=0 ; i<SIZE ; i++)
+    flag = ((fabs(host_array[i] - dev_array[i]) > FLT_EPSILON) ? 1 : flag);
+
+  if (!flag)
+    printf("\n\t Result OK \n");
+  else
+    printf("\n\t Result wrong \n");
+  
+  return;
+}
+
+int main()
+{
+  // Host allocation
+  MyData *buffer = (MyData *)malloc(4 * SIZE * sizeof(MyData));
+  assert(buffer != NULL);
+  MyData *const restrict A     = buffer;
+  MyData *const restrict B     = A + SIZE;
+  MyData *const restrict C_CPU = B + SIZE;
+  MyData *const restrict C_GPU = C_CPU + SIZE;
+
+  // Spawning 2 host threads
+  #pragma omp parallel num_threads(2)
+  {
+    // Evaluate the Dgemm on the host
+    #pragma omp single nowait
+    {
+      // allowing nested parallelism
+      omp_set_max_active_levels(2);
+      
+      for (int loop=0 ; loop<LOOP ; loop++)
+	{
+	  InitHost(A, B, C_CPU, &thr[0]);
+	  HostDgemm(A, B, C_CPU, ALPHA, BETA, &thr[1]);
+	}
+    } // omp single
+
+    #pragma omp single nowait
+    {
+      // Initialize cuBLAS library
+      cublasHandle_t handle;
+      cublasCreate(&handle);
+
+      // Allocate A, B, C on the device using CUDA API
+      MyData *d_buffer = NULL;
+      cudaMalloc((void **)&d_buffer, (3 * SIZE * sizeof(MyData)));
+      assert(d_buffer != NULL);
+      MyData *const restrict d_A = d_buffer;
+      MyData *const restrict d_B = d_A + SIZE;
+      MyData *const restrict d_C = d_B + SIZE;
+
+      // get the default device
+      const int dev = omp_get_default_device();
+
+      // Associate external device pointers
+      omp_target_associate_ptr(A,     d_A, (SIZE * sizeof(MyData)), 0, dev);
+      omp_target_associate_ptr(B,     d_B, (SIZE * sizeof(MyData)), 0, dev);
+      omp_target_associate_ptr(C_GPU, d_C, (SIZE * sizeof(MyData)), 0, dev);
+      
+      for (int loop=0 ; loop<LOOP ; loop++)
+	{
+	  // Init device with blocking omp target directive
+	  InitDev(A, B, C_GPU);
+
+	  // Apply DGEMM using device pointers directly
+	  const MyData alpha = ALPHA;
+	  const MyData beta  = BETA;
+
+	  double start = thread_time();
+	  cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
+		      &alpha, 
+		      d_A, N, 
+		      d_B, N, 
+		      &beta, 
+		      d_C, N);
+
+	  // CUDA synchronization point
+	  cudaDeviceSynchronize();
+	  _time[DEV][KERNEL] += (thread_time() - start);
+
+	  // Fetch data from the device and deallocate
+	  start = thread_time();
+	  cudaMemcpy(C_GPU, d_C, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);
+	  _time[DEV][DATA] += (thread_time() - start);
+	} // LOOP
+
+      // release pointer association
+      omp_target_disassociate_ptr(A, dev);
+      omp_target_disassociate_ptr(B, dev);
+      omp_target_disassociate_ptr(C_GPU, dev);
+      
+      // deallocate device's memory
+      cudaFree(d_buffer);
+
+      cublasDestroy(handle);
+    } // omp single
+  } // synchronization point
+
+  check(C_CPU, C_GPU);
+
+  free(buffer);
+
+  printf("\n\t Matrix size: %d x %d\n", N, N);
+
+  printf("\n\t Host execution time:");
+  printf("\n\t\t Init      : %lg [s] - threads: %d", _time[HOST][INIT]/LOOP, thr[0]);
+  printf("\n\t\t Dgemm     : %lg [s] - threads: %d\n", _time[HOST][KERNEL]/LOOP, thr[1]);
+
+  printf("\n\t Device execution time:");
+  printf("\n\t\t Init      : %lg [s]", _time[DEV][INIT]/LOOP);
+  printf("\n\t\t Dgemm     : %lg [s]", _time[DEV][KERNEL]/LOOP);
+  printf("\n\t\t Fetch data: %lg [s]\n\n", _time[DEV][DATA]/LOOP);
+  
+  return 0;
+}
diff --git a/cuda-omp/hybrid/hybrid_cuda_omp.c b/cuda-omp/hybrid/hybrid_cuda_omp.c
deleted file mode 100644
index 823be5dba91c9d5d13dc7dc146a609ac862f2497..0000000000000000000000000000000000000000
--- a/cuda-omp/hybrid/hybrid_cuda_omp.c
+++ /dev/null
@@ -1,159 +0,0 @@
-////////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Passing OpenMP data to cuBlas.
-//
-// Author: David Goz
-// mail  : david.goz@inaf.it
-// date  : 02.09.2024
-// code tested using nvhpc
-//
-// - Compile the code:
-//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v
-//         hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas
-// - Run the code:
-//   $ export OMP_TARGET_OFFLOAD=mandatory
-//   $ ./hybrid_cuda_omp
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-#include <cuda_runtime.h>
-#include <cublas_v2.h>
-#include <assert.h>
-#include <float.h>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#define N     512
-#define SIZE  ((N) * (N))
-#define ALPHA 1.0
-#define BETA  0.0
-
-typedef double MyData;
-
-void InitHost(MyData *const restrict A,
-	      MyData *const restrict B,
-	      MyData *const restrict C)
-{
-  //#pragma omp parallel for collapse(2)
-  for (int i=0 ; i<N ; i++)
-    for (int j=0 ; j<N ; j++)
-      {
-	A[(i * N) + j] = 1.0;
-	B[(i * N) + j] = 2.0;
-	C[(i * N) + j] = 0.0;
-      } 
-}
-
-void InitDev(MyData *const restrict A,
-	     MyData *const restrict B,
-	     MyData *const restrict C)
-{
- #pragma omp target teams loop collapse(2)
-  for (int i=0 ; i<N ; i++)
-    for (int j=0 ; j<N ; j++)
-      {
-	A[(i * N) + j] = 1.0;
-	B[(i * N) + j] = 2.0;
-	C[(i * N) + j] = 0.0;
-      }
-
-  return;
-}
-
-void HostDgemm(MyData *const restrict A,
-	       MyData *const restrict B,
-	       MyData *const restrict C,
-	       const MyData           alpha,
-	       const MyData           beta)
-{
-  // C = alpha * A * B + beta * C;
-
-  // naive calculation
-  //  #pragma omp parallel for collapse(2)
-  for (int i=0 ; i<N ; i++)
-    for (int j=0 ; j<N ; j++)
-      {
-	MyData sum = 0.0;
-	for (int k=0 ; k<N ; k++)
-	  sum += A[(i * N) + k] * B[(k * N) + j];
-
-	C[(i * N) + j] = (alpha * sum) + (beta * C[(i * N) + j]);
-      }
-
-  return;
-}
-
-void check(MyData *const restrict host_array,
-	   MyData *const restrict dev_array)
-{
-  int flag = 0;
-  for (size_t i=0 ; i<SIZE ; i++)
-    flag = ((fabs(host_array[i] - dev_array[i]) > FLT_EPSILON) ? 1 : flag);
-
-  if (!flag)
-    printf("\n\t Result OK");
-  else
-    printf("\n\t Result wrong");
-  
-  return;
-}
-
-int main()
-{
-  // Host allocation
-  MyData *buffer = (MyData *)malloc(4 * SIZE * sizeof(MyData));
-  assert(buffer != NULL);
-  MyData *const restrict A  = buffer;
-  MyData *const restrict B  = A + SIZE;
-  MyData *const restrict C  = B + SIZE;
-  MyData *const restrict CC = C + SIZE;
-
-  // Spawning 2 host threads
-  #pragma omp parallel num_threads(2)
-  {
-    // Evaluate the Dgemm on the host
-    #pragma omp single nowait
-    {
-      InitHost(A, B, CC);
-      HostDgemm(A, B, CC, ALPHA, BETA);
-    } // omp single
-
-    #pragma omp single nowait
-    {
-      // Initialize cuBLAS library
-      cublasHandle_t handle;
-      cublasCreate(&handle);
-      
-      // Allocate A, B, C on the device
-      #pragma omp target enter data map(alloc: A[0:SIZE], B[0:SIZE], C[0:SIZE])
-
-      // Init device with blocking omp target directive
-      InitDev(A, B, C);
-
-      // Define a target data region where A, B, and C pointers
-      // refer to device's address space
-      #pragma omp target data use_device_addr(A, B, C)
-      {
-	MyData const alpha = ALPHA;
-	MyData const beta  = BETA;
-	
-	cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
-		    &alpha, A, N, B, N, &beta, C, N);
-
-	// CUDA synchronization point
-	cudaDeviceSynchronize();
-      }
-
-      // Fetch data from the device and deallocate
-      #pragma omp target exit data map(from: C[0:SIZE]) map(delete: A[0:SIZE], B[0:SIZE])
-      
-      cublasDestroy(handle);
-    } // omp single
-  } // synchronization point
-
-  check(CC, C);
-  
-  free(buffer);
-  
-  return 0;
-}
diff --git a/cuda-omp/hybrid/hybrid_omp_cublas.c b/cuda-omp/hybrid/hybrid_omp_cublas.c
new file mode 100644
index 0000000000000000000000000000000000000000..126eef389924cfbb0aeb0c696730d4b019716fa2
--- /dev/null
+++ b/cuda-omp/hybrid/hybrid_omp_cublas.c
@@ -0,0 +1,268 @@
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Passing OpenMP data to foreign runtime (cuBLAS library).
+//
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 02.09.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//      - using nvc
+//          $ nvc -O3 -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v
+//                 hybrid_omp_cuda.c -o hybrid_omp_cuda -lm -lcudart -lcublas
+//      - using clang
+//          $ clang -O3 -v -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda 
+//                 hybrid_omp_cuda.c -o hybrid_omp_cuda -lm -lcudart -lcublas
+//
+// - Run the code:
+//   $ export OMP_TARGET_OFFLOAD=mandatory
+//   $ ./hybrid_omp_cuda
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include <assert.h>
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <omp.h>
+
+#define N       2048
+#define SIZE    ((N) * (N))
+#define ALPHA   1.0
+#define BETA    0.0
+#define HOST    0
+#define DEV     1
+#define LOOP    10
+#define INIT    0
+#define KERNEL  1
+#define DATA    2
+
+typedef double MyData;
+
+static double _time[2][3];
+static int thr[2];
+
+double process_time()
+{
+  struct timespec ts;
+  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
+  const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;
+
+  return ret;
+}
+
+double thread_time()
+{
+  struct timespec ts;
+  clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
+  const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;
+
+  return ret;
+}
+
+void InitHost(MyData *const restrict A,
+	      MyData *const restrict B,
+	      MyData *const restrict C,
+	      int    *const restrict thr)
+{
+  double start;
+
+  #pragma omp parallel 
+  {
+    #pragma omp barrier
+    #pragma omp master
+    {
+      *thr = omp_get_num_threads();
+      start = thread_time();
+    }
+   #pragma omp barrier
+
+    #pragma omp for collapse(2)
+    for (int i=0 ; i<N ; i++)
+      for (int j=0 ; j<N ; j++)
+	{
+	  A[(i * N) + j] = 1.0;
+	  B[(i * N) + j] = 2.0;
+	  C[(i * N) + j] = 0.0;
+	}
+
+    #pragma omp master
+    {
+      _time[HOST][INIT] += (thread_time() - start); 
+    }
+  } // omp parallel
+
+  return;
+}
+
+void InitDev(MyData *const restrict A,
+	     MyData *const restrict B,
+	     MyData *const restrict C)
+{
+  const double start = thread_time();
+
+ #pragma omp target teams loop collapse(2)
+  for (int i=0 ; i<N ; i++)
+    for (int j=0 ; j<N ; j++)
+      {
+	A[(i * N) + j] = 1.0;
+	B[(i * N) + j] = 2.0;
+	C[(i * N) + j] = 0.0;
+      }
+
+  _time[DEV][INIT] += (thread_time() - start);
+
+  return;
+}
+
+void HostDgemm(MyData *const restrict A,
+	       MyData *const restrict B,
+	       MyData *const restrict C,
+	       const MyData           alpha,
+	       const MyData           beta,
+	       int    *const restrict thr)
+{
+  // C = alpha * A * B + beta * C;
+
+  double start;
+
+  // naive calculation
+ #pragma omp parallel
+  {
+    #pragma omp barrier
+    #pragma omp master
+    {
+      *thr = omp_get_num_threads();
+      start = thread_time();
+    }
+    #pragma omp barrier
+
+    #pragma omp for collapse(2)
+    for (int i=0 ; i<N ; i++)
+      for (int j=0 ; j<N ; j++)
+	{
+	  MyData sum = 0.0;
+	  for (int k=0 ; k<N ; k++)
+	    sum += A[(i * N) + k] * B[(k * N) + j];
+  
+	  C[(i * N) + j] = (alpha * sum) + (beta * C[(i * N) + j]);
+	}
+
+    #pragma omp master
+    {
+      _time[HOST][KERNEL] += (thread_time() - start);
+    }
+  } // omp parallel
+
+  return;
+}
+
+void check(MyData *const restrict host_array,
+	   MyData *const restrict dev_array)
+{
+  int flag = 0;
+  for (size_t i=0 ; i<SIZE ; i++)
+    flag = ((fabs(host_array[i] - dev_array[i]) > FLT_EPSILON) ? 1 : flag);
+
+  if (!flag)
+    printf("\n\t Result OK \n");
+  else
+    printf("\n\t Result wrong \n");
+  
+  return;
+}
+
+int main()
+{
+  // Host allocation
+  MyData *buffer = (MyData *)malloc(4 * SIZE * sizeof(MyData));
+  assert(buffer != NULL);
+  MyData *const restrict A  = buffer;
+  MyData *const restrict B  = A + SIZE;
+  MyData *const restrict C  = B + SIZE;
+  MyData *const restrict CC = C + SIZE;
+
+  // Spawning 2 host threads
+  #pragma omp parallel num_threads(2)
+  {
+    // Evaluate the Dgemm on the host
+    #pragma omp single nowait
+    {
+      // allowing nested parallelism
+      omp_set_max_active_levels(2);
+      
+      for (int loop=0 ; loop<LOOP ; loop++)
+	{
+	  InitHost(A, B, CC, &thr[0]);
+	  HostDgemm(A, B, CC, ALPHA, BETA, &thr[1]);
+	}
+    } // omp single
+
+    #pragma omp single nowait
+    {
+      // Initialize cuBLAS library
+      cublasHandle_t handle;
+      cublasCreate(&handle);
+
+      // Allocate A, B, C on the device
+      #pragma omp target enter data map(alloc: A[0:SIZE], B[0:SIZE], C[0:SIZE])
+
+      for (int loop=0 ; loop<LOOP ; loop++)
+	{
+	  // Init device with blocking omp target directive
+	  InitDev(A, B, C);
+
+	  // Define a target data region where A, B, and C pointers
+	  // refer to device's address space
+          #pragma omp target data use_device_ptr(A, B, C)
+	  {
+	    const MyData alpha = ALPHA;
+	    const MyData beta  = BETA;
+
+	    const double start = thread_time();
+	    cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
+			&alpha, 
+			A, N, 
+			B, N, 
+			&beta, 
+			C, N);
+
+	    // CUDA synchronization point
+	    cudaDeviceSynchronize();
+	    _time[DEV][KERNEL] += (thread_time() - start);
+	  }
+
+	  // Fetch data from the device and deallocate
+	  const double start = thread_time();
+          #pragma omp target update from (C[0:SIZE])
+	  _time[DEV][DATA] += (thread_time() - start);
+	} // LOOP
+      
+      // deallocate device's memory
+      #pragma omp target exit data map(delete: A[0:SIZE], B[0:SIZE], C[0:SIZE])
+
+      cublasDestroy(handle);
+    } // omp single
+  } // synchronization point
+
+  check(CC, C);
+
+  free(buffer);
+
+  printf("\n\t Matrix size: %d x %d\n", N, N);
+
+  printf("\n\t Host execution time:");
+  printf("\n\t\t Init      : %lg [s] - threads: %d", _time[HOST][INIT]/LOOP, thr[0]);
+  printf("\n\t\t Dgemm     : %lg [s] - threads: %d\n", _time[HOST][KERNEL]/LOOP, thr[1]);
+
+  printf("\n\t Device execution time:");
+  printf("\n\t\t Init      : %lg [s]", _time[DEV][INIT]/LOOP);
+  printf("\n\t\t Dgemm     : %lg [s]", _time[DEV][KERNEL]/LOOP);
+  printf("\n\t\t Fetch data: %lg [s]\n\n", _time[DEV][DATA]/LOOP);
+  
+  return 0;
+}
diff --git a/cuda-omp/hybrid/hybrid_omp_cuda.cu b/cuda-omp/hybrid/hybrid_omp_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..221044bcc4b57ec9faa8e418a5ea294a82f18de7
--- /dev/null
+++ b/cuda-omp/hybrid/hybrid_omp_cuda.cu
@@ -0,0 +1,281 @@
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Passing OpenMP data to foreign runtime (cuBLAS library).
+//
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 03.09.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//      - using nvc
+//          $ nvc++ -O3 -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v
+//                  hybrid_omp_cuda.cu -o hybrid_omp_cuda -lm && ./hybrid_omp_cuda
+//
+//      - using clang
+//          $ clang -O3 -v -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda 
+//                 hybrid_omp_cuda.c -o hybrid_omp_cuda -lm
+//
+// - Run the code:
+//   $ export OMP_TARGET_OFFLOAD=mandatory
+//   $ ./hybrid_omp_cuda
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <cuda.h>
+#include <math.h>
+#include <assert.h>
+#include <float.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <omp.h>
+
+#define N         1024
+#define SIZE      ((N) * (N))
+#define HOST      0
+#define DEV       1
+#define LOOP      10
+#define INIT      0
+#define KERNEL    1
+#define DATA      2
+#define BLOCKSIZE 1024
+
+typedef double MyData;
+
+static double _time[2][3];
+static int thr[2];
+
+double process_time()
+{
+  struct timespec ts;
+  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
+  const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;
+
+  return ret;
+}
+
+double thread_time()
+{
+  struct timespec ts;
+  clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
+  const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;
+
+  return ret;
+}
+
+void InitHost(MyData *const restrict A,
+	      MyData *const restrict B,
+	      MyData *const restrict C,
+	      int    *const restrict thr)
+{
+  double start;
+
+  #pragma omp parallel 
+  {
+    #pragma omp barrier
+    #pragma omp master
+    {
+      *thr = omp_get_num_threads();
+      start = thread_time();
+    }
+   #pragma omp barrier
+
+    #pragma omp for collapse(2)
+    for (int i=0 ; i<N ; i++)
+      for (int j=0 ; j<N ; j++)
+	{
+	  A[(i * N) + j] = 1.0;
+	  B[(i * N) + j] = (1.0 / M_PI);
+	  C[(i * N) + j] = 0.0;
+	}
+
+    #pragma omp master
+    {
+      _time[HOST][INIT] += (thread_time() - start); 
+    }
+  } // omp parallel
+
+  return;
+}
+
+void InitDev(MyData *const restrict A,
+	     MyData *const restrict B,
+	     MyData *const restrict C)
+{
+  const double start = thread_time();
+
+ #pragma omp target teams loop collapse(2) is_device_ptr(A, B, C)
+  for (int i=0 ; i<N ; i++)
+    for (int j=0 ; j<N ; j++)
+      {
+	A[(i * N) + j] = 1.0;
+	B[(i * N) + j] = (1.0 / M_PI);
+	C[(i * N) + j] = 0.0;
+      }
+
+  _time[DEV][INIT] += (thread_time() - start);
+
+  return;
+}
+
+void HostMM(MyData *const restrict A,
+	    MyData *const restrict B,
+	    MyData *const restrict C,
+	    int    *const restrict thr)
+{
+  // C = alpha * A * B + beta * C;
+
+  double start;
+
+  // naive calculation
+  #pragma omp parallel
+  {
+    #pragma omp barrier
+    #pragma omp master
+    {
+      *thr = omp_get_num_threads();
+      start = thread_time();
+    }
+    #pragma omp barrier
+
+    #pragma omp for collapse(2)
+    for (int i=0 ; i<N ; i++)
+      for (int j=0 ; j<N ; j++)
+	{
+	  MyData sum = 0.0;
+	  for (int k=0 ; k<N ; k++)
+	    sum += A[(i * N) + k] * B[(k * N) + j];
+  
+	  C[(i * N) + j] = sum;
+	}
+
+    #pragma omp master
+    {
+      _time[HOST][KERNEL] += (thread_time() - start);
+    }
+  } // omp parallel
+
+  return;
+}
+
+__global__ void DevMM(MyData *const restrict A,
+		      MyData *const restrict B,
+		      MyData *const restrict C,
+		      const int              n)
+{
+  const int size = (n * n);
+  const int globalID = threadIdx.x + (blockIdx.x * blockDim.x);
+
+  if (globalID >= size)
+    return;
+
+  const int i = (globalID / N);
+  const int j = (globalID % N);
+
+  MyData sum = 0.0;
+  for (int k=0 ; k<N ; k++)
+    sum += (A[(i * N) + k] * B[(k * N) + j]);
+
+  C[(i * N) + j] = sum;
+  
+  return;
+}
+
+void check(MyData *const restrict host_array,
+	   MyData *const restrict dev_array)
+{
+  int flag = 0;
+  for (size_t i=0 ; i<SIZE ; i++)
+    flag = ((fabs(host_array[i] - dev_array[i]) > FLT_EPSILON) ? 1 : flag);
+
+  if (!flag)
+    printf("\n\t Result OK \n");
+  else
+    printf("\n\t Result wrong \n");
+  
+  return;
+}
+
+int main()
+{
+  // Host allocation
+  MyData *h_buffer = (MyData *)malloc(2 * SIZE * sizeof(MyData));
+  assert(h_buffer != NULL);
+  MyData *const restrict C_HOST = h_buffer;
+  MyData *const restrict C_DEV  = C_HOST + SIZE;
+  
+  // Spawning 2 host threads
+  #pragma omp parallel num_threads(2)
+  {
+    // Evaluate the Dgemm on the host
+    #pragma omp single nowait
+    {
+      // allowing nested parallelism
+      omp_set_max_active_levels(2);
+
+      MyData *tmp = (MyData *)malloc(2 * SIZE * sizeof(MyData));
+      MyData *const restrict A = tmp;
+      MyData *const restrict B = A + SIZE;
+      
+      for (int loop=0 ; loop<LOOP ; loop++)
+	{
+	  InitHost(A, B, C_HOST, &thr[0]);
+	  HostMM(A, B, C_HOST, &thr[1]);
+	}
+
+      free(tmp);
+    } // omp single
+
+    #pragma omp single nowait
+    {
+      // Device allocation
+      const int dev = omp_get_default_device();
+      const int host = omp_get_initial_device();
+      MyData *d_buffer = (MyData *)omp_target_alloc((3 * SIZE * sizeof(MyData)), dev);
+      assert(d_buffer != NULL);
+      MyData *const restrict d_A = d_buffer;
+      MyData *const restrict d_B = d_A + SIZE;
+      MyData *const restrict d_C = d_B + SIZE;
+
+      const dim3 nblock = {((SIZE + BLOCKSIZE - 1) / BLOCKSIZE), 1, 1};
+      const dim3 block  = {BLOCKSIZE, 1, 1};
+      
+      for (int loop=0 ; loop<LOOP ; loop++)
+	{
+	  // Init device with blocking omp target directive
+	  InitDev(d_A, d_B, d_C);
+
+	  double start = thread_time();
+	  DevMM<<< nblock, block >>>(d_A, d_B, d_C, N);
+	  // CUDA synchronization point
+	  cudaDeviceSynchronize();
+	  _time[DEV][KERNEL] += (thread_time() - start);
+
+	  // Fetch data from the device and deallocate
+	  start = thread_time();
+	  omp_target_memcpy(C_DEV, d_C, (SIZE * sizeof(MyData)), 0, 0, host, dev);
+	  _time[DEV][DATA] += (thread_time() - start);
+	} // LOOP
+      
+      // deallocate device's memory
+      omp_target_free(d_buffer, dev);
+    } // omp single
+  } // synchronization point
+
+  check(C_HOST, C_DEV);
+
+  free(h_buffer);
+
+  printf("\n\t Matrix size: %d x %d\n", N, N);
+
+  printf("\n\t Host execution time:");
+  printf("\n\t\t Init      : %lg [s] - threads: %d", _time[HOST][INIT]/LOOP, thr[0]);
+  printf("\n\t\t Dgemm     : %lg [s] - threads: %d\n", _time[HOST][KERNEL]/LOOP, thr[1]);
+
+  printf("\n\t Device execution time:");
+  printf("\n\t\t Init      : %lg [s]", _time[DEV][INIT]/LOOP);
+  printf("\n\t\t Dgemm     : %lg [s]", _time[DEV][KERNEL]/LOOP);
+  printf("\n\t\t Fetch data: %lg [s]\n\n", _time[DEV][DATA]/LOOP);
+  
+  return 0;
+}
diff --git a/cuda-omp/omp/miscellaneous/host_device_address_space.c b/cuda-omp/omp/miscellaneous/host_device_address_space.c
new file mode 100644
index 0000000000000000000000000000000000000000..7b2c81b475c83ee209721735ca3b86b69ef9d75b
--- /dev/null
+++ b/cuda-omp/omp/miscellaneous/host_device_address_space.c
@@ -0,0 +1,76 @@
+//////////////////////////////////////////////////////////////////////////////////////////////////
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 03.09.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//   - nvc complains about the '#pragma omp target has_device_addr'
+//   - clang works
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+static int answer = 42;
+#define SIZE 3
+
+int main()
+{
+  int *A = (int *)malloc(SIZE * sizeof(int));
+  assert(A != NULL);
+  memset(A, 0, (SIZE * sizeof(int)));
+  
+  printf("\n\t Host address of 'answer' [value: %d] is: %p", answer, &answer);
+  printf("\n\t Host address of 'A' is: %p", A);
+
+  // Map 'answer' to the device
+  #pragma omp target data map(alloc: answer, A[0:SIZE])
+  {
+    #pragma omp target update to(answer, A[0:SIZE])
+    
+    // Update 'answer' on the device
+    #pragma omp target map(answer)
+    {
+      answer += 1;
+      for (int i=0 ; i<SIZE ; i++)
+	A[i] += (i + 1);
+    }
+
+    #pragma omp target update from(answer, A[0:SIZE])
+    
+    printf("\n\t Address of 'answer' [value: %d] in target data region is: %p",
+  	   answer, &answer);
+    printf("\n\t Address of 'A' in target data region is: %p", A);
+    printf("\n\t Value of A:");
+    for (int i=0 ; i<SIZE ; i++)
+      printf("\n\t\t A[%d] = %d", i, A[i]);
+    printf("\n");
+
+    #pragma omp target data use_device_addr(answer) use_device_ptr(A)
+    {
+      #pragma omp target has_device_addr(answer)
+      {
+	answer += 1;
+	for (int i=0 ; i<SIZE ; i++)
+	  A[i] += (i + 1);
+      }
+      
+      printf("\n\t Device address of 'answer' is: %p", &answer);
+      printf("\n\t Device address of 'A' is: %p", A);
+    }
+
+    #pragma omp target update from(answer, A[0:SIZE])
+    printf("\n\t Last value of 'answer' is: %d", answer);
+    printf("\n\t Last value if 'A' is:");
+    for (int i=0 ; i<SIZE ; i++)
+      printf("\n\t\t A[%d] = %d", i, A[i]);
+    printf("\n\n");
+  } // omp target data
+
+  free(A);
+  
+  return 0;
+}
diff --git a/cuda-omp/omp/miscellaneous/structure.c b/cuda-omp/omp/miscellaneous/structure.c
index fc0041c02e55eb438685ab113ca2da41b7b658a0..7147e2d55e31525f273cc98b48da0dd01cfecfea 100644
--- a/cuda-omp/omp/miscellaneous/structure.c
+++ b/cuda-omp/omp/miscellaneous/structure.c
@@ -86,6 +86,6 @@ int main()
   free(A.ptr);
   free(B.ptr);
   free(C.ptr);
-
+  
   return 0;
 }
diff --git a/cuda-omp/omp/miscellaneous/structure_routines.c b/cuda-omp/omp/miscellaneous/structure_routines.c
new file mode 100644
index 0000000000000000000000000000000000000000..91299044a7b23ea5f08729d8803663ef3b973e80
--- /dev/null
+++ b/cuda-omp/omp/miscellaneous/structure_routines.c
@@ -0,0 +1,126 @@
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 31.07.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v structure_routines.c -o structure_routines_omp
+// - Run the code:
+//   $ export OMP_TARGET_OFFLOAD=mandatory
+//   $ ./structure_routines_omp
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <omp.h>
+
+#define SIZE   8
+#define SIZE_2 (SIZE / 2)
+typedef double MyData;
+
+typedef struct my_span
+{
+  size_t N;
+  MyData *A;
+  MyData *B;
+} span;
+
+span d_S;
+#pragma omp declare target(d_S)
+
+void allocate(      span  *my_struct,
+	      const size_t size)
+{
+  /* allocate the buffer on the host memory */
+  my_struct->A = (MyData *)calloc(size, sizeof(MyData));
+  my_struct->B = (MyData *)calloc(size, sizeof(MyData));
+  assert((my_struct->A != NULL) && (my_struct->B != NULL));
+  my_struct->N = size;
+
+  for (size_t i=0 ; i<size; i++)
+    {
+      my_struct->A[i] = (MyData)(3 * i);
+      my_struct->B[i] = (MyData)(2 * i);
+    }
+  
+  return;
+}
+
+void print(const span *const ptr,
+	   const char *const string)
+{
+  int flag = 0;
+  
+  printf("\n");
+  for (int i=0 ; i<ptr->N ; i++)
+    {
+      printf("\n\t %s[%d] = %lg", string, i, ptr->A[i]);
+      printf("\n\t %s[%d] = %lg", string, i, ptr->B[i]);
+      flag = (((ptr->A[i] != 0) || (ptr->B[i] != 0)) ? 1 : flag);
+    }
+  printf("\n");
+
+  if (flag)
+    printf("\n\t Result wrong \n\n");
+  else
+    printf("\n\t Result OK \n\n");
+  
+  return;
+}
+
+int main()
+{
+  /* host allocation */
+  span h_S;
+  allocate(&h_S, SIZE);
+  
+  /* allocating GPU memory using OMP routines */
+  const int dev  = omp_get_default_device();
+  const int host = omp_get_initial_device();
+  MyData *d_buffer = (double *)omp_target_alloc(2 * SIZE * sizeof(MyData), dev);
+  assert(d_buffer != NULL);
+
+  /* set the pointers within the GPU */
+#pragma omp target is_device_ptr(d_buffer) device(dev)
+  {
+    d_S.N = SIZE;
+    d_S.A = d_buffer;
+    d_S.B = d_buffer + SIZE;
+  }
+
+  /* copy data to the GPU */
+  omp_target_memcpy(d_buffer, h_S.A, (SIZE * sizeof(MyData)), 0, 0, dev, host);
+  omp_target_memcpy(d_buffer, h_S.B, (SIZE * sizeof(MyData)), (SIZE * sizeof(MyData)), 0, dev, host);
+  
+  /* perform the calculation on the GPU */
+#pragma omp target device(dev)
+  {
+    #pragma omp loop
+    for (size_t i=0 ; i<d_S.N ; i++)
+      {
+	d_S.A[i] -= (MyData)(3 * i);
+	d_S.B[i] -= (MyData)(2 * i);
+      }
+  }
+
+  /* copy data from the GPU */
+  omp_target_memcpy(h_S.A, d_buffer, (SIZE * sizeof(MyData)),
+		    0, 0, host, dev);
+  omp_target_memcpy(h_S.B, d_buffer, (SIZE * sizeof(MyData)),
+  		    0, (SIZE * sizeof(MyData)), host, dev);
+
+  /* check the data */
+  print(&h_S, "d_S");
+
+  /* free GPU memory */
+  omp_target_free(d_buffer, dev);
+
+  /* free host memory */
+  free(h_S.A);
+  free(h_S.B);
+  
+  return 0;
+}