diff --git a/cuda-omp/hybrid/hybrid_cuda_omp.c b/cuda-omp/hybrid/hybrid_cuda_omp.c
new file mode 100644
index 0000000000000000000000000000000000000000..823be5dba91c9d5d13dc7dc146a609ac862f2497
--- /dev/null
+++ b/cuda-omp/hybrid/hybrid_cuda_omp.c
@@ -0,0 +1,159 @@
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Passing OpenMP data to cuBLAS.
+//
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 02.09.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v
+//     hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas
+// - Run the code:
+//   $ export OMP_TARGET_OFFLOAD=mandatory
+//   $ ./hybrid_cuda_omp
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <float.h>
+#include <assert.h>
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+
+#define N     512
+#define SIZE  ((N) * (N))
+#define ALPHA 1.0
+#define BETA  0.0
+
+typedef double MyData;
+
+void InitHost(MyData *const restrict A,
+              MyData *const restrict B,
+              MyData *const restrict C)
+{
+  //#pragma omp parallel for collapse(2)
+  for (int i=0 ; i<N ; i++)
+    for (int j=0 ; j<N ; j++)
+      {
+        // deterministic values, so that InitDev can reproduce them on the device
+        A[(i * N) + j] = (MyData)(i + j) / (MyData)N;
+        B[(i * N) + j] = (MyData)(i - j) / (MyData)N;
+        C[(i * N) + j] = (MyData)0;
+      }
+
+  return;
+}
+
+void InitDev(MyData *const restrict A,
+             MyData *const restrict B,
+             MyData *const restrict C)
+{
+  // same initialization as InitHost, performed in the device's address space
+  #pragma omp target teams distribute parallel for collapse(2)
+  for (int i=0 ; i<N ; i++)
+    for (int j=0 ; j<N ; j++)
+      {
+        A[(i * N) + j] = (MyData)(i + j) / (MyData)N;
+        B[(i * N) + j] = (MyData)(i - j) / (MyData)N;
+        C[(i * N) + j] = (MyData)0;
+      }
+
+  return;
+}
+
+void HostDgemm(const MyData *const restrict A,
+               const MyData *const restrict B,
+                     MyData *const restrict C,
+               const MyData alpha,
+               const MyData beta)
+{
+  // column-major Dgemm, i.e. the same convention used by cublasDgemm,
+  // so that host and device results can be compared element-wise
+  for (int j=0 ; j<N ; j++)
+    for (int i=0 ; i<N ; i++)
+      {
+        MyData sum = (MyData)0;
+        for (int k=0 ; k<N ; k++)
+          sum += A[(k * N) + i] * B[(j * N) + k];
+
+        C[(j * N) + i] = (alpha * sum) + (beta * C[(j * N) + i]);
+      }
+
+  return;
+}
+
+void check(const MyData *const restrict CC,
+           const MyData *const restrict C)
+{
+  int flag = 0;
+  for (int i=0 ; i<SIZE ; i++)
+    flag = ((fabs(CC[i] - C[i]) > FLT_EPSILON) ? 1 : flag);
+
+  if (!flag)
+    printf("\n\t Result OK");
+  else
+    printf("\n\t Result wrong");
+
+  return;
+}
+
+int main()
+{
+  // Host allocation
+  MyData *buffer = (MyData *)malloc(4 * SIZE * sizeof(MyData));
+  assert(buffer != NULL);
+  MyData *const restrict A  = buffer;
+  MyData *const restrict B  = A + SIZE;
+  MyData *const restrict C  = B + SIZE;
+  MyData *const restrict CC = C + SIZE;
+
+  // Spawning 2 host threads
+  #pragma omp parallel num_threads(2)
+  {
+    // Evaluate the Dgemm on the host
+    #pragma omp single nowait
+    {
+      InitHost(A, B, CC);
+      HostDgemm(A, B, CC, ALPHA, BETA);
+    } // omp single
+
+    #pragma omp single nowait
+    {
+      // Initialize cuBLAS library
+      cublasHandle_t handle;
+      cublasCreate(&handle);
+
+      // Allocate A, B, C on the device
+      #pragma omp target enter data map(alloc: A[0:SIZE], B[0:SIZE], C[0:SIZE])
+
+      // Init device buffers with a blocking omp target directive
+      InitDev(A, B, C);
+
+      // Define a target data region where the A, B, and C pointers
+      // refer to the device's address space
+      #pragma omp target data use_device_addr(A, B, C)
+      {
+        MyData const alpha = ALPHA;
+        MyData const beta  = BETA;
+
+        cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
+                    &alpha, A, N, B, N, &beta, C, N);
+
+        // CUDA synchronization point
+        cudaDeviceSynchronize();
+      }
+
+      // Fetch data from the device and deallocate
+      #pragma omp target exit data map(from: C[0:SIZE]) map(delete: A[0:SIZE], B[0:SIZE])
+
+      cublasDestroy(handle);
+    } // omp single
+  } // synchronization point
+
+  check(CC, C);
+
+  free(buffer);
+
+  return 0;
+}
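One robustness note on the file above: the status codes returned by cublasCreate, cublasDgemm, and cublasDestroy are discarded. A minimal sketch of the same call sequence with checking; the CUBLAS_CHECK helper macro is hypothetical, not part of the repository:

#include <stdio.h>
#include <stdlib.h>
#include <cublas_v2.h>

// hypothetical helper: print and abort on any cuBLAS failure
#define CUBLAS_CHECK(call)                                   \
  do {                                                       \
    const cublasStatus_t s = (call);                         \
    if (s != CUBLAS_STATUS_SUCCESS) {                        \
      fprintf(stderr, "cuBLAS error %d at %s:%d\n",          \
              (int)s, __FILE__, __LINE__);                   \
      exit(EXIT_FAILURE);                                    \
    }                                                        \
  } while (0)

// usage: wrap every cuBLAS call, e.g.
//   CUBLAS_CHECK(cublasCreate(&handle));
//   CUBLAS_CHECK(cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
//                            &alpha, A, N, B, N, &beta, C, N));
//   CUBLAS_CHECK(cublasDestroy(handle));

The do/while(0) wrapper makes the macro behave like a single statement, so it composes safely with if/else around the call sites.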
diff --git a/cuda-omp/omp/miscellaneous/asynchronous.c b/cuda-omp/omp/miscellaneous/asynchronous.c
new file mode 100644
index 0000000000000000000000000000000000000000..974861e154f879a5b6f3c50c78eb6e24ee2cb96e
--- /dev/null
+++ b/cuda-omp/omp/miscellaneous/asynchronous.c
@@ -0,0 +1,110 @@
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Splitting the asynchronous vector addition task graph across four devices
+//
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 28.08.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v asynchronous.c -o asynchronous_omp
+// - Run the code:
+//   $ export OMP_TARGET_OFFLOAD=mandatory
+//   $ ./asynchronous_omp
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <omp.h>
+
+typedef int MyData;
+
+#define NDEBUG
+
+void check(const MyData *const C,
+           const size_t size)
+{
+  int flag = 0;
+  for (size_t i=0 ; i
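The remainder of asynchronous.c is not shown here. As a reference for the pattern its header advertises, an asynchronous vector-addition task graph spread across the available devices, here is a minimal self-contained sketch; every name and size in it (CHUNK, the init values) is illustrative and not taken from the file:

#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

#define CHUNK 1024  /* illustrative per-device chunk size */

int main(void)
{
  const int ndev = omp_get_num_devices();
  if (ndev == 0) { fprintf(stderr, "no offload device available\n"); return 1; }

  const int size = ndev * CHUNK;
  int *A = malloc(size * sizeof(int));
  int *B = malloc(size * sizeof(int));
  int *C = malloc(size * sizeof(int));

  for (int i=0 ; i<size ; i++) { A[i] = i; B[i] = 2 * i; }

  // one deferred (nowait) target task per device, each working on its own chunk;
  // the generating host thread continues immediately after each launch
  for (int dev=0 ; dev<ndev ; dev++)
    {
      const int off = dev * CHUNK;

      #pragma omp target teams distribute parallel for device(dev) nowait \
              map(to: A[off:CHUNK], B[off:CHUNK]) map(from: C[off:CHUNK])
      for (int i=off ; i<(off + CHUNK) ; i++)
        C[i] = A[i] + B[i];
    }

  // wait for all the deferred target tasks to complete
  #pragma omp taskwait

  printf("C[0] = %d, C[%d] = %d\n", C[0], size - 1, C[size - 1]);

  free(A); free(B); free(C);

  return 0;
}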