diff --git a/cuda-omp/omp/miscellaneous/multiple_devices.c b/cuda-omp/omp/miscellaneous/multiple_devices.c
new file mode 100644
index 0000000000000000000000000000000000000000..4eb5cad58acadad0a75af467a9f68d45e7096a41
--- /dev/null
+++ b/cuda-omp/omp/miscellaneous/multiple_devices.c
@@ -0,0 +1,132 @@
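+// Vector addition distributed across all available OpenMP offload devices:
+// one host thread per device offloads its own N_PER_DEV-element slice and the
+// result is checked against a CPU reference
+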
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+#include <assert.h>
+
+typedef int MyData;
+#define N_PER_DEV   1000000
+#define BLOCKSIZE   256
+
+#if (BLOCKSIZE < 32) || (BLOCKSIZE > 1024)
+#error "32 <= BLOCKSIZE <= 1024"
+#endif
+
+#if (N_PER_DEV < BLOCKSIZE)
+#error "N_PER_DEV < BLOCKSIZE"
+#endif
+
+// NDEBUG only disables the verbose debug printf blocks below; it does not
+// affect assert(), since <assert.h> was included before this definition
+#define NDEBUG
+
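+// compare the CPU reference result against the result gathered from the GPUs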
+void check(const MyData *const restrict vector_cpu,
+	   const MyData *const restrict vector_gpu,
+	   const size_t                 size)
+{
+  int flag = 0;
+  for (size_t i=0 ; i<size ; i++)
+    {
+#if !defined(NDEBUG)
+      printf("\n\t vector_cpu[%zu] = %d - vector_gpu[%zu] = %d",
+             i, vector_cpu[i], i, vector_gpu[i]);
+#endif
+
+      flag = ((vector_cpu[i] != vector_gpu[i]) ? 1 : flag);
+    }
+
+  if (flag)
+    printf("\n\t Result wrong \n");
+  else
+    printf("\n\t Result OK \n");
+
+  return;
+}
+
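+// C[offset:offset+size] = A[offset:offset+size] + B[offset:offset+size],
+// offloaded to device `dev` with `nblocks` teams of BLOCKSIZE threads each
+// (the OpenMP counterpart of a CUDA grid/block launch)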
+void VectorAdd(const MyData *const restrict A,
+	       const MyData *const restrict B,
+	             MyData *const restrict C,
+	       const int                    offset,
+	       const int                    size,
+               const int                    dev,
+               const int                    nblocks)
+{
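+  // offload to device `dev`: launch `nblocks` teams of up to BLOCKSIZE
+  // threads and map only this device's [offset, offset+size) slice of A, B, C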
+ #pragma omp target                                                  \
+   teams num_teams(nblocks) thread_limit(BLOCKSIZE)                  \
+   map(to: A[offset:size], B[offset:size]) map(from: C[offset:size]) \
+   device(dev)
+  {
+    // each team covers a BLOCKSIZE-wide chunk of this device's slice
+    const int team  = omp_get_team_num();
+    const int team_start_index = (team * BLOCKSIZE) + offset;
+    // clamp the last chunk so that no team reads or writes past the mapped
+    // range when size is not a multiple of BLOCKSIZE
+    const int chunk_end        = team_start_index + BLOCKSIZE;
+    const int team_end_index   = ((chunk_end < (offset + size)) ? chunk_end : (offset + size));
+
+    #pragma omp parallel num_threads(BLOCKSIZE)
+    {
+      const int localID = omp_get_thread_num();
+      const int block   = omp_get_num_threads();
+
+      const int globalID = team_start_index + localID;
+
+      // thread-strided loop over this team's chunk
+      for (int index=globalID ; index<team_end_index ; index+=block)
+        C[index] = A[index] + B[index];
+
+#if !defined(NDEBUG)
+
+      if ((localID == 0) && (team == 0))
+	printf("\n\t Device: %d - Teams: %d [requested: %d] - Threads per team: %d [requested: %d]",
+	       dev, omp_get_num_teams(), nblocks, block, BLOCKSIZE);
+#endif
+    } // omp parallel
+  } // omp target
+
+  return;
+}
+
+int main()
+{
+  // get the number of available offload devices
+  const int NumDev = omp_get_num_devices();
+
+  // global vector size: one N_PER_DEV-element slice per device
+  const int size = (NumDev * N_PER_DEV);
+  assert(size > 0); // abort if no offload device is available
+
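+  // single allocation sliced into A, B, the CPU reference and the GPU result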
+  MyData *buffer = (MyData *)malloc(4 * size * sizeof(MyData));
+  assert(buffer != NULL);
+  MyData *const restrict A     = buffer;
+  MyData *const restrict B     = A + size;
+  MyData *const restrict C_CPU = B + size;
+  MyData *const restrict C_GPU = C_CPU + size;
+
+  // rand() is not thread-safe, so initialize the inputs serially ...
+  for (int i=0 ; i<size ; i++)
+    {
+      A[i] = rand() % N_PER_DEV;
+      B[i] = rand() % N_PER_DEV;
+    }
+
+  // ... and compute the CPU reference result in parallel
+  #pragma omp parallel for simd
+  for (int i=0 ; i<size ; i++)
+    C_CPU[i] = A[i] + B[i];
+
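+  // spawn one host thread per device; each thread synchronously offloads its
+  // own slice, so the devices execute their kernels concurrently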
+  #pragma omp parallel num_threads(NumDev)
+  {
+    // check
+    #pragma omp single
+    {
+      if (NumDev != omp_get_num_threads())
+        {
+          fprintf(stderr, "\n\t Cannot spawn one host thread per device \n");
+          exit(EXIT_FAILURE);
+        }
+      else
+        {
+          printf("\n\t Using %d GPUs \n", NumDev);
+          fflush(stdout);
+        }
+    } // implicit barrier
+    
+    const int tid    = omp_get_thread_num();
+    const int offset = (tid * N_PER_DEV);
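+    // teams needed to cover this device's N_PER_DEV elements (ceiling division)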
+    const int nblocks = ((N_PER_DEV + BLOCKSIZE - 1) / BLOCKSIZE);
+
+    VectorAdd(A, B, C_GPU, offset, N_PER_DEV, tid, nblocks);
+  } // omp parallel
+
+  check(C_CPU, C_GPU, size);
+  
+  free(buffer);
+  
+  return 0;
+}