diff --git a/cuda-omp/cuda/energy/Makefile b/cuda-omp/cuda/energy/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..1c5cde978ab092b36b2b2a50ee4e5c732a655822
--- /dev/null
+++ b/cuda-omp/cuda/energy/Makefile
@@ -0,0 +1,30 @@
+ENERGY ?= YES
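+# set ENERGY=NO on the command line to build without the energy instrumentation:
+#   $ make ENERGY=NO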
+
+ifeq ($(ENERGY), YES)
+OPTIONS += -D_ENERGY_PMT_
+
+# enable RAPL
+OPTIONS += -D_ENERGY_RAPL_
+# enable NVIDIA
+OPTIONS += -D_ENERGY_NVIDIA_
+# enable AMD
+OPTIONS += # -D_ENERGY_AMD_
+endif
+
+NVCPP = nvc++
+OPT   = -O3
+INC   = /home/darkenergy/software/pmt/include
+LIB   = /home/darkenergy/software/pmt/lib -lpmt -lm
+
+all: mat_mult
+.PHONY: clean test
+
+mat_mult: mat_mult_block.cu energy/energy_pmt_methods.cpp energy/energy_pmt.h energy/energy_pmt_methods.h Makefile
+	$(NVCPP) $(OPT) $(OPTIONS) -I$(INC) mat_mult_block.cu energy/energy_pmt_methods.cpp -o $@ -L$(LIB)
+	ldd ./mat_mult
+
+test: mat_mult
+	./mat_mult
+
+clean:
+	rm -rf *.o mat_mult *~ energy/*~ energy/*.o
diff --git a/cuda-omp/cuda/energy/energy/energy_pmt.h b/cuda-omp/cuda/energy/energy/energy_pmt.h
new file mode 100644
index 0000000000000000000000000000000000000000..2204ca2d11eb194ec691526710689eeff221b217
--- /dev/null
+++ b/cuda-omp/cuda/energy/energy/energy_pmt.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include "energy_pmt_methods.h"
+
+#if defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+   #define PMT_CREATE(devID, numGPUs) Create_PMT((devID), (numGPUs))
+#else
+   #define PMT_CREATE(devID, numGPUs)
+#endif // defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+
+#if defined(_ENERGY_RAPL_)
+   #define PMT_CPU_START(string) Start_PMT_CPU((string))
+   #define PMT_CPU_STOP(string)  Stop_PMT_CPU((string))
+   #define PMT_CPU_SHOW(string)  Show_PMT_CPU((string))
+#else
+   #define PMT_CPU_START(string)
+   #define PMT_CPU_STOP(string)
+   #define PMT_CPU_SHOW(string)
+#endif // _ENERGY_RAPL_
+
+#if defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+   #define PMT_GPU_START(string, devID) Start_PMT_GPU((string), (devID))
+   #define PMT_GPU_STOP(string, devID)  Stop_PMT_GPU((string), (devID))
+   #define PMT_GPU_SHOW(string)  Show_PMT_GPU((string))
+#else
+   #define PMT_GPU_START(string, devID)
+   #define PMT_GPU_STOP(string, devID)
+   #define PMT_GPU_SHOW(string)
+#endif // defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
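+
+// Minimal usage sketch (illustrative only; "my_kernel", "grid" and "block" are
+// placeholders, not part of this code):
+//
+//   int devID = 0;
+//   PMT_CREATE(&devID, 1);               // monitor one GPU
+//
+//   PMT_GPU_START("my_kernel", devID);
+//   my_kernel<<< grid, block >>>(/* ... */);
+//   cudaDeviceSynchronize();
+//   PMT_GPU_STOP("my_kernel", devID);
+//
+//   PMT_GPU_SHOW("my_kernel");           // accumulated [s], [J], and average [W]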
+
diff --git a/cuda-omp/cuda/energy/energy/energy_pmt_methods.cpp b/cuda-omp/cuda/energy/energy/energy_pmt_methods.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f41101d24385d703c45c5948e6cdb67ced827471
--- /dev/null
+++ b/cuda-omp/cuda/energy/energy/energy_pmt_methods.cpp
@@ -0,0 +1,269 @@
+#if defined(_ENERGY_PMT_)
+
+#include <pmt.h>
+#include <assert.h>
+#include <iostream>
+#include <memory>
+#include <vector>
+#include <string>
+#include <map>
+
+struct EnergyState
+{
+  pmt::State   start;   // sensor reading taken by Start_PMT_*
+  pmt::State   stop;    // sensor reading taken by Stop_PMT_*
+  double       joules;  // accumulated energy over all start/stop intervals
+  double       watts;   // accumulated average power (divided by count when shown)
+  double       seconds; // accumulated elapsed time
+  unsigned int count;   // number of completed start/stop intervals
+};
+
+static bool PMT_ERROR{false};
+
+void PMT_err(void)
+{
+  std::cout << "\n\t PMT Error \n" << std::endl;
+  
+  return;
+}
+
+#if defined(_ENERGY_RAPL_)
+   static std::unique_ptr<pmt::PMT> sensor_cpu;
+   static std::map<std::string, EnergyState> state_cpu;
+#endif // _ENERGY_RAPL_
+
+#if defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+   static std::map<int, std::unique_ptr<pmt::PMT>> sensor_gpu;
+   static std::map<int, std::map<std::string, EnergyState>> state_gpu;
+#endif // defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+
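+// outside nvcc/nvc++ (e.g. when this file is built with g++ for the OpenMP
+// version), keep C linkage for the declarations so that C callers can link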
+#if !defined(__NVCC__) && !defined(__NVCOMPILER)
+   extern "C"
+      {
+         #include "energy_pmt_methods.h"
+      }
+#endif // !defined(__NVCC__) && !defined(__NVCOMPILER)
+
+#if defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+void Create_PMT(int *devID,
+		const unsigned int numGPUs)
+{
+#if defined(_ENERGY_RAPL_)
+
+  sensor_cpu = pmt::Create("rapl");
+
+#endif // _ENERGY_RAPL_
+  
+  if ((numGPUs > 0) && (devID != nullptr))
+    {
+      for (unsigned int dev=0 ; dev<numGPUs ; dev++)
+	{
+	  // one sensor per monitored device, keyed by its device ID
+#if defined(_ENERGY_NVIDIA_)
+	  sensor_gpu.insert({devID[dev], pmt::nvml::NVML::Create(devID[dev])});
+#elif defined(_ENERGY_AMD_)
+	  sensor_gpu.insert({devID[dev], pmt::rocm::AMD::Create(devID[dev])});
+#endif // defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+	} // numGPUs
+    }
+
+  return;
+}
+#endif // defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+
+#if defined(_ENERGY_RAPL_)
+void Start_PMT_CPU(const char *label)
+{
+  if (PMT_ERROR)
+    {
+      PMT_err();
+      return;
+    }
+
+  // check if the label already exists
+  if (state_cpu.count(std::string{label}))
+    {
+      state_cpu[std::string{label}].start = sensor_cpu->Read();
+    }
+  else
+    {
+      // insert the key and initialize the counters
+      state_cpu.insert({std::string{label},
+			{sensor_cpu->Read(),
+			 {},
+			 0.0, 0.0, 0.0, 0
+			}});
+    }
+
+  return;
+}
+
+void Stop_PMT_CPU(const char *label)
+{
+  if (PMT_ERROR)
+    {
+      PMT_err();
+      return;
+    }
+  
+  // check if the label already exists
+  // if not error
+  if (!state_cpu.count(std::string{label}))
+    {
+      PMT_ERROR = true;
+      PMT_err();
+      return;
+    }
+  else
+    {
+      EnergyState &s = state_cpu[std::string{label}];
+
+      // read the sensor
+      s.stop = sensor_cpu->Read();
+
+      // accumulate elapsed time, energy, and average power over the interval
+      s.seconds += sensor_cpu->seconds(s.start, s.stop);
+      s.joules  += sensor_cpu->joules(s.start, s.stop);
+      s.watts   += sensor_cpu->watts(s.start, s.stop);
+
+      s.count++;
+    }
+  
+  return;
+}
+
+void Show_PMT_CPU(const char *label)
+{
+  if (PMT_ERROR || !state_cpu.count(std::string{label}))
+    {
+      PMT_err();
+      return;
+    }
+  else
+    {
+      const EnergyState &s = state_cpu[std::string{label}];
+
+      std::cout << "\n\t CPU Kernel:" << label << ":" << std::endl;
+      std::cout << "\t\t" << s.seconds << " [s]" << std::endl;
+      std::cout << "\t\t" << s.joules  << " [J]" << std::endl;
+      std::cout << "\t\t" << (s.watts / s.count) << " [W]" << "\n" << std::endl;
+    }
+  
+  return;
+}
+#endif // _ENERGY_RAPL_
+
+#if defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+void Start_PMT_GPU(const char *label,
+		   const int  devID)
+{
+  if (PMT_ERROR || !sensor_gpu.count(devID))
+    {
+      PMT_err();
+      return;
+    }
+
+  // check if the devID already exists
+  if (!state_gpu.count(devID))
+    {
+      // insert devID
+      state_gpu.insert({devID, {}});
+    }
+  // check if the label already exists
+  if (state_gpu[devID].count(std::string{label}))
+    {
+      // read the sensor
+      state_gpu[devID][std::string{label}].start = sensor_gpu[devID]->Read();
+    }
+  else
+    {
+      // insert the label and initialize the counters
+      state_gpu[devID].insert({std::string{label},
+			       {
+				 sensor_gpu[devID]->Read(),
+				 {},
+				 0.0, 0.0, 0.0, 0
+			       }});
+    }
+
+  return;
+}
+
+void Stop_PMT_GPU(const char *label,
+		  const int   devID)
+{
+  // check if the devID already exists
+  // if not error
+  if (!state_gpu.count(devID) || PMT_ERROR || !sensor_gpu.count(devID))
+    {
+      PMT_ERROR = true;
+      PMT_err();
+      return;
+    }
+  else
+    {
+      // check if the label already exists
+      // if not error
+      if (!state_gpu[devID].count(std::string{label}))
+	{
+	  PMT_ERROR = true;
+	  PMT_err();
+	  return;
+	}
+      else
+	{
+	  EnergyState &s = state_gpu[devID][std::string{label}];
+
+	  // read the sensor
+	  s.stop = sensor_gpu[devID]->Read();
+
+	  // accumulate elapsed time, energy, and average power over the interval
+	  s.seconds += sensor_gpu[devID]->seconds(s.start, s.stop);
+	  s.joules  += sensor_gpu[devID]->joules(s.start, s.stop);
+	  s.watts   += sensor_gpu[devID]->watts(s.start, s.stop);
+
+	  s.count++;
+	}
+    }
+  
+  return;
+}
+
+void Show_PMT_GPU(const char *label)
+{
+  if (PMT_ERROR)
+    {
+      PMT_err();
+      return;
+    }
+
+  // show quantities for all devices that measured the given label
+  for (const auto& [dev, labels]: state_gpu)
+    {
+      const auto it = labels.find(std::string{label});
+      if (it == labels.end())
+	continue;
+
+      const EnergyState &s = it->second;
+
+      std::cout << "\n\t GPU [" << dev << "] kernel:" << label << ":" << std::endl;
+      std::cout << "\t\t" << s.seconds << " [s]" << std::endl;
+      std::cout << "\t\t" << s.joules  << " [J]" << std::endl;
+      std::cout << "\t\t" << (s.watts / s.count) << " [W]" << "\n" << std::endl;
+    }
+  
+  return;
+}
+
+#endif // defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+
+#endif // _ENERGY_PMT_
diff --git a/cuda-omp/cuda/energy/energy/energy_pmt_methods.h b/cuda-omp/cuda/energy/energy/energy_pmt_methods.h
new file mode 100644
index 0000000000000000000000000000000000000000..6b49d915972463850e076ab3aa063a7caa11174e
--- /dev/null
+++ b/cuda-omp/cuda/energy/energy/energy_pmt_methods.h
@@ -0,0 +1,19 @@
+#pragma once
+
+#if defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+   void Create_PMT(int *devID, const unsigned int numGPUs);
+#endif // defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+
+#if defined(_ENERGY_RAPL_)
+   void Start_PMT_CPU(const char *string);
+   void Stop_PMT_CPU(const char *string);
+   void Show_PMT_CPU(const char *string);
+#endif // _ENERGY_RAPL_
+
+#if defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+   void Start_PMT_GPU(const char *string, const int devID);
+   void Stop_PMT_GPU(const char *string, const int devID);
+   void Show_PMT_GPU(const char *string);
+#endif // defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+
+
diff --git a/cuda-omp/cuda/energy/mat_mult_block.cu b/cuda-omp/cuda/energy/mat_mult_block.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5e47928c3a6de1b58567d847537909743bc2bc75
--- /dev/null
+++ b/cuda-omp/cuda/energy/mat_mult_block.cu
@@ -0,0 +1,326 @@
+////////////////////////////////////////////////////////////////////////////////////////////////
+// - Block matrix multiplication algorithm
+//
+//   const size_t Nblocks = (N / Bsize);
+//
+//   // loop over blocks of matrix C
+//   for (size_t ib=0 ; ib<Nblocks ; ib++)
+//   {
+//      for (size_t jb=0 ; jb<Nblocks ; jb++)
+//      {
+//        
+//         // loop over blocks of rows of A       
+//         for (size_t kb=0 ; kb<Nblocks ; kb++)
+//         {
+//
+//           // Matrix multiplication within a block
+//           for (size_t i=(ib * Bsize) ; i<((ib + 1) * Bsize) ; i++)
+//             for (size_t j=(jb * Bsize) ; j<((jb + 1) * Bsize) ; j++)
+//               for (size_t k=(kb * Bsize) ; k<((kb + 1) * Bsize) ; k++)
+//                  C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];
+//
+//         } // kb
+//      } // jb
+//   } // ib
+//
+// - Exploit GPU shared-memory.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 22.08.2024
+// code tested using nvhpc
+//
+// - Compile the code (plain build, without energy instrumentation; use the Makefile for the PMT build):
+//   $ nvc++ mat_mult_block.cu -o mat_mult_block
+// - Run the code:
+//   $ ./mat_mult_block
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <assert.h>
+#include <cuda.h>
+#include <string.h>
+#include <math.h>
+#include <float.h>
+
+#include "energy/energy_pmt.h"
+
+#define N                    1024
+#define SIZE                 (N * N) // matrix size
+typedef double MyData;               // do not change
+#define BLOCK                16
+
+// sanity checks
+#if BLOCK * BLOCK > 1024
+#error BLOCK * BLOCK must be <= 1024 (maximum number of threads per block)
+#endif
+
+#if BLOCK * BLOCK > SIZE
+#error BLOCK * BLOCK must be <= SIZE
+#endif
+
+#define LOOP 100
+
+double wall_time()
+{
+  struct timespec ts;
+  clock_gettime(CLOCK_REALTIME, &ts);
+  const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;
+
+  return ret;
+}
+
+// naive triple-loop implementation (for reference; not called in main)
+void CPU_mat_mult(const MyData *const __restrict__ A,
+		  const MyData *const __restrict__ B,
+	                MyData *const __restrict__ C,
+		  const size_t                 size)
+{
+  for (size_t i=0 ; i<size ; i++)
+    for (size_t j=0 ; j<size ; j++)
+      for (size_t k=0 ; k<size ; k++)
+	C[(i * size) + j] += (A[(i * size) + k] * B[(k * size) + j]);
+
+  return;
+}
+
+void CPU_mat_mult_block(const MyData *const __restrict__ A,
+			const MyData *const __restrict__ B,
+	                      MyData *const __restrict__ C,
+			const size_t                 size)
+{
+  const size_t Nblocks = (size / BLOCK);
+
+  // loop over blocks of matrix C
+  for (size_t ib=0 ; ib<Nblocks ; ib++)
+    {
+      for (size_t jb=0 ; jb<Nblocks ; jb++)
+	{       
+	  // loop over blocks of rows of A       
+	  for (size_t kb=0 ; kb<Nblocks ; kb++)
+	    {
+	      // Matrix multiplication within a block
+	      for (size_t i=(ib * BLOCK) ; i<((ib + 1) * BLOCK) ; i++)
+		for (size_t j=(jb * BLOCK) ; j<((jb + 1) * BLOCK) ; j++)
+		  for (size_t k=(kb * BLOCK) ; k<((kb + 1) * BLOCK) ; k++)
+		    C[(i * size) + j] += A[(i * size) + k] * B[(k * size) + j];
+	    } // kb
+	} // jb
+    } // ib
+  
+  return;
+}
+
+__global__ void GPU_mat_mult_block(const MyData *const __restrict__ A,
+				   const MyData *const __restrict__ B,
+			                 MyData *const __restrict__ C,
+				   const size_t                 size)
+{
+  const size_t size2 = (size * size);
+  
+  const size_t globalID = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (globalID >= size2)
+    return;
+  
+  const size_t Nblocks  = size / BLOCK;           // number of blocks to loop over A and B
+  const size_t ib       = blockIdx.x / Nblocks;   // indexes of starting matrix's block
+  const size_t jb       = blockIdx.x % Nblocks;
+  const size_t i_local  = threadIdx.x / BLOCK;    // local matrix's indexes mapped to each CUDA thread
+  const size_t j_local  = threadIdx.x % BLOCK;    // within its own block
+  const size_t i_global = i_local + (ib * BLOCK); // global matrix's indexes mapped to each CUDA thread
+  const size_t j_global = j_local + (jb * BLOCK); // N.B. uncoalescent memory accesses to A and B matrices
+  
+  C[(i_global * size) + j_global] = (MyData)0;
+
+  // loop over blocks
+  for (size_t block=0 ; block<Nblocks ; block++)
+    {
+      const size_t j_A = (block * BLOCK);
+      const size_t i_B = (block * BLOCK);
+
+      // perform the matrix multiplication within the block
+      MyData value = (MyData)0;
+      for (size_t k=0 ; k<BLOCK ; k++)
+	value += A[(i_global * size) + k + j_A] * B[((k + i_B) * size) + j_global];
+
+      C[(i_global * size) + j_global] += value;
+    }
+
+  return;
+}
+
+__global__ void GPU_mat_mult_block_shared(const MyData *const __restrict__ A,
+					  const MyData *const __restrict__ B,
+					        MyData *const __restrict__ C,
+					  const size_t                 size)
+{
+  __shared__ MyData Ablock[BLOCK * BLOCK];
+  __shared__ MyData Bblock[BLOCK * BLOCK];
+  __shared__ MyData Cblock[BLOCK * BLOCK];
+  
+  const size_t globalID = threadIdx.x + (blockIdx.x * blockDim.x);
+  const size_t size2 = (size * size);
+  if (globalID >= size2)
+    return;
+  
+  const size_t Nblocks  = size / BLOCK;           // number of blocks to loop over
+  const size_t ib       = blockIdx.x / Nblocks;   // indexes of starting matrix's block
+  const size_t jb       = blockIdx.x % Nblocks;
+  const size_t i_local  = threadIdx.x / BLOCK;    // local matrix's indexes mapped to each CUDA thread
+  const size_t j_local  = threadIdx.x % BLOCK;    // within its own block
+  const size_t i_global = i_local + (ib * BLOCK); // global matrix's indexes mapped to each CUDA thread
+  const size_t j_global = j_local + (jb * BLOCK); // N.B. uncoalescent memory accesses to A and B matrices
+
+  // Init Cblock
+  Cblock[(i_local * BLOCK) + j_local] = (MyData)0;
+
+  // loop over blocks
+  for (size_t block=0 ; block<Nblocks ; block++)
+    {
+      const size_t j_A = (block * BLOCK);
+      const size_t i_B = (block * BLOCK);
+
+      // waits until all threads in the thread block have reached this point and shared memory accesses
+      // made by these threads prior to __syncthreads() are visible to all threads in the block.
+      __syncthreads();
+      
+      // copy block of A into shared memory
+      Ablock[(i_local * BLOCK) + j_local] = A[(i_global * size) + j_local + j_A];
+      // copy block of B into shared memory
+      Bblock[(i_local * BLOCK) + j_local] = B[((i_local + i_B) * size) + j_global];
+      
+      // waits until all threads in the thread block have reached this point and shared memory accesses
+      // made by these threads prior to __syncthreads() are visible to all threads in the block.
+      __syncthreads();
+
+      // perform the matrix multiplication within the block
+      MyData value = (MyData)0;
+      for (size_t k=0 ; k<BLOCK ; k++)
+	value += Ablock[(i_local * BLOCK) + k] * Bblock[(k * BLOCK) + j_local];
+
+      // store the partial result in shared memory
+      Cblock[(i_local * BLOCK) + j_local] += value;
+    }
+
+  C[(i_global * size) + j_global] = Cblock[(i_local * BLOCK) + j_local];
+  
+  return;
+}
+
+void check(const MyData *const __restrict__ cpu_matrix,
+	   const MyData *const __restrict__ gpu_matrix)
+{
+  int flag = 0;
+  for (size_t i=0 ; i<SIZE ; i++)
+    flag = ((fabs(cpu_matrix[i] - gpu_matrix[i]) > FLT_EPSILON) ? 1 : flag);
+
+  if (!flag)
+    printf("\n\t Result OK");
+  else
+    printf("\n\t Result wrong");
+  
+  return;
+}
+
+int main()
+{
+  double time;
+  MyData *buffer_cpu = (MyData *)calloc(4 * SIZE, sizeof(MyData));
+  assert(buffer_cpu != NULL);
+
+  // host reference matrix A
+  MyData *const __restrict__ A_CPU = buffer_cpu;
+  MyData *const __restrict__ B_CPU = A_CPU + SIZE;
+  MyData *const __restrict__ C_CPU = B_CPU + SIZE;
+  MyData *const __restrict__ C_GPU_host = C_CPU + SIZE;
+  for (size_t i=0 ; i<SIZE ; i++)
+    {
+      A_CPU[i] = drand48();
+      B_CPU[i] = drand48();
+    }
+
+  // Get the number of compute-capable devices
+  int NumGPUs{-1};
+  cudaGetDeviceCount(&NumGPUs);
+  if (NumGPUs <= 0)
+    {
+      printf("\n\t Nvidia-GPU is not available \n\n");
+      return EXIT_FAILURE;
+    }
+
+  int devID = 0;
+  // Init PMT
+  PMT_CREATE(&devID, 1);
+  
+  ////////////////////////// CPU block algorithm ///////////////////////////////////////////
+  PMT_CPU_START("CPU_mat_mult_block");
+  CPU_mat_mult_block(A_CPU, B_CPU, C_CPU, N);
+  PMT_CPU_STOP("CPU_mat_mult_block");
+  /////////////////////////////////////////////////////////////////////////////////////////
+  
+  // copy/alloc data to the GPU
+  MyData *buffer_gpu = NULL;
+  cudaMalloc((void **)&buffer_gpu, (3 * SIZE * sizeof(MyData)));
+  MyData *const A_GPU = buffer_gpu;
+  MyData *const B_GPU = A_GPU + SIZE;
+  MyData *const C_GPU = B_GPU + SIZE;
+  cudaMemcpy(A_GPU, A_CPU, (2 * SIZE * sizeof(MyData)), cudaMemcpyHostToDevice);
+
+  const dim3 nblocks = {(SIZE + (BLOCK * BLOCK)  -1)/(BLOCK * BLOCK), 1, 1};
+  const dim3 block   = {(BLOCK * BLOCK), 1, 1};
+  
+  /////////////////////////// GPU naive block algorithm ////////////////////////////////////////
+  GPU_mat_mult_block<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up
+  cudaDeviceSynchronize();
+  time = 0.0;
+  PMT_GPU_START("GPU_mat_mult_block", 0);
+  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
+    {
+      const double start = wall_time();
+      GPU_mat_mult_block<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N);
+      cudaDeviceSynchronize();
+      time += (wall_time() - start);
+    }
+  PMT_GPU_STOP("GPU_mat_mult_block", 0);
+  cudaMemcpy(C_GPU_host, C_GPU, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);
+  
+  check(C_CPU, C_GPU_host);
+  printf("\n\t GPU naive block time %lg [s]\n", (time / LOOP));
+  /////////////////////////////////////////////////////////////////////////////////////
+
+  /////////////////////////// GPU block shared memory algorithm ///////////////////////
+  GPU_mat_mult_block_shared<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up
+  cudaDeviceSynchronize();
+  time = 0.0;
+  PMT_GPU_START("GPU_mat_mult_block_shared", 0);
+  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
+    {
+      const double start = wall_time();
+      GPU_mat_mult_block_shared<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N);
+      cudaDeviceSynchronize();
+      time += (wall_time() - start);
+    }
+  PMT_GPU_STOP("GPU_mat_mult_block_shared", 0);
+  cudaMemcpy(C_GPU_host, C_GPU, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);
+  
+  check(C_CPU, C_GPU_host);
+  printf("\n\t GPU block shared memory time %lg [s]\n", (time / LOOP));
+  /////////////////////////////////////////////////////////////////////////////////////
+
+  // free CPU memory
+  free(buffer_cpu);
+  // free GPU memory
+  cudaFree(buffer_gpu);
+
+  printf("\n");
+
+  PMT_CPU_SHOW("CPU_mat_mult_block");
+  PMT_GPU_SHOW("GPU_mat_mult_block");
+  PMT_GPU_SHOW("GPU_mat_mult_block_shared");
+  
+  return EXIT_SUCCESS;
+}
diff --git a/cuda-omp/omp/energy/Makefile b/cuda-omp/omp/energy/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..56ef87ef1e4d251627b0ac6dad79d680582aaaa3
--- /dev/null
+++ b/cuda-omp/omp/energy/Makefile
@@ -0,0 +1,38 @@
+ENERGY ?= YES
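+# set ENERGY=NO on the command line to build without the energy instrumentation:
+#   $ make ENERGY=NO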
+
+ifeq ($(ENERGY), YES)
+OPTIONS += -D_ENERGY_PMT_
+
+# enable RAPL
+OPTIONS += -D_ENERGY_RAPL_
+# enable NVIDIA
+OPTIONS += -D_ENERGY_NVIDIA_
+# enable AMD
+OPTIONS += # -D_ENERGY_AMD_
+endif
+
+CC    = gcc
+CPP   = g++
+NVC   = nvc
+NVCPP = nvc++
+OPT   = -Wall -Wextra -O3 -fopenmp -march=native
+INC   = /home/darkenergy/software/pmt/include
+LIB   = /home/darkenergy/software/pmt/lib -lpmt -lm
+
+all: multiple_devices_omp
+.PHONY: clean test
+
+multiple_devices.o: multiple_devices.c Makefile
+	$(NVC) -O3 -mp=gpu $(OPTIONS) -c $< -o $@
+
+energy_pmt_methods.o: energy/energy_pmt_methods.cpp energy/energy_pmt_methods.h energy/energy_pmt.h Makefile
+	$(CPP) $(OPT) $(OPTIONS) -I$(INC) -c $< -o $@
+
+multiple_devices_omp: multiple_devices.o energy_pmt_methods.o
+	$(NVCPP) -mp=gpu $(OPTIONS) $^ -o $@ -L$(LIB)
+	ldd ./multiple_devices_omp
+
+test: multiple_devices_omp
+	./multiple_devices_omp
+
+clean:
+	rm -rf *.o multiple_devices_omp *~ energy/*~ energy/*.o
diff --git a/cuda-omp/omp/energy/energy/energy_pmt.h b/cuda-omp/omp/energy/energy/energy_pmt.h
new file mode 100644
index 0000000000000000000000000000000000000000..b59d1b14c647cd5898050474337a8dd60f9c1bd5
--- /dev/null
+++ b/cuda-omp/omp/energy/energy/energy_pmt.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include "energy_pmt_methods.h"
+
+#if defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+   #define PMT_CREATE(numGPUs) Create_PMT((numGPUs))
+#else
+   #define PMT_CREATE(numGPUs)
+#endif // defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+
+#if defined(_ENERGY_RAPL_)
+   #define PMT_CPU_START(string) Start_PMT_CPU((string))
+   #define PMT_CPU_STOP(string)  Stop_PMT_CPU((string))
+   #define PMT_CPU_SHOW(string)  Show_PMT_CPU((string))
+#else
+   #define PMT_CPU_START(string)
+   #define PMT_CPU_STOP(string)
+   #define PMT_CPU_SHOW(string)
+#endif // _ENERGY_RAPL_
+
+#if defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+   #define PMT_GPU_START(string) Start_PMT_GPU((string))
+   #define PMT_GPU_STOP(string)  Stop_PMT_GPU((string))
+   #define PMT_GPU_SHOW(string)  Show_PMT_GPU((string))
+#else
+   #define PMT_GPU_START(string)
+   #define PMT_GPU_STOP(string)
+   #define PMT_GPU_SHOW(string)
+#endif // defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+
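+// Minimal usage sketch (illustrative only; the label "kernel" is a placeholder):
+//
+//   PMT_CREATE(omp_get_num_devices());
+//
+//   PMT_CPU_START("kernel");
+//   /* ... host code to measure ... */
+//   PMT_CPU_STOP("kernel");
+//
+//   PMT_CPU_SHOW("kernel");   // accumulated [s], [J], and average [W]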
diff --git a/cuda-omp/omp/energy/energy/energy_pmt_methods.cpp b/cuda-omp/omp/energy/energy/energy_pmt_methods.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a5863426e8dcf1b4dd21b0edfe6f638f2cb25306
--- /dev/null
+++ b/cuda-omp/omp/energy/energy/energy_pmt_methods.cpp
@@ -0,0 +1,157 @@
+#if defined(_ENERGY_PMT_)
+
+#include <pmt.h>
+#include <assert.h>
+#include <iostream>
+#include <memory>
+#include <vector>
+#include <string>
+#include <map>
+
+struct EnergyState
+{
+  pmt::State   start;   // sensor reading taken by Start_PMT_*
+  pmt::State   stop;    // sensor reading taken by Stop_PMT_*
+  double       joules;  // accumulated energy over all start/stop intervals
+  double       watts;   // accumulated average power (divided by count when shown)
+  double       seconds; // accumulated elapsed time
+  unsigned int count;   // number of completed start/stop intervals
+};
+
+static bool PMT_ERROR{false};
+
+void PMT_err(void)
+{
+  std::cout << "\n\t PMT Error \n" << std::endl;
+  
+  return;
+}
+
+#if defined(_ENERGY_RAPL_)
+   static std::unique_ptr<pmt::PMT> sensor_cpu;
+   static std::map<std::string, EnergyState> state_cpu;
+#endif // _ENERGY_RAPL_
+
+#if defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+   static std::vector<std::unique_ptr<pmt::PMT>> sensor_gpu;
+   static std::vector<std::map<std::string, EnergyState>> state_gpu;
+#endif // defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+
+extern "C"
+{
+#include "energy_pmt_methods.h"
+}
+
+#if defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+void Create_PMT(const unsigned int numGPUs)
+{
+#if defined(_ENERGY_RAPL_)
+  
+  sensor_cpu = pmt::Create("rapl");
+
+#endif // _ENERGY_RAPL_
+  
+  if (numGPUs > 0)
+    {
+      for (unsigned int dev=0 ; dev<numGPUs ; dev++)
+	{
+#if defined(_ENERGY_NVIDIA_)
+	  sensor_gpu.push_back(pmt::nvml::NVML::Create(dev));
+#endif // _ENERGY_NVIDIA_
+
+#if defined(_ENERGY_AMD_)
+	  sensor_gpu.push_back(pmt::rocm::AMD::Create(dev));
+#endif // _ENERGY_AMD_
+	} // numGPUs
+    }
+
+  return;
+}
+#endif // defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+
+#if defined(_ENERGY_RAPL_)
+void Start_PMT_CPU(const char *label)
+{
+  if (PMT_ERROR)
+    {
+      PMT_err();
+      return;
+    }
+
+  // check if the label already exists
+  if (state_cpu.count(std::string{label}))
+    {
+      state_cpu[std::string{label}].start = sensor_cpu->Read();
+    }
+  else
+    {
+      // insert the key and initialize the counters
+      state_cpu.insert({std::string{label},
+			{sensor_cpu->Read(),
+			 {},
+			 0.0, 0.0, 0.0, 0
+			}});
+    }
+
+  return;
+}
+
+void Stop_PMT_CPU(const char *label)
+{
+  if (PMT_ERROR)
+    {
+      PMT_err();
+      return;
+    }
+  
+  // check if the label already exists
+  // if not error
+  if (!state_cpu.count(std::string{label}))
+    {
+      PMT_ERROR = true;
+      PMT_err();
+      return;
+    }
+  else
+    {
+      EnergyState &s = state_cpu[std::string{label}];
+
+      // read the sensor
+      s.stop = sensor_cpu->Read();
+
+      // accumulate elapsed time, energy, and average power over the interval
+      s.seconds += sensor_cpu->seconds(s.start, s.stop);
+      s.joules  += sensor_cpu->joules(s.start, s.stop);
+      s.watts   += sensor_cpu->watts(s.start, s.stop);
+
+      s.count++;
+    }
+  
+  return;
+}
+
+void Show_PMT_CPU(const char *label)
+{
+  if (PMT_ERROR || !state_cpu.count(std::string{label}))
+    {
+      PMT_err();
+      return;
+    }
+  else
+    {
+      const EnergyState &s = state_cpu[std::string{label}];
+
+      std::cout << "\n\t Kernel:" << label << ":" << std::endl;
+      std::cout << "\t\t" << s.seconds << " [s]" << std::endl;
+      std::cout << "\t\t" << s.joules  << " [J]" << std::endl;
+      std::cout << "\t\t" << (s.watts / s.count) << " [W]" << "\n" << std::endl;
+    }
+  
+  return;
+}
+#endif // _ENERGY_RAPL_
+
+#endif // _ENERGY_PMT_
diff --git a/cuda-omp/omp/energy/energy/energy_pmt_methods.h b/cuda-omp/omp/energy/energy/energy_pmt_methods.h
new file mode 100644
index 0000000000000000000000000000000000000000..a0479c644f11e93a4cb9d49a3826577d33bf1698
--- /dev/null
+++ b/cuda-omp/omp/energy/energy/energy_pmt_methods.h
@@ -0,0 +1,19 @@
+#pragma once
+
+#if defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+   void Create_PMT(const unsigned int numGPUs);
+#endif // defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+
+#if defined(_ENERGY_RAPL_)
+   void Start_PMT_CPU(const char *string);
+   void Stop_PMT_CPU(const char *string);
+   void Show_PMT_CPU(const char *string);
+#endif // _ENERGY_RAPL_
+
+#if defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+   void Start_PMT_GPU(const char *string);
+   void Stop_PMT_GPU(const char *string);
+   void Show_PMT_GPU(const char *string);
+#endif // defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+
+
diff --git a/cuda-omp/omp/energy/multiple_devices.c b/cuda-omp/omp/energy/multiple_devices.c
new file mode 100644
index 0000000000000000000000000000000000000000..c8212b8ac0afbd84837a7c0d11db061ed3c5a9ee
--- /dev/null
+++ b/cuda-omp/omp/energy/multiple_devices.c
@@ -0,0 +1,146 @@
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// 
+// Offload to multiple devices within the same node:
+// - using one OMP host-thread per device, synchronously;
+// - using a single OMP host-thread that drives all devices asynchronously.
+//
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 27.08.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v multiple_devices.c -o multiple_devices_omp
+// - Run the code:
+//   $ export OMP_TARGET_OFFLOAD=mandatory
+//   $ ./multiple_devices_omp
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <omp.h>
+#include <assert.h>
+
+typedef int MyData;
+#define N_PER_DEV   1000000
+#define BLOCKSIZE   256
+
+#if (BLOCKSIZE < 32) || (BLOCKSIZE > 1024)
+#error "32 <= BLOCKSIZE <= 1024"
+#endif
+
+#if (N_PER_DEV < BLOCKSIZE)
+#error "N_PER_DEV must be >= BLOCKSIZE"
+#endif
+
+#define TRUE  1
+#define FALSE 0
+
+#define NDEBUG
+
+void VectorAdd(const MyData *const restrict A,
+	       const MyData *const restrict B,
+	             MyData *const restrict C,
+	       const int                    offset,
+	       const int                    size,
+               const int                    dev,
+               const int                    nblocks,
+	       const int                    async)
+{
+#pragma omp target							\
+  teams num_teams(nblocks) thread_limit(BLOCKSIZE)			\
+  map(to: A[offset:size], B[offset:size]) map(from: C[offset:size])	\
+  device(dev) nowait
+  {
+    const int team  = omp_get_team_num();
+    const int team_start_index = (team * BLOCKSIZE) + offset;
+    // the last team may be partially filled when size is not a multiple of
+    // BLOCKSIZE, so clamp the upper bound to the end of this device's segment
+    const int segment_end      = (offset + size);
+    const int team_end_index   = (((team_start_index + BLOCKSIZE) < segment_end) ?
+				  (team_start_index + BLOCKSIZE) : segment_end);
+
+#pragma omp parallel num_threads(BLOCKSIZE)
+    {
+      const int localID = omp_get_thread_num();
+      const int block   = omp_get_num_threads();
+
+      int globalID = team_start_index + localID;
+
+      for (int index=globalID ; index<team_end_index ; index+=block)
+	C[index] = A[index] + B[index];	
+
+#if !defined(NDEBUG)
+
+      if ((localID == 0) && (team == 0))
+	printf("\n\t Device: %d - Teams: %d [requested: %d]- Thread per team: %d [requested: %d]",
+	       dev, omp_get_num_teams(), nblocks, block, BLOCKSIZE);
+#endif
+    } // omp parallel
+  } // omp target
+
+  // the target task is deferred by the 'nowait' clause: wait for it here
+  // when the caller asked for synchronous behaviour
+  if (!async)
+    {
+      #pragma omp taskwait
+    }
+
+  return;
+}
+
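+// element-wise comparison of host and device results; exact equality is
+// appropriate because MyData is an integer type (a tolerance would be
+// needed for floating-point data)
+void check(const MyData *const restrict C_CPU,
+	   const MyData *const restrict C_GPU,
+	   const int                    size)
+{
+  int flag = 0;
+  for (int i=0 ; i<size ; i++)
+    flag = ((C_CPU[i] != C_GPU[i]) ? 1 : flag);
+
+  printf("\n\t Result %s \n", (flag ? "wrong" : "OK"));
+
+  return;
+}
+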
+int main()
+{
+  // get the number of the available devices
+  const int NumDev = omp_get_num_devices();
+
+  const int nblocks = ((N_PER_DEV + BLOCKSIZE - 1) / BLOCKSIZE);
+  
+  // global vector size
+  const int size = (NumDev * N_PER_DEV);
+  assert(size > 0);
+
+  MyData *buffer = (MyData *)malloc(4 * size * sizeof(MyData));
+  assert(buffer != NULL);
+  MyData *const restrict A     = buffer;
+  MyData *const restrict B     = A + size;
+  MyData *const restrict C_CPU = B + size;
+  MyData *const restrict C_GPU = C_CPU + size;
+
+  // rand() is not thread-safe, so the initialization loop runs serially
+  for (int i=0 ; i<size ; i++)
+    {
+      A[i] = rand() % N_PER_DEV;
+      B[i] = rand() % N_PER_DEV;
+      C_CPU[i] = A[i] + B[i];
+    }
+
+  // each device is managed by a single OMP thread
+  #pragma omp parallel num_threads(NumDev)
+  {
+    // check
+    #pragma omp single
+    {
+      if (NumDev != omp_get_num_threads())
+	exit(EXIT_FAILURE);
+      else
+	{
+	  printf("\n\t Using %d GPUs \n", NumDev);
+	  fflush(stdout);
+	}
+    } // implicit barrier
+    
+    const int tid    = omp_get_thread_num();
+    const int offset = (tid * N_PER_DEV);
+    
+    VectorAdd(A, B, C_GPU, offset, N_PER_DEV, tid, nblocks, FALSE);
+  } // omp parallel
+
+  check(C_CPU, C_GPU, size);
+  memset(C_GPU, 0, (size * sizeof(MyData)));
+  
+  // a single OMP thread manages all the devices asynchronously
+  for (int dev=0 ; dev<NumDev ; dev++)
+    {
+      const int offset = (dev * N_PER_DEV);
+      VectorAdd(A, B, C_GPU, offset, N_PER_DEV, dev, nblocks, TRUE);
+    }
+
+  // host-devices synchronization
+  #pragma omp taskwait
+  check(C_CPU, C_GPU, size);
+  
+  free(buffer);
+  
+  return 0;
+}