diff --git a/cuda-omp/cuda/energy/Makefile b/cuda-omp/cuda/energy/Makefile
index 1c5cde978ab092b36b2b2a50ee4e5c732a655822..f9e41f3817ea0954eee2c67afdb0f2ec98c88154 100644
--- a/cuda-omp/cuda/energy/Makefile
+++ b/cuda-omp/cuda/energy/Makefile
@@ -9,22 +9,36 @@ OPTIONS += -D_ENERGY_RAPL_
 OPTIONS += -D_ENERGY_NVIDIA_
 # enable AMD
 OPTIONS += # -D_ENERGY_AMD_
+
+PMT = /leonardo/home/userexternal/dgoz0000/lib/pmt/local
+# PMT   = /home/darkenergy/software/pmt
+INC = -I$(PMT)/include
+LIB = -L$(PMT)/lib64 -lpmt -lm
+
 endif
 
-NVCPP = nvc++
+TOOLCHAIN ?= 
+
+NVCPP = nvc++ -std=c++17 # --gcc-toolchain=$(TOOLCHAIN)
 OPT   = -O3
-INC   = /home/darkenergy/software/pmt/include
-LIB   = /home/darkenergy/software/pmt/lib -lpmt -lm
+DEBUG = -O0 -g
 
 all: mat_mult
-.PHONY: clean test
+.PHONY: clean test debug
 
 mat_mult: mat_mult_block.cu energy/energy_pmt_methods.cpp energy/energy_pmt.h energy/energy_pmt_methods.h Makefile
-	$(NVCPP) $(OPT) $(OPTIONS) -I$(INC) mat_mult_block.cu energy/energy_pmt_methods.cpp -o $@ -L$(LIB)
+	$(NVCPP) $(OPT) $(OPTIONS) $(INC) mat_mult_block.cu energy/energy_pmt_methods.cpp -o $@ $(LIB)
 	ldd ./mat_mult
 
+mat_mult_debug: mat_mult_block.cu energy/energy_pmt_methods.cpp energy/energy_pmt.h energy/energy_pmt_methods.h Makefile
+	$(NVCPP) $(DEBUG) $(OPTIONS) $(INC) mat_mult_block.cu energy/energy_pmt_methods.cpp -o $@ $(LIB)
+	ldd ./mat_mult_debug
+
 test: mat_mult
 	./mat_mult
 
+debug: mat_mult_debug
+	cuda-gdb ./$<
+
 clean:
-	rm -rf *.o mat_mult *~ energy/*~ energy/*.o
+	rm -rf *.o mat_mult mat_mult_debug *~ energy/*~ energy/*.o
diff --git a/cuda-omp/cuda/energy/energy/energy_pmt.h b/cuda-omp/cuda/energy/energy/energy_pmt.h
index 2204ca2d11eb194ec691526710689eeff221b217..8a477305f904485189f589f0c07fc12575478a5d 100644
--- a/cuda-omp/cuda/energy/energy/energy_pmt.h
+++ b/cuda-omp/cuda/energy/energy/energy_pmt.h
@@ -5,7 +5,7 @@
 #if defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
    #define PMT_CREATE(devID, numGPUs) Create_PMT((devID), (numGPUs))
 #else
-   #define PMT_CREATE(numGPUs)
+   #define PMT_CREATE(devID, numGPUs)
 #endif // defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
 
 #if defined(_ENERGY_RAPL_)
@@ -23,8 +23,8 @@
    #define PMT_GPU_STOP(string, devID)  Stop_PMT_GPU((string), (devID))
    #define PMT_GPU_SHOW(string)  Show_PMT_GPU((string))
 #else
-   #define PMT_GPU_START(string, dev)
-   #define PMT_GPU_STOP(string, dev)
+   #define PMT_GPU_START(string, devID)
+   #define PMT_GPU_STOP(string, devID)
    #define PMT_GPU_SHOW(string)
 #endif // defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
 
diff --git a/cuda-omp/cuda/energy/energy/energy_pmt_methods.cpp b/cuda-omp/cuda/energy/energy/energy_pmt_methods.cpp
index f41101d24385d703c45c5948e6cdb67ced827471..d7a82fa5ec77bac1f2026fb8963480a58e845089 100644
--- a/cuda-omp/cuda/energy/energy/energy_pmt_methods.cpp
+++ b/cuda-omp/cuda/energy/energy/energy_pmt_methods.cpp
@@ -49,7 +49,7 @@ void Create_PMT(int *devID,
 {
 #if defined(_ENERGY_RAPL_)
 
-  sensor_cpu = pmt::Create("rapl");
+  sensor_cpu = pmt::rapl::Rapl::Create();
 
 #endif // _ENERGY_RAPL_
   
@@ -61,9 +61,9 @@ void Create_PMT(int *devID,
 	  sensor_gpu.insert({devID[dev],
 
 #if defined(_ENERGY_NVIDIA_)
-	                     pmt::nvml::NVML::Create(dev)});
+	                     pmt::nvml::NVML::Create(devID[dev])});
 #elif defined(_ENERGY_AMD_)
-	                     pmt::rocm::AMD::Create(dev)});
+	                     pmt::rocm::AMD::Create(devID[dev])});
 #endif
 
 #endif // defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
@@ -83,19 +83,25 @@ void Start_PMT_CPU(const char *label)
       return;
     }
 
+  const std::string tag{std::string{label}};
+
   // check if the label already exists
-  if (state_cpu.count(std::string{label}))
+  if (state_cpu.count(tag))
     {
-      state_cpu[std::string{label}].start = sensor_cpu->Read();
+      state_cpu[tag].start = sensor_cpu->Read();
     }
   else
     {
+      // create new EnergyState
+      const EnergyState newState{sensor_cpu->Read(),
+                                 static_cast<pmt::State>(0),
+                                 static_cast<double>(0),
+                                 static_cast<double>(0),
+                                 static_cast<double>(0),
+                                 static_cast<unsigned int>(0)};
+
       // insert the key and initialize the counters
-      state_cpu.insert({std::string{label},
-			{sensor_cpu->Read(),
-			 0,
-			 0.0, 0.0, 0.0, 0
-			}});
+      state_cpu.insert(std::pair<std::string, EnergyState>(tag, newState));
     }
 
   return;
@@ -108,10 +114,12 @@ void Stop_PMT_CPU(const char *label)
       PMT_err();
       return;
     }
+
+  const std::string tag{std::string{label}};
   
   // check if the label already exists
   // if not error
-  if (!state_cpu.count(std::string{label}))
+  if (!state_cpu.count(tag))
     {
       PMT_ERROR = true;
       PMT_err();
@@ -119,23 +127,20 @@ void Stop_PMT_CPU(const char *label)
     }
   else
     {
+      // get the energy state
+      EnergyState &State = state_cpu[tag];
+      
       // read the counter
-      state_cpu[std::string{label}].stop = sensor_cpu->Read();
+      State.stop = sensor_cpu->Read();
 
       // update quantities
-      state_cpu[std::string{label}].seconds +=
-	sensor_cpu->seconds(state_cpu[std::string{label}].start,
-			    state_cpu[std::string{label}].stop);
+      State.seconds += sensor_cpu->seconds(State.start, State.stop);
       
-      state_cpu[std::string{label}].joules +=
-	sensor_cpu->joules(state_cpu[std::string{label}].start,
-			   state_cpu[std::string{label}].stop);
+      State.joules  += sensor_cpu->joules(State.start, State.stop);
 
-      state_cpu[std::string{label}].watts +=
-	sensor_cpu->watts(state_cpu[std::string{label}].start,
-			  state_cpu[std::string{label}].stop);
+      State.watts   += sensor_cpu->watts(State.start, State.stop);
 
-      state_cpu[std::string{label}].count++;
+      State.count++;
     }
   
   return;
@@ -143,17 +148,19 @@ void Stop_PMT_CPU(const char *label)
 
 void Show_PMT_CPU(const char *label)
 {
-  if (PMT_ERROR || !state_cpu.count(std::string{label}))
+  const std::string tag{std::string{label}};
+  
+  if (PMT_ERROR || !state_cpu.count(tag))
     {
       PMT_err();
       return;
     }
   else
     {
-      std::cout << "\n\t CPU Kernel:" << std::string{label} << ":" << std::endl;
-      std::cout << "\t\t" << state_cpu[std::string{label}].seconds << " [S]" << std::endl;
-      std::cout << "\t\t" << state_cpu[std::string{label}].joules  << " [J]" << std::endl;
-      std::cout << "\t\t" << state_cpu[std::string{label}].watts / state_cpu[std::string{label}].count  << " [W]" << "\n" << std::endl;
+      std::cout << "\n\t CPU Kernel:" << tag << ":" << std::endl;
+      std::cout << "\t\t" << state_cpu[tag].seconds << " [S]" << std::endl;
+      std::cout << "\t\t" << state_cpu[tag].joules  << " [J]" << std::endl;
+      std::cout << "\t\t" << state_cpu[tag].watts / state_cpu[tag].count  << " [W]" << "\n" << std::endl;
     }
   
   return;
@@ -174,23 +181,28 @@ void Start_PMT_GPU(const char *label,
   if (!state_gpu.count(devID))
     {
       // insert devID
-      state_gpu.insert({devID, {}});
+      state_gpu.insert(std::pair<int, std::map<std::string, EnergyState>>(devID, {}));
     }
+
+  const std::string tag{std::string{label}};
+  
   // check if the label already exists
-  if (state_gpu[devID].count(std::string{label}))
+  if (state_gpu[devID].count(tag))
     {
       // read the sensor
-      state_gpu[devID][std::string{label}].start = sensor_gpu[devID]->Read();
+      state_gpu[devID][tag].start = sensor_gpu[devID]->Read();
     }
   else
     {
       // insert the label and initialize the counters
-      state_gpu[devID].insert({std::string{label},
-			       {
-				 sensor_gpu[devID]->Read(),
-				 0,
-				 0.0, 0.0, 0.0, 0
-			       }});
+      const EnergyState newState{sensor_gpu[devID]->Read(),
+                                 static_cast<pmt::State>(0),
+                                 static_cast<double>(0),
+                                 static_cast<double>(0),
+                                 static_cast<double>(0),
+                                 static_cast<unsigned int>(0)};
+
+      state_gpu[devID].insert(std::pair<std::string, EnergyState>(tag, newState));      
     }
 
   return;
@@ -209,9 +221,11 @@ void Stop_PMT_GPU(const char *label,
     }
   else
     {
+      const std::string tag{std::string{label}};
+      
       // check if the label already exists
       // if not error
-      if (!state_gpu[devID].count(std::string{label}))
+      if (!state_gpu[devID].count(tag))
 	{
 	  PMT_ERROR = true;
 	  PMT_err();
@@ -219,23 +233,25 @@ void Stop_PMT_GPU(const char *label,
 	}
       else
 	{
+	  EnergyState &State = state_gpu[devID][tag];
+	  
 	  // read the counter
-	  state_gpu[devID][std::string{label}].stop = sensor_gpu[devID]->Read();
+	  State.stop = sensor_gpu[devID]->Read();
 
 	  // update quantities
-	  state_gpu[devID][std::string{label}].seconds +=
-	    sensor_gpu[devID]->seconds(state_gpu[devID][std::string{label}].start,
-				       state_gpu[devID][std::string{label}].stop);
+	  State.seconds +=
+	    sensor_gpu[devID]->seconds(State.start,
+				       State.stop);
       
-	  state_gpu[devID][std::string{label}].joules +=
-	    sensor_gpu[devID]->joules(state_gpu[devID][std::string{label}].start,
-				      state_gpu[devID][std::string{label}].stop);
+	  State.joules +=
+	    sensor_gpu[devID]->joules(State.start,
+				      State.stop);
 
-	  state_gpu[devID][std::string{label}].watts +=
-	    sensor_gpu[devID]->watts(state_gpu[devID][std::string{label}].start,
-				     state_gpu[devID][std::string{label}].stop);
+	  State.watts +=
+	    sensor_gpu[devID]->watts(State.start,
+				     State.stop);
       
-	  state_gpu[devID][std::string{label}].count++;
+	  State.count++;
 	}
     }
   
@@ -251,13 +267,17 @@ void Show_PMT_GPU(const char *label)
     }
   else
     {
-      // show quantities for all devices
-      for (const auto& [key, value]: state_gpu)
+      const std::string tag{std::string{label}};
+
+      for (const auto &[key, value]: state_gpu)
 	{
-	  std::cout << "\n\t GPU [" << key << "] kernel:" << std::string{label} << ":" << std::endl;
-	  std::cout << "\t\t" << value.at(std::string{label}).seconds << " [s]" << std::endl;
-	  std::cout << "\t\t" << value.at(std::string{label}).joules  << " [J]" << std::endl;
-	  std::cout << "\t\t" << value.at(std::string{label}).watts / value.at(std::string{label}).count  << " [W]" << "\n" << std::endl;
+	  if (value.count(tag))
+	    {
+	      std::cout << "\n\t GPU [" << key << "] kernel:" << tag << ":" << std::endl;
+	      std::cout << "\t\t" << value.at(tag).seconds << " [s]" << std::endl;
+	      std::cout << "\t\t" << value.at(tag).joules  << " [J]" << std::endl;
+	      std::cout << "\t\t" << value.at(tag).watts / value.at(tag).count  << " [W]" << "\n" << std::endl;
+	    }
 	}
     }
   
diff --git a/cuda-omp/cuda/energy/mat_mult_block.cu b/cuda-omp/cuda/energy/mat_mult_block.cu
index 5e47928c3a6de1b58567d847537909743bc2bc75..112b91c819d1b386ac1bb8eca6858b2fab31fc57 100644
--- a/cuda-omp/cuda/energy/mat_mult_block.cu
+++ b/cuda-omp/cuda/energy/mat_mult_block.cu
@@ -39,6 +39,7 @@
 //////////////////////////////////////////////////////////////////////////////////////////////////
 
 #include <stdio.h>
+#include <string.h>
 #include <stdlib.h>
 #include <unistd.h>
 #include <time.h>
@@ -51,20 +52,21 @@
 #include "energy/energy_pmt.h"
 
 #define N                    1024
-#define SIZE                 (N * N) // matrix size
+#define SIZE                 ((N) * (N)) // matrix size
 typedef double MyData;               // do not change
-#define BLOCK                16
+#define BLOCK_               16
+#define BLOCKSIZE            ((BLOCK_) * (BLOCK_))
 
 // sanity check
-#if BLOCK * BLOCK > 1024
+#if BLOCKSIZE > 1024
 #error BLOCKSIZE must be <= 1024
 #endif
 
-#if BLOCK * BLOCK > SIZE
+#if BLOCKSIZE > SIZE
 #error BLOCKSIZE must be <= SIZE
 #endif
 
-#define LOOP 100
+#define LOOP 3
 #define NDEBUG
 
 double wall_time()
@@ -94,20 +96,20 @@ void CPU_mat_mult_block(const MyData *const __restrict__ A,
 	                      MyData *const __restrict__ C,
 			const size_t                 size)
 {
-  const size_t Nblocks = (size / BLOCK);
+  const size_t Nblocks = (size / BLOCK_);
 
   // loop over blocks of matrix C
   for (size_t ib=0 ; ib<Nblocks ; ib++)
     {
       for (size_t jb=0 ; jb<Nblocks ; jb++)
 	{       
-	  // loop over blocks of rows of A       
+	  // loop over blocks of rows of A
 	  for (size_t kb=0 ; kb<Nblocks ; kb++)
 	    {
 	      // Matrix multiplication within a block
-	      for (size_t i=(ib * BLOCK) ; i<((ib + 1) * BLOCK) ; i++)
-		for (size_t j=(jb * BLOCK) ; j<((jb + 1) * BLOCK) ; j++)
-		  for (size_t k=(kb * BLOCK) ; k<((kb + 1) * BLOCK) ; k++)
+	      for (size_t i=(ib * BLOCK_) ; i<((ib + 1) * BLOCK_) ; i++)
+		for (size_t j=(jb * BLOCK_) ; j<((jb + 1) * BLOCK_) ; j++)
+		  for (size_t k=(kb * BLOCK_) ; k<((kb + 1) * BLOCK_) ; k++)
 		    C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];
 	    } // kb
 	} // jb
@@ -127,25 +129,25 @@ __global__ void GPU_mat_mult_block(const MyData *const __restrict__ A,
   if (globalID >= size2)
     return;
   
-  const size_t Nblocks  = size / BLOCK;           // number of blocks to loop over A and B
-  const size_t ib       = blockIdx.x / Nblocks;   // indexes of starting matrix's block
+  const size_t Nblocks  = size / BLOCK_;           // number of blocks to loop over A and B
+  const size_t ib       = blockIdx.x / Nblocks;    // indexes of starting matrix's block
   const size_t jb       = blockIdx.x % Nblocks;
-  const size_t i_local  = threadIdx.x / BLOCK;    // local matrix's indexes mapped to each CUDA thread
-  const size_t j_local  = threadIdx.x % BLOCK;    // within its own block
-  const size_t i_global = i_local + (ib * BLOCK); // global matrix's indexes mapped to each CUDA thread
-  const size_t j_global = j_local + (jb * BLOCK); // N.B. uncoalescent memory accesses to A and B matrices
+  const size_t i_local  = threadIdx.x / BLOCK_;    // local matrix's indexes mapped to each CUDA thread
+  const size_t j_local  = threadIdx.x % BLOCK_;    // within its own block
+  const size_t i_global = i_local + (ib * BLOCK_); // global matrix's indexes mapped to each CUDA thread
+  const size_t j_global = j_local + (jb * BLOCK_); // N.B. uncoalescent memory accesses to A and B matrices
   
   C[(i_global * size) + j_global] = (MyData)0;
 
   // loop over blocks
   for (size_t block=0 ; block<Nblocks ; block++)
     {
-      const size_t j_A = (block * BLOCK);
-      const size_t i_B = (block * BLOCK);
+      const size_t j_A = (block * BLOCK_);
+      const size_t i_B = (block * BLOCK_);
 
       // perform the matrix multiplication within the block
       MyData value = (MyData)0;
-      for (size_t k=0 ; k<BLOCK ; k++)
+      for (size_t k=0 ; k<BLOCK_ ; k++)
 	value += A[(i_global * size) + k + j_A] * B[((k + i_B) * size) + j_global];
 
       C[(i_global * size) + j_global] += value;
@@ -159,54 +161,54 @@ __global__ void GPU_mat_mult_block_shared(const MyData *const __restrict__ A,
 					        MyData *const __restrict__ C,
 					  const size_t                 size)
 {
-  __shared__ MyData Ablock[BLOCK * BLOCK];
-  __shared__ MyData Bblock[BLOCK * BLOCK];
-  __shared__ MyData Cblock[BLOCK * BLOCK];
+  __shared__ MyData Ablock[BLOCKSIZE];
+  __shared__ MyData Bblock[BLOCKSIZE];
+  __shared__ MyData Cblock[BLOCKSIZE];
   
   const size_t globalID = threadIdx.x + (blockIdx.x * blockDim.x);
   const size_t size2 = (size * size);
   if (globalID >= size2)
     return;
   
-  const size_t Nblocks  = size / BLOCK;           // number of blocks to loop over
-  const size_t ib       = blockIdx.x / Nblocks;   // indexes of starting matrix's block
+  const size_t Nblocks  = size / BLOCK_;           // number of blocks to loop over
+  const size_t ib       = blockIdx.x / Nblocks;    // indexes of starting matrix's block
   const size_t jb       = blockIdx.x % Nblocks;
-  const size_t i_local  = threadIdx.x / BLOCK;    // local matrix's indexes mapped to each CUDA thread
-  const size_t j_local  = threadIdx.x % BLOCK;    // within its own block
-  const size_t i_global = i_local + (ib * BLOCK); // global matrix's indexes mapped to each CUDA thread
-  const size_t j_global = j_local + (jb * BLOCK); // N.B. uncoalescent memory accesses to A and B matrices
+  const size_t i_local  = threadIdx.x / BLOCK_;    // local matrix's indexes mapped to each CUDA thread
+  const size_t j_local  = threadIdx.x % BLOCK_;    // within its own block
+  const size_t i_global = i_local + (ib * BLOCK_); // global matrix's indexes mapped to each CUDA thread
+  const size_t j_global = j_local + (jb * BLOCK_); // N.B. uncoalescent memory accesses to A and B matrices
 
   // Init Cblock
-  Cblock[(i_local * BLOCK) + j_local] = (MyData)0;
+  Cblock[(i_local * BLOCK_) + j_local] = (MyData)0;
 
   // loop over blocks
   for (size_t block=0 ; block<Nblocks ; block++)
     {
-      const size_t j_A = (block * BLOCK);
-      const size_t i_B = (block * BLOCK);
+      const size_t j_A = (block * BLOCK_);
+      const size_t i_B = (block * BLOCK_);
 
       // waits until all threads in the thread block have reached this point and shared memory accesses
       // made by these threads prior to __syncthreads() are visible to all threads in the block.
       __syncthreads();
       
       // copy block of A into shared memory
-      Ablock[(i_local * BLOCK) + j_local] = A[(i_global * size) + j_local + j_A];
+      Ablock[(i_local * BLOCK_) + j_local] = A[(i_global * size) + j_local + j_A];
       // copy block of B into shared memory
-      Bblock[(i_local * BLOCK) + j_local] = B[((i_local + i_B) * size) + j_global];
+      Bblock[(i_local * BLOCK_) + j_local] = B[((i_local + i_B) * size) + j_global];
       
       // waits until all threads in the thread block have reached this point and shared memory accesses      // made by these threads prior to __syncthreads() are visible to all threads in the block.
       __syncthreads();
 
       // perform the matrix multiplication within the block
       MyData value = (MyData)0;
-      for (size_t k=0 ; k<BLOCK ; k++)
-	value += Ablock[(i_local * BLOCK) + k] * Bblock[(k * BLOCK) + j_local];
+      for (size_t k=0 ; k<BLOCK_ ; k++)
+	value += Ablock[(i_local * BLOCK_) + k] * Bblock[(k * BLOCK_) + j_local];
 
       // store the partial result in shared memory
-      Cblock[(i_local * BLOCK) + j_local] += value;
+      Cblock[(i_local * BLOCK_) + j_local] += value;
     }
 
-  C[(i_global * size) + j_global] = Cblock[(i_local * BLOCK) + j_local];
+  C[(i_global * size) + j_global] = Cblock[(i_local * BLOCK_) + j_local];
   
   return;
 }
@@ -257,9 +259,13 @@ int main()
   PMT_CREATE(&devID, 1);
   
   ////////////////////////// CPU naive algorithm //////////////////////////////////////////
-  PMT_CPU_START("CPU_mat_mult_block");
-  CPU_mat_mult_block(A_CPU, B_CPU, C_CPU, N);
-  PMT_CPU_STOP("CPU_mat_mult_block");
+  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
+    {
+      PMT_CPU_START("CPU_mat_mult_block");
+      memset(C_CPU, 0, SIZE * sizeof(MyData));
+      CPU_mat_mult_block(A_CPU, B_CPU, C_CPU, N);
+      PMT_CPU_STOP("CPU_mat_mult_block");
+    }
   /////////////////////////////////////////////////////////////////////////////////////////
   
   // copy/alloc data to the GPU
@@ -270,22 +276,22 @@ int main()
   MyData *const C_GPU = B_GPU + SIZE;
   cudaMemcpy(A_GPU, A_CPU, (2 * SIZE * sizeof(MyData)), cudaMemcpyHostToDevice);
 
-  const dim3 nblocks = {(SIZE + (BLOCK * BLOCK)  -1)/(BLOCK * BLOCK), 1, 1};
-  const dim3 block   = {(BLOCK * BLOCK), 1, 1};
+  const unsigned int nblocks = ((SIZE + (BLOCKSIZE) - 1) / BLOCKSIZE);
+  const unsigned int block   = BLOCKSIZE;
   
   /////////////////////////// GPU naive block algorithm ////////////////////////////////////////
   GPU_mat_mult_block<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up
   cudaDeviceSynchronize();
   time = 0.0;
-  PMT_GPU_START("GPU_mat_mult_block", 0);
   for (unsigned short int loop=0 ; loop<LOOP ; loop++)
     {
+      PMT_GPU_START("GPU_mat_mult_block", 0);
       const double start = wall_time();
       GPU_mat_mult_block<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N);
       cudaDeviceSynchronize();
       time += (wall_time() - start);
+      PMT_GPU_STOP("GPU_mat_mult_block", 0);
     }
-  PMT_GPU_STOP("GPU_mat_mult_block", 0);
   cudaMemcpy(C_GPU_host, C_GPU, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);
   
   check(C_CPU, C_GPU_host);
@@ -296,15 +302,15 @@ int main()
   GPU_mat_mult_block_shared<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up
   cudaDeviceSynchronize();
   time = 0.0;
-  PMT_GPU_START("GPU_mat_mult_block_shared", 0);
   for (unsigned short int loop=0 ; loop<LOOP ; loop++)
     {
+      PMT_GPU_START("GPU_mat_mult_block_shared", 0);
       const double start = wall_time();
       GPU_mat_mult_block_shared<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N);
       cudaDeviceSynchronize();
       time += (wall_time() - start);
+      PMT_GPU_STOP("GPU_mat_mult_block_shared", 0);
     }
-  PMT_GPU_STOP("GPU_mat_mult_block_shared", 0);
   cudaMemcpy(C_GPU_host, C_GPU, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);
   
   check(C_CPU, C_GPU_host);
diff --git a/cuda-omp/cuda/energy/mat_mult_debug b/cuda-omp/cuda/energy/mat_mult_debug
new file mode 100755
index 0000000000000000000000000000000000000000..9f6ea1f82cd4464f308f32796057caf125b5db37
Binary files /dev/null and b/cuda-omp/cuda/energy/mat_mult_debug differ
diff --git a/cuda-omp/omp/energy/Makefile b/cuda-omp/omp/energy/Makefile
index 56ef87ef1e4d251627b0ac6dad79d680582aaaa3..5e592b6bb2750e14624d3542da298ea45a5adb9f 100644
--- a/cuda-omp/omp/energy/Makefile
+++ b/cuda-omp/omp/energy/Makefile
@@ -9,30 +9,32 @@ OPTIONS += -D_ENERGY_RAPL_
 OPTIONS += -D_ENERGY_NVIDIA_
 # enable AMD
 OPTIONS += # -D_ENERGY_AMD_
+
+PMT   = /leonardo/home/userexternal/dgoz0000/lib/pmt/local
+INC   = -I$(PMT)/include
+LIB   = -L$(PMT)/lib64 -lpmt -lm
+
 endif
 
-CC  = gcc
-CPP = g++
-NVC = nvc++
-OPT = -Wall -Wextra -O3 -fopenmp -march=native
-INC = /home/darkenergy/software/pmt/include
-LIB = /home/darkenergy/software/pmt/lib -lpmt -lm
+NVC   = nvc
+NVCPP = nvc++ -std=c++17
+CPP   = g++ -std=c++17
+OPT   = -O3 -mp=gpu,multicore -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v
 
-all: prog
+all: multiple_devices
 .PHONY: clean test
 
-mat_mult_block.o: mat_mult_block.cu energy/energy_pmt.h Makefile
-	$(NVC) -O3 $(OPTIONS) $< -c $@
+energy_pmt_methods.o: energy/energy_pmt_methods.cpp energy/energy_pmt.h energy/energy_pmt_methods.h Makefile
+	$(CPP) $(OPTIONS) $(INC) -c $< -o $@
 
-energy_pmt_methods.o: energy/energy_pmt_methods.cpp energy/energy_pmt_methods.h energy/energy_pmt.h Makefile
-	$(CPP) $(OPTIONS) -I$(INC) $< -c $@
+multiple_devices.o: multiple_devices.c energy/energy_pmt.h Makefile
+	$(NVC) $(OPT) $(OPTIONS) -c $< -o $@
 
-mat_mult: mat_mult_block.o energy_pmt_methods.o
-	$(NVC) $(OPTIONS) $^ -o $@ -L$(LIB)
-	ldd mat_mult
+multiple_devices: multiple_devices.o energy_pmt_methods.o
+	$(NVCPP) $(OPT) $(OPTIONS) $^ -o $@ $(LIB)
 
-test: mat_mult
-	./mat_mult
+test: multiple_devices
+	./multiple_devices
 
 clean:
-	rm -rf *.o mat_mult *~ energy/*~ energy/*.o
+	rm -rf *.o multiple_devices *~ ./energy/*~
diff --git a/cuda-omp/omp/energy/energy/energy_pmt.h b/cuda-omp/omp/energy/energy/energy_pmt.h
index b59d1b14c647cd5898050474337a8dd60f9c1bd5..adf05e03ae38782ec2ef5f090473d3340a3002f6 100644
--- a/cuda-omp/omp/energy/energy/energy_pmt.h
+++ b/cuda-omp/omp/energy/energy/energy_pmt.h
@@ -3,9 +3,9 @@
 #include "energy_pmt_methods.h"
 
 #if defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
-   #define PMT_CREATE(numGPUs) Create_PMT((numGPUs))
+   #define PMT_CREATE(devID, numGPUs) Create_PMT((devID), (numGPUs))
 #else
-   #define PMT_CREATE(numGPUs)
+   #define PMT_CREATE(devID, numGPUs)
 #endif // defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
 
 #if defined(_ENERGY_RAPL_)
@@ -19,12 +19,11 @@
 #endif // _ENERGY_RAPL_
 
 #if defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
-   #define PMT_GPU_START(string) Start_PMT_GPU((string))
-   #define PMT_GPU_STOP(string)  Stop_PMT_GPU((string))
-   #define PMT_GPU_SHOW(string)  Show_PMT_GPU((string))
+   #define PMT_GPU_START(string, devID) Start_PMT_GPU((string), (devID))
+   #define PMT_GPU_STOP(string, devID)  Stop_PMT_GPU((string), (devID))
+   #define PMT_GPU_SHOW(string)         Show_PMT_GPU((string))
 #else
-   #define PMT_GPU_START(string)
-   #define PMT_GPU_STOP(string)
+   #define PMT_GPU_START(string, devID)
+   #define PMT_GPU_STOP(string, devID)
    #define PMT_GPU_SHOW(string)
 #endif // defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
-
diff --git a/cuda-omp/omp/energy/energy/energy_pmt_methods.cpp b/cuda-omp/omp/energy/energy/energy_pmt_methods.cpp
index a5863426e8dcf1b4dd21b0edfe6f638f2cb25306..d7a82fa5ec77bac1f2026fb8963480a58e845089 100644
--- a/cuda-omp/omp/energy/energy/energy_pmt_methods.cpp
+++ b/cuda-omp/omp/energy/energy/energy_pmt_methods.cpp
@@ -32,35 +32,41 @@ void PMT_err(void)
 #endif // _ENERGY_RAPL_
 
 #if defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
-   static std::vector<std::unique_ptr<pmt::PMT>> sensor_gpu;
-   static std::vector<std::map<std::string, EnergyState>> state_gpu;
+   static std::map<int, std::unique_ptr<pmt::PMT>> sensor_gpu;
+   static std::map<int, std::map<std::string, EnergyState>> state_gpu;
 #endif // defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
 
-extern "C"
-{
-#include "energy_pmt_methods.h"
-}
+#if !defined(__NVCC__) && !defined(__NVCOMPILER)
+   extern "C"
+      {
+         #include "energy_pmt_methods.h"
+      }
+#endif // !defined(__NVCC__) && !defined(__NVCOMPILER)
 
 #if defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
-void Create_PMT(const unsigned int numGPUs)
+void Create_PMT(int *devID,
+		const unsigned int numGPUs)
 {
 #if defined(_ENERGY_RAPL_)
-  
-  sensor_cpu = pmt::Create("rapl");
+
+  sensor_cpu = pmt::rapl::Rapl::Create();
 
 #endif // _ENERGY_RAPL_
   
-  if (numGPUs > 0)
+  if ((numGPUs > 0) && (devID != nullptr))
     {
       for (unsigned int dev=0 ; dev<numGPUs ; dev++)
 	{
+#if defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+	  sensor_gpu.insert({devID[dev],
+
 #if defined(_ENERGY_NVIDIA_)
-	  sensor_gpu.push_back(pmt::nvml::NVML::Create(dev));
-#endif // _ENERGY_NVIDIA_
+	                     pmt::nvml::NVML::Create(devID[dev])});
+#elif defined(_ENERGY_AMD_)
+	                     pmt::rocm::AMD::Create(devID[dev])});
+#endif
 
-#if defined(_ENERGY_AMD_)
-	  sensor_gpu.push_back(pmt::rocm::AMD::Create(dev));
-#endif // _ENERGY_AMD_
+#endif // defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
 	} // numGPUs
     }
 
@@ -77,19 +83,25 @@ void Start_PMT_CPU(const char *label)
       return;
     }
 
+  const std::string tag{std::string{label}};
+
   // check if the label already exists
-  if (state_cpu.count(std::string{label}))
+  if (state_cpu.count(tag))
     {
-      state_cpu[std::string{label}].start = sensor_cpu->Read();
+      state_cpu[tag].start = sensor_cpu->Read();
     }
   else
     {
+      // create new EnergyState
+      const EnergyState newState{sensor_cpu->Read(),
+                                 static_cast<pmt::State>(0),
+                                 static_cast<double>(0),
+                                 static_cast<double>(0),
+                                 static_cast<double>(0),
+                                 static_cast<unsigned int>(0)};
+
       // insert the key and initialize the counters
-      state_cpu.insert({std::string{label},
-			{sensor_cpu->Read(),
-			 0,
-			 0.0, 0.0, 0.0, 0
-			}});
+      state_cpu.insert(std::pair<std::string, EnergyState>(tag, newState));
     }
 
   return;
@@ -102,10 +114,12 @@ void Stop_PMT_CPU(const char *label)
       PMT_err();
       return;
     }
+
+  const std::string tag{std::string{label}};
   
   // check if the label already exists
   // if not error
-  if (!state_cpu.count(std::string{label}))
+  if (!state_cpu.count(tag))
     {
       PMT_ERROR = true;
       PMT_err();
@@ -113,23 +127,20 @@ void Stop_PMT_CPU(const char *label)
     }
   else
     {
+      // get the energy state
+      EnergyState &State = state_cpu[tag];
+      
       // read the counter
-      state_cpu[std::string{label}].stop = sensor_cpu->Read();
+      State.stop = sensor_cpu->Read();
 
       // update quantities
-      state_cpu[std::string{label}].seconds +=
-	sensor_cpu->seconds(state_cpu[std::string{label}].start,
-			    state_cpu[std::string{label}].stop);
+      State.seconds += sensor_cpu->seconds(State.start, State.stop);
       
-      state_cpu[std::string{label}].joules +=
-	sensor_cpu->joules(state_cpu[std::string{label}].start,
-			   state_cpu[std::string{label}].stop);
+      State.joules  += sensor_cpu->joules(State.start, State.stop);
 
-      state_cpu[std::string{label}].watts +=
-	sensor_cpu->watts(state_cpu[std::string{label}].start,
-			  state_cpu[std::string{label}].stop);
+      State.watts   += sensor_cpu->watts(State.start, State.stop);
 
-      state_cpu[std::string{label}].count++;
+      State.count++;
     }
   
   return;
@@ -137,21 +148,142 @@ void Stop_PMT_CPU(const char *label)
 
 void Show_PMT_CPU(const char *label)
 {
-  if (PMT_ERROR || !state_cpu.count(std::string{label}))
+  const std::string tag{std::string{label}};
+  
+  if (PMT_ERROR || !state_cpu.count(tag))
     {
       PMT_err();
       return;
     }
   else
     {
-      std::cout << "\n\t Kernel:" << std::string{label} << ":" << std::endl;
-      std::cout << "\t\t" << state_cpu[std::string{label}].seconds << " [S]" << std::endl;
-      std::cout << "\t\t" << state_cpu[std::string{label}].joules  << " [J]" << std::endl;
-      std::cout << "\t\t" << state_cpu[std::string{label}].watts / state_cpu[std::string{label}].count  << " [W]" << "\n" << std::endl;
+      std::cout << "\n\t CPU Kernel:" << tag << ":" << std::endl;
+      std::cout << "\t\t" << state_cpu[tag].seconds << " [S]" << std::endl;
+      std::cout << "\t\t" << state_cpu[tag].joules  << " [J]" << std::endl;
+      std::cout << "\t\t" << state_cpu[tag].watts / state_cpu[tag].count  << " [W]" << "\n" << std::endl;
     }
   
   return;
 }
 #endif // _ENERGY_RAPL_
 
+#if defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+void Start_PMT_GPU(const char *label,
+		   const int  devID)
+{
+  if (PMT_ERROR || !sensor_gpu.count(devID))
+    {
+      PMT_err();
+      return;
+    }
+
+  // check if the devID already exists
+  if (!state_gpu.count(devID))
+    {
+      // insert devID
+      state_gpu.insert(std::pair<int, std::map<std::string, EnergyState>>(devID, {}));
+    }
+
+  const std::string tag{std::string{label}};
+  
+  // check if the label already exists
+  if (state_gpu[devID].count(tag))
+    {
+      // read the sensor
+      state_gpu[devID][tag].start = sensor_gpu[devID]->Read();
+    }
+  else
+    {
+      // insert the label and initialize the counters
+      const EnergyState newState{sensor_gpu[devID]->Read(),
+                                 static_cast<pmt::State>(0),
+                                 static_cast<double>(0),
+                                 static_cast<double>(0),
+                                 static_cast<double>(0),
+                                 static_cast<unsigned int>(0)};
+
+      state_gpu[devID].insert(std::pair<std::string, EnergyState>(tag, newState));      
+    }
+
+  return;
+}
+
+void Stop_PMT_GPU(const char *label,
+		  const int   devID)
+{
+  // check if the devID already exists
+  // if not error
+  if (!state_gpu.count(devID) || PMT_ERROR || !sensor_gpu.count(devID))
+    {
+      PMT_ERROR = true;
+      PMT_err();
+      return;
+    }
+  else
+    {
+      const std::string tag{std::string{label}};
+      
+      // check if the label already exists
+      // if not error
+      if (!state_gpu[devID].count(tag))
+	{
+	  PMT_ERROR = true;
+	  PMT_err();
+	  return;
+	}
+      else
+	{
+	  EnergyState &State = state_gpu[devID][tag];
+	  
+	  // read the counter
+	  State.stop = sensor_gpu[devID]->Read();
+
+	  // update quantities
+	  State.seconds +=
+	    sensor_gpu[devID]->seconds(State.start,
+				       State.stop);
+      
+	  State.joules +=
+	    sensor_gpu[devID]->joules(State.start,
+				      State.stop);
+
+	  State.watts +=
+	    sensor_gpu[devID]->watts(State.start,
+				     State.stop);
+      
+	  State.count++;
+	}
+    }
+  
+  return;
+}
+
+void Show_PMT_GPU(const char *label)
+{
+  if (PMT_ERROR)
+    {
+      PMT_err();
+      return;
+    }
+  else
+    {
+      const std::string tag{std::string{label}};
+
+      for (const auto &[key, value]: state_gpu)
+	{
+	  if (value.count(tag))
+	    {
+	      std::cout << "\n\t GPU [" << key << "] kernel:" << tag << ":" << std::endl;
+	      std::cout << "\t\t" << value.at(tag).seconds << " [s]" << std::endl;
+	      std::cout << "\t\t" << value.at(tag).joules  << " [J]" << std::endl;
+	      std::cout << "\t\t" << value.at(tag).watts / value.at(tag).count  << " [W]" << "\n" << std::endl;
+	    }
+	}
+    }
+  
+  return;
+}
+
+#endif // defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+
 #endif // _ENERGY_PMT_
diff --git a/cuda-omp/omp/energy/energy/energy_pmt_methods.h b/cuda-omp/omp/energy/energy/energy_pmt_methods.h
index a0479c644f11e93a4cb9d49a3826577d33bf1698..6b49d915972463850e076ab3aa063a7caa11174e 100644
--- a/cuda-omp/omp/energy/energy/energy_pmt_methods.h
+++ b/cuda-omp/omp/energy/energy/energy_pmt_methods.h
@@ -1,7 +1,7 @@
 #pragma once
 
 #if defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
-   void Create_PMT(const unsigned int numGPUs);
+   void Create_PMT(int *devID, const unsigned int numGPUs);
 #endif // defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
 
 #if defined(_ENERGY_RAPL_)
@@ -11,8 +11,8 @@
 #endif // _ENERGY_RAPL_
 
 #if defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
-   void Start_PMT_GPU(const char *string);
-   void Stop_PMT_GPU(const char *string);
+   void Start_PMT_GPU(const char *string, const int);
+   void Stop_PMT_GPU(const char *string, const int);
    void Show_PMT_GPU(const char *string);
 #endif // defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
 
diff --git a/cuda-omp/omp/energy/multiple_devices.c b/cuda-omp/omp/energy/multiple_devices.c
index c8212b8ac0afbd84837a7c0d11db061ed3c5a9ee..f99dffd73146ba9dbb61e20b1b2c2b23e1181c5e 100644
--- a/cuda-omp/omp/energy/multiple_devices.c
+++ b/cuda-omp/omp/energy/multiple_devices.c
@@ -16,11 +16,12 @@
 //   $ ./multiple_devices_omp
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
-
+#include <unistd.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <omp.h>
 #include <assert.h>
+#include "energy/energy_pmt.h"
 
 typedef int MyData;
 #define N_PER_DEV   1000000
@@ -39,6 +40,23 @@ typedef int MyData;
 
 #define NDEBUG
 
+void check(const MyData *const restrict A,
+	   const MyData *const restrict B,
+	   const int                    size)
+{
+  int flag = 0;
+
+  for (int i=0 ; i<size ; i++)
+    flag = ((A[i] != B[i]) ? 1 : flag);
+
+  if (flag)
+    printf("\n\t Result wrong \n");
+  else
+    printf("\n\t Result OK \n");
+
+  return;
+}
+
 void VectorAdd(const MyData *const restrict A,
 	       const MyData *const restrict B,
 	             MyData *const restrict C,
@@ -84,6 +102,12 @@ int main()
   // get the number of the available devices
   const int NumDev = omp_get_num_devices();
 
+  int devID[NumDev];
+  for (int i=0 ; i<NumDev ; i++)
+    devID[i] = i;
+
+  PMT_CREATE(devID, NumDev);
+  
   const int nblocks = ((N_PER_DEV + BLOCKSIZE - 1) / BLOCKSIZE);
   
   // global vector size
@@ -97,6 +121,7 @@ int main()
   MyData *const restrict C_CPU = B + size;
   MyData *const restrict C_GPU = C_CPU + size;
 
+  PMT_CPU_START("HostVecAdd");
   #pragma omp parallel for simd
   for (int i=0 ; i<size ; i++)
     {
@@ -104,6 +129,7 @@ int main()
       B[i] = rand() % N_PER_DEV;
       C_CPU[i] = A[i] + B[i];
     }
+  PMT_CPU_STOP("HostVecAdd");
 
   // each device is managed by a single OMP thread
   #pragma omp parallel num_threads(NumDev)
@@ -133,7 +159,12 @@ int main()
   for (int dev=0 ; dev<NumDev ; dev++)
     {
       const int offset = (dev * N_PER_DEV);
+
+      PMT_GPU_START("GpuVecAdd", dev);
+
       VectorAdd(A, B, C_GPU, offset, N_PER_DEV, dev, nblocks, TRUE);
+
+      PMT_GPU_STOP("GpuVecAdd", dev);
     }
 
   // host-devices synchronization
@@ -141,6 +172,9 @@ int main()
   check(C_CPU, C_GPU, size);
   
   free(buffer);
+
+  PMT_CPU_SHOW("HostVecAdd");
+  PMT_GPU_SHOW("GpuVecAdd");
   
   return 0;
 }
diff --git a/cuda-omp/omp/miscellaneous/multiple_devices.c b/cuda-omp/omp/miscellaneous/multiple_devices.c
index c8212b8ac0afbd84837a7c0d11db061ed3c5a9ee..9f7121b15671d5db2c40a245f2a29528895802e3 100644
--- a/cuda-omp/omp/miscellaneous/multiple_devices.c
+++ b/cuda-omp/omp/miscellaneous/multiple_devices.c
@@ -126,7 +126,7 @@ int main()
     VectorAdd(A, B, C_GPU, offset, N_PER_DEV, tid, nblocks, FALSE);
   } // omp parallel
 
-  check(C_CPU, C_GPU, size);
+  //  check(C_CPU, C_GPU, size);
   memset(C_GPU, 0, (size * sizeof(MyData)));
   
   // one OMP thread manages asynchronously all the devices
@@ -138,7 +138,7 @@ int main()
 
   // host-devices synchronization
   #pragma omp taskwait
-  check(C_CPU, C_GPU, size);
+  // check(C_CPU, C_GPU, size);
   
   free(buffer);