diff --git a/cuda-omp/cuda/energy/Makefile b/cuda-omp/cuda/energy/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..1c5cde978ab092b36b2b2a50ee4e5c732a655822 --- /dev/null +++ b/cuda-omp/cuda/energy/Makefile @@ -0,0 +1,30 @@ +ENERGY ?= YES + +ifeq ($(ENERGY), YES) +OPTIONS += -D_ENERGY_PMT_ + +# enable RAPL +OPTIONS += -D_ENERGY_RAPL_ +# enable NVIDIA +OPTIONS += -D_ENERGY_NVIDIA_ +# enable AMD +OPTIONS += # -D_ENERGY_AMD_ +endif + +NVCPP = nvc++ +OPT = -O3 +INC = /home/darkenergy/software/pmt/include +LIB = /home/darkenergy/software/pmt/lib -lpmt -lm + +all: mat_mult +.PHONY: clean test + +mat_mult: mat_mult_block.cu energy/energy_pmt_methods.cpp energy/energy_pmt.h energy/energy_pmt_methods.h Makefile + $(NVCPP) $(OPT) $(OPTIONS) -I$(INC) mat_mult_block.cu energy/energy_pmt_methods.cpp -o $@ -L$(LIB) + ldd ./mat_mult + +test: mat_mult + ./mat_mult + +clean: + rm -rf *.o mat_mult *~ energy/*~ energy/*.o diff --git a/cuda-omp/cuda/energy/energy/energy_pmt.h b/cuda-omp/cuda/energy/energy/energy_pmt.h new file mode 100644 index 0000000000000000000000000000000000000000..2204ca2d11eb194ec691526710689eeff221b217 --- /dev/null +++ b/cuda-omp/cuda/energy/energy/energy_pmt.h @@ -0,0 +1,30 @@ +#pragma once + +#include "energy_pmt_methods.h" + +#if defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) + #define PMT_CREATE(devID, numGPUs) Create_PMT((devID), (numGPUs)) +#else + #define PMT_CREATE(numGPUs) +#endif // defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) + +#if defined(_ENERGY_RAPL_) + #define PMT_CPU_START(string) Start_PMT_CPU((string)) + #define PMT_CPU_STOP(string) Stop_PMT_CPU((string)) + #define PMT_CPU_SHOW(string) Show_PMT_CPU((string)) +#else + #define PMT_CPU_START(string) + #define PMT_CPU_STOP(string) + #define PMT_CPU_SHOW(string) +#endif // _ENERGY_RAPL_ + +#if defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) + #define PMT_GPU_START(string, devID) Start_PMT_GPU((string), (devID)) + #define PMT_GPU_STOP(string, devID) Stop_PMT_GPU((string), (devID)) + #define PMT_GPU_SHOW(string) Show_PMT_GPU((string)) +#else + #define PMT_GPU_START(string, dev) + #define PMT_GPU_STOP(string, dev) + #define PMT_GPU_SHOW(string) +#endif // defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) + diff --git a/cuda-omp/cuda/energy/energy/energy_pmt_methods.cpp b/cuda-omp/cuda/energy/energy/energy_pmt_methods.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f41101d24385d703c45c5948e6cdb67ced827471 --- /dev/null +++ b/cuda-omp/cuda/energy/energy/energy_pmt_methods.cpp @@ -0,0 +1,269 @@ +#if defined(_ENERGY_PMT_) + +#include <pmt.h> +#include <assert.h> +#include <iostream> +#include <vector> +#include <string> +#include <map> + +struct EnergyState +{ + pmt::State start; + pmt::State stop; + double joules; + double watts; + double seconds; + unsigned int count; +}; + +static bool PMT_ERROR{false}; + +void PMT_err(void) +{ + std::cout << "\n\t PMT Error \n" << std::endl; + + return; +} + +#if defined(_ENERGY_RAPL_) + static std::unique_ptr<pmt::PMT> sensor_cpu; + static std::map<std::string, EnergyState> state_cpu; +#endif // _ENERGY_RAPL_ + +#if defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) + static std::map<int, std::unique_ptr<pmt::PMT>> sensor_gpu; + static std::map<int, std::map<std::string, EnergyState>> state_gpu; +#endif // defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) + +#if !defined(__NVCC__) && !defined(__NVCOMPILER) + extern "C" + { + #include "energy_pmt_methods.h" + } +#endif // 
!defined(__NVCC__) && !defined(__NVCOMPILER) + +#if defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) +void Create_PMT(int *devID, + const unsigned int numGPUs) +{ +#if defined(_ENERGY_RAPL_) + + sensor_cpu = pmt::Create("rapl"); + +#endif // _ENERGY_RAPL_ + + if ((numGPUs > 0) && (devID != nullptr)) + { + for (unsigned int dev=0 ; dev<numGPUs ; dev++) + { +#if defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) + sensor_gpu.insert({devID[dev], + +#if defined(_ENERGY_NVIDIA_) + pmt::nvml::NVML::Create(dev)}); +#elif defined(_ENERGY_AMD_) + pmt::rocm::AMD::Create(dev)}); +#endif + +#endif // defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) + } // numGPUs + } + + return; +} +#endif // defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) + +#if defined(_ENERGY_RAPL_) +void Start_PMT_CPU(const char *label) +{ + if (PMT_ERROR) + { + PMT_err(); + return; + } + + // check if the label already exists + if (state_cpu.count(std::string{label})) + { + state_cpu[std::string{label}].start = sensor_cpu->Read(); + } + else + { + // insert the key and initialize the counters + state_cpu.insert({std::string{label}, + {sensor_cpu->Read(), + 0, + 0.0, 0.0, 0.0, 0 + }}); + } + + return; +} + +void Stop_PMT_CPU(const char *label) +{ + if (PMT_ERROR) + { + PMT_err(); + return; + } + + // check if the label already exists + // if not error + if (!state_cpu.count(std::string{label})) + { + PMT_ERROR = true; + PMT_err(); + return; + } + else + { + // read the counter + state_cpu[std::string{label}].stop = sensor_cpu->Read(); + + // update quantities + state_cpu[std::string{label}].seconds += + sensor_cpu->seconds(state_cpu[std::string{label}].start, + state_cpu[std::string{label}].stop); + + state_cpu[std::string{label}].joules += + sensor_cpu->joules(state_cpu[std::string{label}].start, + state_cpu[std::string{label}].stop); + + state_cpu[std::string{label}].watts += + sensor_cpu->watts(state_cpu[std::string{label}].start, + state_cpu[std::string{label}].stop); + + state_cpu[std::string{label}].count++; + } + + return; +} + +void Show_PMT_CPU(const char *label) +{ + if (PMT_ERROR || !state_cpu.count(std::string{label})) + { + PMT_err(); + return; + } + else + { + std::cout << "\n\t CPU Kernel:" << std::string{label} << ":" << std::endl; + std::cout << "\t\t" << state_cpu[std::string{label}].seconds << " [S]" << std::endl; + std::cout << "\t\t" << state_cpu[std::string{label}].joules << " [J]" << std::endl; + std::cout << "\t\t" << state_cpu[std::string{label}].watts / state_cpu[std::string{label}].count << " [W]" << "\n" << std::endl; + } + + return; +} +#endif // _ENERGY_RAPL_ + +#if defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) +void Start_PMT_GPU(const char *label, + const int devID) +{ + if (PMT_ERROR || !sensor_gpu.count(devID)) + { + PMT_err(); + return; + } + + // check if the devID already exists + if (!state_gpu.count(devID)) + { + // insert devID + state_gpu.insert({devID, {}}); + } + // check if the label already exists + if (state_gpu[devID].count(std::string{label})) + { + // read the sensor + state_gpu[devID][std::string{label}].start = sensor_gpu[devID]->Read(); + } + else + { + // insert the label and initialize the counters + state_gpu[devID].insert({std::string{label}, + { + sensor_gpu[devID]->Read(), + 0, + 0.0, 0.0, 0.0, 0 + }}); + } + + return; +} + +void Stop_PMT_GPU(const char *label, + const int devID) +{ + // check if the devID already exists + // if not error + if (!state_gpu.count(devID) || PMT_ERROR || !sensor_gpu.count(devID)) + { + 
PMT_ERROR = true; + PMT_err(); + return; + } + else + { + // check if the label already exists + // if not error + if (!state_gpu[devID].count(std::string{label})) + { + PMT_ERROR = true; + PMT_err(); + return; + } + else + { + // read the counter + state_gpu[devID][std::string{label}].stop = sensor_gpu[devID]->Read(); + + // update quantities + state_gpu[devID][std::string{label}].seconds += + sensor_gpu[devID]->seconds(state_gpu[devID][std::string{label}].start, + state_gpu[devID][std::string{label}].stop); + + state_gpu[devID][std::string{label}].joules += + sensor_gpu[devID]->joules(state_gpu[devID][std::string{label}].start, + state_gpu[devID][std::string{label}].stop); + + state_gpu[devID][std::string{label}].watts += + sensor_gpu[devID]->watts(state_gpu[devID][std::string{label}].start, + state_gpu[devID][std::string{label}].stop); + + state_gpu[devID][std::string{label}].count++; + } + } + + return; +} + +void Show_PMT_GPU(const char *label) +{ + if (PMT_ERROR) + { + PMT_err(); + return; + } + else + { + // show quantities for all devices + for (const auto& [key, value]: state_gpu) + { + std::cout << "\n\t GPU [" << key << "] kernel:" << std::string{label} << ":" << std::endl; + std::cout << "\t\t" << value.at(std::string{label}).seconds << " [s]" << std::endl; + std::cout << "\t\t" << value.at(std::string{label}).joules << " [J]" << std::endl; + std::cout << "\t\t" << value.at(std::string{label}).watts / value.at(std::string{label}).count << " [W]" << "\n" << std::endl; + } + } + + return; +} + +#endif // defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) + +#endif // _ENERGY_PMT_ diff --git a/cuda-omp/cuda/energy/energy/energy_pmt_methods.h b/cuda-omp/cuda/energy/energy/energy_pmt_methods.h new file mode 100644 index 0000000000000000000000000000000000000000..6b49d915972463850e076ab3aa063a7caa11174e --- /dev/null +++ b/cuda-omp/cuda/energy/energy/energy_pmt_methods.h @@ -0,0 +1,19 @@ +#pragma once + +#if defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) + void Create_PMT(int *devID, const unsigned int numGPUs); +#endif // defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) + +#if defined(_ENERGY_RAPL_) + void Start_PMT_CPU(const char *string); + void Stop_PMT_CPU(const char *string); + void Show_PMT_CPU(const char *string); +#endif // _ENERGY_RAPL_ + +#if defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) + void Start_PMT_GPU(const char *string, const int); + void Stop_PMT_GPU(const char *string, const int); + void Show_PMT_GPU(const char *string); +#endif // defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) + + diff --git a/cuda-omp/cuda/energy/mat_mult_block.cu b/cuda-omp/cuda/energy/mat_mult_block.cu new file mode 100644 index 0000000000000000000000000000000000000000..5e47928c3a6de1b58567d847537909743bc2bc75 --- /dev/null +++ b/cuda-omp/cuda/energy/mat_mult_block.cu @@ -0,0 +1,326 @@ +//////////////////////////////////////////////////////////////////////////////////////////////// +// - Block matrix multiplication algorithm +// +// const size_t Nblocks = (N / Bsize); +// +// // loop over blocks of matrix C +// for (size_t ib=0 ; ib<Nblocks ; ib++) +// { +// for (size_t jb=0 ; jb<Nblocks ; jb++) +// { +// +// // loop over blocks of rows of A +// for (size_t kb=0 ; kb<Nblocks ; kb++) +// { +// +// // Matrix multiplication within a block +// for (size_t i=(ib * Bsize) ; i<((ib + 1) * Bsize) ; i++) +// for (size_t j=(jb * Bsize) ; j<((jb + 1) * Bsize) ; j++) +// for (size_t k=(kb * Bsize) ; k<((kb + 1) * Bsize) ; k++) +// C[(i * N) + 
j] += A[(i * N) + k] * B[(k * N) + j]; +// +// } // kb +// } // jb +// } // ib +// +// - Exploit GPU shared-memory. +//////////////////////////////////////////////////////////////////////////////////////////////// + +////////////////////////////////////////////////////////////////////////////////////////////////// +// Author: David Goz +// mail : david.goz@inaf.it +// date : 22.08.2024 +// code tested using nvhpc +// +// - Compile the code: +// $ nvc++ mat_mult_block.cu -o mat_mult_block +// - Run the code: +// $ ./mat_mult_block +////////////////////////////////////////////////////////////////////////////////////////////////// + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <time.h> +#include <assert.h> +#include <cuda.h> +#include <string.h> +#include <math.h> +#include <float.h> + +#include "energy/energy_pmt.h" + +#define N 1024 +#define SIZE (N * N) // matrix size +typedef double MyData; // do not change +#define BLOCK 16 + +// sanity check +#if BLOCK * BLOCK > 1024 +#error BLOCKSIZE must be <= 1024 +#endif + +#if BLOCK * BLOCK > SIZE +#error BLOCKSIZE must be <= SIZE +#endif + +#define LOOP 100 +#define NDEBUG + +double wall_time() +{ + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9; + + return ret; +} + +void CPU_mat_mult(const MyData *const __restrict__ A, + const MyData *const __restrict__ B, + MyData *const __restrict__ C, + const size_t size) +{ + for (size_t i=0 ; i<size ; i++) + for (size_t j=0 ; j<size ; j++) + for (size_t k=0 ; k<size ; k++) + C[(i * size) + j] += (A[(i * size) + k] * B[(k * size) + j]); + + return; +} + +void CPU_mat_mult_block(const MyData *const __restrict__ A, + const MyData *const __restrict__ B, + MyData *const __restrict__ C, + const size_t size) +{ + const size_t Nblocks = (size / BLOCK); + + // loop over blocks of matrix C + for (size_t ib=0 ; ib<Nblocks ; ib++) + { + for (size_t jb=0 ; jb<Nblocks ; jb++) + { + // loop over blocks of rows of A + for (size_t kb=0 ; kb<Nblocks ; kb++) + { + // Matrix multiplication within a block + for (size_t i=(ib * BLOCK) ; i<((ib + 1) * BLOCK) ; i++) + for (size_t j=(jb * BLOCK) ; j<((jb + 1) * BLOCK) ; j++) + for (size_t k=(kb * BLOCK) ; k<((kb + 1) * BLOCK) ; k++) + C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j]; + } // kb + } // jb + } // ib + + return; +} + +__global__ void GPU_mat_mult_block(const MyData *const __restrict__ A, + const MyData *const __restrict__ B, + MyData *const __restrict__ C, + const size_t size) +{ + const size_t size2 = (size * size); + + const size_t globalID = threadIdx.x + (blockIdx.x * blockDim.x); + if (globalID >= size2) + return; + + const size_t Nblocks = size / BLOCK; // number of blocks to loop over A and B + const size_t ib = blockIdx.x / Nblocks; // indexes of starting matrix's block + const size_t jb = blockIdx.x % Nblocks; + const size_t i_local = threadIdx.x / BLOCK; // local matrix's indexes mapped to each CUDA thread + const size_t j_local = threadIdx.x % BLOCK; // within its own block + const size_t i_global = i_local + (ib * BLOCK); // global matrix's indexes mapped to each CUDA thread + const size_t j_global = j_local + (jb * BLOCK); // N.B. 
uncoalescent memory accesses to A and B matrices + + C[(i_global * size) + j_global] = (MyData)0; + + // loop over blocks + for (size_t block=0 ; block<Nblocks ; block++) + { + const size_t j_A = (block * BLOCK); + const size_t i_B = (block * BLOCK); + + // perform the matrix multiplication within the block + MyData value = (MyData)0; + for (size_t k=0 ; k<BLOCK ; k++) + value += A[(i_global * size) + k + j_A] * B[((k + i_B) * size) + j_global]; + + C[(i_global * size) + j_global] += value; + } + + return; +} + +__global__ void GPU_mat_mult_block_shared(const MyData *const __restrict__ A, + const MyData *const __restrict__ B, + MyData *const __restrict__ C, + const size_t size) +{ + __shared__ MyData Ablock[BLOCK * BLOCK]; + __shared__ MyData Bblock[BLOCK * BLOCK]; + __shared__ MyData Cblock[BLOCK * BLOCK]; + + const size_t globalID = threadIdx.x + (blockIdx.x * blockDim.x); + const size_t size2 = (size * size); + if (globalID >= size2) + return; + + const size_t Nblocks = size / BLOCK; // number of blocks to loop over + const size_t ib = blockIdx.x / Nblocks; // indexes of starting matrix's block + const size_t jb = blockIdx.x % Nblocks; + const size_t i_local = threadIdx.x / BLOCK; // local matrix's indexes mapped to each CUDA thread + const size_t j_local = threadIdx.x % BLOCK; // within its own block + const size_t i_global = i_local + (ib * BLOCK); // global matrix's indexes mapped to each CUDA thread + const size_t j_global = j_local + (jb * BLOCK); // N.B. uncoalescent memory accesses to A and B matrices + + // Init Cblock + Cblock[(i_local * BLOCK) + j_local] = (MyData)0; + + // loop over blocks + for (size_t block=0 ; block<Nblocks ; block++) + { + const size_t j_A = (block * BLOCK); + const size_t i_B = (block * BLOCK); + + // waits until all threads in the thread block have reached this point and shared memory accesses + // made by these threads prior to __syncthreads() are visible to all threads in the block. + __syncthreads(); + + // copy block of A into shared memory + Ablock[(i_local * BLOCK) + j_local] = A[(i_global * size) + j_local + j_A]; + // copy block of B into shared memory + Bblock[(i_local * BLOCK) + j_local] = B[((i_local + i_B) * size) + j_global]; + + // waits until all threads in the thread block have reached this point and shared memory accesses // made by these threads prior to __syncthreads() are visible to all threads in the block. + __syncthreads(); + + // perform the matrix multiplication within the block + MyData value = (MyData)0; + for (size_t k=0 ; k<BLOCK ; k++) + value += Ablock[(i_local * BLOCK) + k] * Bblock[(k * BLOCK) + j_local]; + + // store the partial result in shared memory + Cblock[(i_local * BLOCK) + j_local] += value; + } + + C[(i_global * size) + j_global] = Cblock[(i_local * BLOCK) + j_local]; + + return; +} + +void check(const MyData *const __restrict__ cpu_matrix, + const MyData *const __restrict__ gpu_matrix) +{ + int flag = 0; + for (size_t i=0 ; i<SIZE ; i++) + flag = ((fabs(cpu_matrix[i] - gpu_matrix[i]) > FLT_EPSILON) ? 
1 : flag); + + if (!flag) + printf("\n\t Result OK"); + else + printf("\n\t Result wrong"); + + return; +} + +int main() +{ + double time; + MyData *buffer_cpu = (MyData *)calloc(4 * SIZE, sizeof(MyData)); + assert(buffer_cpu != NULL); + + // host reference matrix A + MyData *const __restrict__ A_CPU = buffer_cpu; + MyData *const __restrict__ B_CPU = A_CPU + SIZE; + MyData *const __restrict__ C_CPU = B_CPU + SIZE; + MyData *const __restrict__ C_GPU_host = C_CPU + SIZE; + for (size_t i=0 ; i<SIZE ; i++) + { + A_CPU[i] = drand48(); + B_CPU[i] = drand48(); + } + + // Get the number of compute-capable devices + int NumGPUs{-1}; + cudaGetDeviceCount(&NumGPUs); + if (NumGPUs <= 0) + { + printf("\n\t Nvidia-GPU is not available \n\n"); + return EXIT_FAILURE; + } + + int devID = 0; + // Init PMT + PMT_CREATE(&devID, 1); + + ////////////////////////// CPU naive algorithm ////////////////////////////////////////// + PMT_CPU_START("CPU_mat_mult_block"); + CPU_mat_mult_block(A_CPU, B_CPU, C_CPU, N); + PMT_CPU_STOP("CPU_mat_mult_block"); + ///////////////////////////////////////////////////////////////////////////////////////// + + // copy/alloc data to the GPU + MyData *buffer_gpu = NULL; + cudaMalloc((void **)&buffer_gpu, (3 * SIZE * sizeof(MyData))); + MyData *const A_GPU = buffer_gpu; + MyData *const B_GPU = A_GPU + SIZE; + MyData *const C_GPU = B_GPU + SIZE; + cudaMemcpy(A_GPU, A_CPU, (2 * SIZE * sizeof(MyData)), cudaMemcpyHostToDevice); + + const dim3 nblocks = {(SIZE + (BLOCK * BLOCK) -1)/(BLOCK * BLOCK), 1, 1}; + const dim3 block = {(BLOCK * BLOCK), 1, 1}; + + /////////////////////////// GPU naive block algorithm //////////////////////////////////////// + GPU_mat_mult_block<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up + cudaDeviceSynchronize(); + time = 0.0; + PMT_GPU_START("GPU_mat_mult_block", 0); + for (unsigned short int loop=0 ; loop<LOOP ; loop++) + { + const double start = wall_time(); + GPU_mat_mult_block<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); + cudaDeviceSynchronize(); + time += (wall_time() - start); + } + PMT_GPU_STOP("GPU_mat_mult_block", 0); + cudaMemcpy(C_GPU_host, C_GPU, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost); + + check(C_CPU, C_GPU_host); + printf("\n\t GPU naive block time %lg [s]\n", (time / LOOP)); + ///////////////////////////////////////////////////////////////////////////////////// + + /////////////////////////// GPU block shared memory algorithm /////////////////////// + GPU_mat_mult_block_shared<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up + cudaDeviceSynchronize(); + time = 0.0; + PMT_GPU_START("GPU_mat_mult_block_shared", 0); + for (unsigned short int loop=0 ; loop<LOOP ; loop++) + { + const double start = wall_time(); + GPU_mat_mult_block_shared<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); + cudaDeviceSynchronize(); + time += (wall_time() - start); + } + PMT_GPU_STOP("GPU_mat_mult_block_shared", 0); + cudaMemcpy(C_GPU_host, C_GPU, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost); + + check(C_CPU, C_GPU_host); + printf("\n\t GPU block shared memory time %lg [s]\n", (time / LOOP)); + ///////////////////////////////////////////////////////////////////////////////////// + + // free CPU memory + free(buffer_cpu); + // free GPU memory + cudaFree(buffer_gpu); + + printf("\n"); + + PMT_CPU_SHOW("CPU_mat_mult_block"); + PMT_GPU_SHOW("GPU_mat_mult_block"); + PMT_GPU_SHOW("GPU_mat_mult_block_shared"); + + return EXIT_SUCCESS; +} diff --git a/cuda-omp/omp/energy/Makefile b/cuda-omp/omp/energy/Makefile new file mode 100644 index 
0000000000000000000000000000000000000000..56ef87ef1e4d251627b0ac6dad79d680582aaaa3
--- /dev/null
+++ b/cuda-omp/omp/energy/Makefile
@@ -0,0 +1,38 @@
+ENERGY ?= YES
+
+ifeq ($(ENERGY), YES)
+OPTIONS += -D_ENERGY_PMT_
+
+# enable RAPL
+OPTIONS += -D_ENERGY_RAPL_
+# enable NVIDIA
+OPTIONS += -D_ENERGY_NVIDIA_
+# enable AMD
+OPTIONS += # -D_ENERGY_AMD_
+endif
+
+CC = gcc
+CPP = g++
+NVC = nvc++
+OPT = -Wall -Wextra -O3 -fopenmp -march=native
+INC = /home/darkenergy/software/pmt/include
+LIB = /home/darkenergy/software/pmt/lib -lpmt -lm
+
+all: mat_mult
+.PHONY: clean test
+
+mat_mult_block.o: mat_mult_block.cu energy/energy_pmt.h Makefile
+	$(NVC) -O3 $(OPTIONS) -c $< -o $@
+
+energy_pmt_methods.o: energy/energy_pmt_methods.cpp energy/energy_pmt_methods.h energy/energy_pmt.h Makefile
+	$(CPP) $(OPTIONS) -I$(INC) -c $< -o $@
+
+mat_mult: mat_mult_block.o energy_pmt_methods.o
+	$(NVC) $(OPTIONS) $^ -o $@ -L$(LIB)
+	ldd mat_mult
+
+test: mat_mult
+	./mat_mult
+
+clean:
+	rm -rf *.o mat_mult *~ energy/*~ energy/*.o
diff --git a/cuda-omp/omp/energy/energy/energy_pmt.h b/cuda-omp/omp/energy/energy/energy_pmt.h
new file mode 100644
index 0000000000000000000000000000000000000000..b59d1b14c647cd5898050474337a8dd60f9c1bd5
--- /dev/null
+++ b/cuda-omp/omp/energy/energy/energy_pmt.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include "energy_pmt_methods.h"
+
+#if defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+  #define PMT_CREATE(numGPUs) Create_PMT((numGPUs))
+#else
+  #define PMT_CREATE(numGPUs)
+#endif // defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+
+#if defined(_ENERGY_RAPL_)
+  #define PMT_CPU_START(string) Start_PMT_CPU((string))
+  #define PMT_CPU_STOP(string) Stop_PMT_CPU((string))
+  #define PMT_CPU_SHOW(string) Show_PMT_CPU((string))
+#else
+  #define PMT_CPU_START(string)
+  #define PMT_CPU_STOP(string)
+  #define PMT_CPU_SHOW(string)
+#endif // _ENERGY_RAPL_
+
+#if defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+  #define PMT_GPU_START(string) Start_PMT_GPU((string))
+  #define PMT_GPU_STOP(string) Stop_PMT_GPU((string))
+  #define PMT_GPU_SHOW(string) Show_PMT_GPU((string))
+#else
+  #define PMT_GPU_START(string)
+  #define PMT_GPU_STOP(string)
+  #define PMT_GPU_SHOW(string)
+#endif // defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+
diff --git a/cuda-omp/omp/energy/energy/energy_pmt_methods.cpp b/cuda-omp/omp/energy/energy/energy_pmt_methods.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a5863426e8dcf1b4dd21b0edfe6f638f2cb25306
--- /dev/null
+++ b/cuda-omp/omp/energy/energy/energy_pmt_methods.cpp
@@ -0,0 +1,157 @@
+#if defined(_ENERGY_PMT_)
+
+#include <pmt.h>
+#include <assert.h>
+#include <iostream>
+#include <vector>
+#include <string>
+#include <map>
+
+struct EnergyState
+{
+  pmt::State start;
+  pmt::State stop;
+  double joules;
+  double watts;
+  double seconds;
+  unsigned int count;
+};
+
+static bool PMT_ERROR{false};
+
+void PMT_err(void)
+{
+  std::cout << "\n\t PMT Error \n" << std::endl;
+
+  return;
+}
+
+#if defined(_ENERGY_RAPL_)
+  static std::unique_ptr<pmt::PMT> sensor_cpu;
+  static std::map<std::string, EnergyState> state_cpu;
+#endif // _ENERGY_RAPL_
+
+#if defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+  static std::vector<std::unique_ptr<pmt::PMT>> sensor_gpu;
+  static std::vector<std::map<std::string, EnergyState>> state_gpu;
+#endif // defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+
+extern "C"
+{
+#include "energy_pmt_methods.h"
+}
+
+#if defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) ||
defined(_ENERGY_AMD_) +void Create_PMT(const unsigned int numGPUs) +{ +#if defined(_ENERGY_RAPL_) + + sensor_cpu = pmt::Create("rapl"); + +#endif // _ENERGY_RAPL_ + + if (numGPUs > 0) + { + for (unsigned int dev=0 ; dev<numGPUs ; dev++) + { +#if defined(_ENERGY_NVIDIA_) + sensor_gpu.push_back(pmt::nvml::NVML::Create(dev)); +#endif // _ENERGY_NVIDIA_ + +#if defined(_ENERGY_AMD_) + sensor_gpu.push_back(pmt::rocm::AMD::Create(dev)); +#endif // _ENERGY_AMD_ + } // numGPUs + } + + return; +} +#endif // defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) + +#if defined(_ENERGY_RAPL_) +void Start_PMT_CPU(const char *label) +{ + if (PMT_ERROR) + { + PMT_err(); + return; + } + + // check if the label already exists + if (state_cpu.count(std::string{label})) + { + state_cpu[std::string{label}].start = sensor_cpu->Read(); + } + else + { + // insert the key and initialize the counters + state_cpu.insert({std::string{label}, + {sensor_cpu->Read(), + 0, + 0.0, 0.0, 0.0, 0 + }}); + } + + return; +} + +void Stop_PMT_CPU(const char *label) +{ + if (PMT_ERROR) + { + PMT_err(); + return; + } + + // check if the label already exists + // if not error + if (!state_cpu.count(std::string{label})) + { + PMT_ERROR = true; + PMT_err(); + return; + } + else + { + // read the counter + state_cpu[std::string{label}].stop = sensor_cpu->Read(); + + // update quantities + state_cpu[std::string{label}].seconds += + sensor_cpu->seconds(state_cpu[std::string{label}].start, + state_cpu[std::string{label}].stop); + + state_cpu[std::string{label}].joules += + sensor_cpu->joules(state_cpu[std::string{label}].start, + state_cpu[std::string{label}].stop); + + state_cpu[std::string{label}].watts += + sensor_cpu->watts(state_cpu[std::string{label}].start, + state_cpu[std::string{label}].stop); + + state_cpu[std::string{label}].count++; + } + + return; +} + +void Show_PMT_CPU(const char *label) +{ + if (PMT_ERROR || !state_cpu.count(std::string{label})) + { + PMT_err(); + return; + } + else + { + std::cout << "\n\t Kernel:" << std::string{label} << ":" << std::endl; + std::cout << "\t\t" << state_cpu[std::string{label}].seconds << " [S]" << std::endl; + std::cout << "\t\t" << state_cpu[std::string{label}].joules << " [J]" << std::endl; + std::cout << "\t\t" << state_cpu[std::string{label}].watts / state_cpu[std::string{label}].count << " [W]" << "\n" << std::endl; + } + + return; +} +#endif // _ENERGY_RAPL_ + +#endif // _ENERGY_PMT_ diff --git a/cuda-omp/omp/energy/energy/energy_pmt_methods.h b/cuda-omp/omp/energy/energy/energy_pmt_methods.h new file mode 100644 index 0000000000000000000000000000000000000000..a0479c644f11e93a4cb9d49a3826577d33bf1698 --- /dev/null +++ b/cuda-omp/omp/energy/energy/energy_pmt_methods.h @@ -0,0 +1,19 @@ +#pragma once + +#if defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) + void Create_PMT(const unsigned int numGPUs); +#endif // defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) + +#if defined(_ENERGY_RAPL_) + void Start_PMT_CPU(const char *string); + void Stop_PMT_CPU(const char *string); + void Show_PMT_CPU(const char *string); +#endif // _ENERGY_RAPL_ + +#if defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) + void Start_PMT_GPU(const char *string); + void Stop_PMT_GPU(const char *string); + void Show_PMT_GPU(const char *string); +#endif // defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) + + diff --git a/cuda-omp/omp/energy/multiple_devices.c b/cuda-omp/omp/energy/multiple_devices.c new file mode 100644 index 
0000000000000000000000000000000000000000..c8212b8ac0afbd84837a7c0d11db061ed3c5a9ee --- /dev/null +++ b/cuda-omp/omp/energy/multiple_devices.c @@ -0,0 +1,146 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Offload to multiple devices within the same node: +// - using one OMP host-thread per device synchronously; +// - using one OMP host-thread asynchronously. +// +// Author: David Goz +// mail : david.goz@inaf.it +// date : 27.08.2024 +// code tested using nvhpc +// +// - Compile the code: +// $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v multiple_devices.c -o multiple_devices_omp +// - Run the code: +// $ export OMP_TARGET_OFFLOAD=mandatory +// $ ./multiple_devices_omp +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +#include <stdio.h> +#include <stdlib.h> +#include <omp.h> +#include <assert.h> + +typedef int MyData; +#define N_PER_DEV 1000000 +#define BLOCKSIZE 256 + +#if (BLOCKSIZE < 32) || (BLOCKSIZE > 1024) +#error "32 <= BLOCKSIZE <= 1024" +#endif + +#if (N_PER_DEV < BLOCKSIZE) +#error "N_PER_DEV < BLOCKSIZE" +#endif + +#define TRUE 1 +#define FALSE 0 + +#define NDEBUG + +void VectorAdd(const MyData *const restrict A, + const MyData *const restrict B, + MyData *const restrict C, + const int offset, + const int size, + const int dev, + const int nblocks, + const int async) +{ +#pragma omp target \ + teams num_teams(nblocks) thread_limit(BLOCKSIZE) \ + map(to: A[offset:size], B[offset:size]) map(from: C[offset:size]) \ + device(dev) + { + const int team = omp_get_team_num(); + const int team_start_index = (team * BLOCKSIZE) + offset; + const int team_end_index = team_start_index + BLOCKSIZE; + +#pragma omp parallel num_threads(BLOCKSIZE) + { + const int localID = omp_get_thread_num(); + const int block = omp_get_num_threads(); + + int globalID = team_start_index + localID; + + for (int index=globalID ; index<team_end_index ; index+=block) + C[index] = A[index] + B[index]; + +#if !defined(NDEBUG) + + if ((localID == 0) && (team == 0)) + printf("\n\t Device: %d - Teams: %d [requested: %d]- Thread per team: %d [requested: %d]", + dev, omp_get_num_teams(), nblocks, block, BLOCKSIZE); +#endif + } // omp parallel + } // omp target + + return; +} + +int main() +{ + // get the number of the available devices + const int NumDev = omp_get_num_devices(); + + const int nblocks = ((N_PER_DEV + BLOCKSIZE - 1) / BLOCKSIZE); + + // global vector size + const int size = (NumDev * N_PER_DEV); + assert(size > 0); + + MyData *buffer = (MyData *)malloc(4 * size * sizeof(MyData)); + assert(buffer != NULL); + MyData *const restrict A = buffer; + MyData *const restrict B = A + size; + MyData *const restrict C_CPU = B + size; + MyData *const restrict C_GPU = C_CPU + size; + + #pragma omp parallel for simd + for (int i=0 ; i<size ; i++) + { + A[i] = rand() % N_PER_DEV; + B[i] = rand() % N_PER_DEV; + C_CPU[i] = A[i] + B[i]; + } + + // each device is managed by a single OMP thread + #pragma omp parallel num_threads(NumDev) + { + // check + #pragma omp single + { + if (NumDev != omp_get_num_threads()) + exit(EXIT_FAILURE); + else + { + printf("\n\t Using %d GPUs \n", NumDev); + fflush(stdout); + } + } // implicit barrier + + const int tid = omp_get_thread_num(); + const int offset = (tid * N_PER_DEV); + + VectorAdd(A, B, C_GPU, offset, N_PER_DEV, tid, nblocks, FALSE); + } // omp parallel + + check(C_CPU, C_GPU, size); + memset(C_GPU, 0, (size * sizeof(MyData))); + + // 
one OMP thread manages asynchronously all the devices + for (int dev=0 ; dev<NumDev ; dev++) + { + const int offset = (dev * N_PER_DEV); + VectorAdd(A, B, C_GPU, offset, N_PER_DEV, dev, nblocks, TRUE); + } + + // host-devices synchronization + #pragma omp taskwait + check(C_CPU, C_GPU, size); + + free(buffer); + + return 0; +}
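
Note on multiple_devices.c: the program calls check(C_CPU, C_GPU, size) and memset(), but the patch neither defines check() nor includes <string.h>. Below is a minimal sketch of the missing helper, modeled on the element-wise check() used in mat_mult_block.cu; the function name, argument list, and repeated typedef are assumptions made only to keep the fragment self-contained, and <string.h> would also need to be added to the file for memset().

#include <stdio.h>

typedef int MyData;  /* must match the typedef in multiple_devices.c */

/* Hypothetical helper, not part of the patch: element-wise comparison of the
   host and device results. */
void check(const MyData *const restrict cpu,
           const MyData *const restrict gpu,
           const int size)
{
  int flag = 0;

  for (int i=0 ; i<size ; i++)
    flag |= (cpu[i] != gpu[i]);

  if (!flag)
    printf("\n\t Result OK\n");
  else
    printf("\n\t Result wrong\n");

  return;
}

Note on the OpenMP energy backend: cuda-omp/omp/energy/energy/energy_pmt.h and energy_pmt_methods.h reference Start_PMT_GPU / Stop_PMT_GPU / Show_PMT_GPU, but the energy_pmt_methods.cpp added by this patch only defines the RAPL (CPU) functions, so a build with -D_ENERGY_NVIDIA_ enabled would fail to link. The following is a minimal sketch of what the missing definitions might look like, mirroring the CUDA-side implementation earlier in this patch and the vector-based sensor_gpu / state_gpu storage already declared in that file; it assumes pmt::State is default-constructible and uses only the Read()/seconds()/joules()/watts() calls already used elsewhere in the patch.

// Hypothetical sketch, not part of the patch; it would sit before the closing
// #endif // _ENERGY_PMT_ in cuda-omp/omp/energy/energy/energy_pmt_methods.cpp
// and relies on the file-scope sensor_gpu, state_gpu, EnergyState, PMT_ERROR
// and PMT_err() declared there.
#if defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
void Start_PMT_GPU(const char *label)
{
  if (PMT_ERROR)
    {
      PMT_err();
      return;
    }

  // one label -> state map per created GPU sensor
  if (state_gpu.size() < sensor_gpu.size())
    state_gpu.resize(sensor_gpu.size());

  for (std::size_t dev=0 ; dev<sensor_gpu.size() ; dev++)
    {
      // check if the label already exists
      if (state_gpu[dev].count(std::string{label}))
        state_gpu[dev][std::string{label}].start = sensor_gpu[dev]->Read();
      else
        state_gpu[dev].insert({std::string{label},
                               {sensor_gpu[dev]->Read(), {}, 0.0, 0.0, 0.0, 0}});
    }

  return;
}

void Stop_PMT_GPU(const char *label)
{
  if (PMT_ERROR || (state_gpu.size() != sensor_gpu.size()))
    {
      PMT_ERROR = true;
      PMT_err();
      return;
    }

  for (std::size_t dev=0 ; dev<sensor_gpu.size() ; dev++)
    {
      // the label must have been started on this device
      if (!state_gpu[dev].count(std::string{label}))
        {
          PMT_ERROR = true;
          PMT_err();
          return;
        }

      EnergyState &s = state_gpu[dev][std::string{label}];

      s.stop     = sensor_gpu[dev]->Read();
      s.seconds += sensor_gpu[dev]->seconds(s.start, s.stop);
      s.joules  += sensor_gpu[dev]->joules(s.start, s.stop);
      s.watts   += sensor_gpu[dev]->watts(s.start, s.stop);
      s.count++;
    }

  return;
}

void Show_PMT_GPU(const char *label)
{
  if (PMT_ERROR)
    {
      PMT_err();
      return;
    }

  // show quantities for all devices
  for (std::size_t dev=0 ; dev<state_gpu.size() ; dev++)
    {
      const EnergyState &s = state_gpu[dev].at(std::string{label});

      std::cout << "\n\t GPU [" << dev << "] kernel:" << std::string{label} << ":" << std::endl;
      std::cout << "\t\t" << s.seconds << " [s]" << std::endl;
      std::cout << "\t\t" << s.joules << " [J]" << std::endl;
      std::cout << "\t\t" << s.watts / s.count << " [W]" << "\n" << std::endl;
    }

  return;
}
#endif // defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)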