diff --git a/cuda-omp/cuda/energy/Makefile b/cuda-omp/cuda/energy/Makefile index 1c5cde978ab092b36b2b2a50ee4e5c732a655822..f9e41f3817ea0954eee2c67afdb0f2ec98c88154 100644 --- a/cuda-omp/cuda/energy/Makefile +++ b/cuda-omp/cuda/energy/Makefile @@ -9,22 +9,36 @@ OPTIONS += -D_ENERGY_RAPL_ OPTIONS += -D_ENERGY_NVIDIA_ # enable AMD OPTIONS += # -D_ENERGY_AMD_ + +PMT = /leonardo/home/userexternal/dgoz0000/lib/pmt/local +# PMT = /home/darkenergy/software/pmt +INC = -I$(PMT)/include +LIB = -L$(PMT)/lib64 -lpmt -lm + endif -NVCPP = nvc++ +TOOLCHAIN ?= + +NVCPP = nvc++ -std=c++17 # --gcc-toolchain=$(TOOLCHAIN) OPT = -O3 -INC = /home/darkenergy/software/pmt/include -LIB = /home/darkenergy/software/pmt/lib -lpmt -lm +DEBUG = -O0 -g all: mat_mult -.PHONY: clean test +.PHONY: clean test debug mat_mult: mat_mult_block.cu energy/energy_pmt_methods.cpp energy/energy_pmt.h energy/energy_pmt_methods.h Makefile - $(NVCPP) $(OPT) $(OPTIONS) -I$(INC) mat_mult_block.cu energy/energy_pmt_methods.cpp -o $@ -L$(LIB) + $(NVCPP) $(OPT) $(OPTIONS) $(INC) mat_mult_block.cu energy/energy_pmt_methods.cpp -o $@ $(LIB) ldd ./mat_mult +mat_mult_debug: mat_mult_block.cu energy/energy_pmt_methods.cpp energy/energy_pmt.h energy/energy_pmt_methods.h Makefile + $(NVCPP) $(DEBUG) $(OPTIONS) $(INC) mat_mult_block.cu energy/energy_pmt_methods.cpp -o $@ $(LIB) + ldd ./mat_mult_debug + test: mat_mult ./mat_mult +debug: mat_mult_debug + cuda-gdb ./$< + clean: - rm -rf *.o mat_mult *~ energy/*~ energy/*.o + rm -rf *.o mat_mult mat_mult_debug *~ energy/*~ energy/*.o diff --git a/cuda-omp/cuda/energy/energy/energy_pmt.h b/cuda-omp/cuda/energy/energy/energy_pmt.h index 2204ca2d11eb194ec691526710689eeff221b217..8a477305f904485189f589f0c07fc12575478a5d 100644 --- a/cuda-omp/cuda/energy/energy/energy_pmt.h +++ b/cuda-omp/cuda/energy/energy/energy_pmt.h @@ -5,7 +5,7 @@ #if defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) #define PMT_CREATE(devID, numGPUs) Create_PMT((devID), (numGPUs)) #else - #define PMT_CREATE(numGPUs) + #define PMT_CREATE(devID, numGPUs) #endif // defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) #if defined(_ENERGY_RAPL_) @@ -23,8 +23,8 @@ #define PMT_GPU_STOP(string, devID) Stop_PMT_GPU((string), (devID)) #define PMT_GPU_SHOW(string) Show_PMT_GPU((string)) #else - #define PMT_GPU_START(string, dev) - #define PMT_GPU_STOP(string, dev) + #define PMT_GPU_START(string, devID) + #define PMT_GPU_STOP(string, devID) #define PMT_GPU_SHOW(string) #endif // defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) diff --git a/cuda-omp/cuda/energy/energy/energy_pmt_methods.cpp b/cuda-omp/cuda/energy/energy/energy_pmt_methods.cpp index f41101d24385d703c45c5948e6cdb67ced827471..d7a82fa5ec77bac1f2026fb8963480a58e845089 100644 --- a/cuda-omp/cuda/energy/energy/energy_pmt_methods.cpp +++ b/cuda-omp/cuda/energy/energy/energy_pmt_methods.cpp @@ -49,7 +49,7 @@ void Create_PMT(int *devID, { #if defined(_ENERGY_RAPL_) - sensor_cpu = pmt::Create("rapl"); + sensor_cpu = pmt::rapl::Rapl::Create(); #endif // _ENERGY_RAPL_ @@ -61,9 +61,9 @@ void Create_PMT(int *devID, sensor_gpu.insert({devID[dev], #if defined(_ENERGY_NVIDIA_) - pmt::nvml::NVML::Create(dev)}); + pmt::nvml::NVML::Create(devID[dev])}); #elif defined(_ENERGY_AMD_) - pmt::rocm::AMD::Create(dev)}); + pmt::rocm::AMD::Create(devID[dev])}); #endif #endif // defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) @@ -83,19 +83,25 @@ void Start_PMT_CPU(const char *label) return; } + const std::string tag{std::string{label}}; + // check if the label already exists - if (state_cpu.count(std::string{label})) + if (state_cpu.count(tag)) { - state_cpu[std::string{label}].start = sensor_cpu->Read(); + state_cpu[tag].start = sensor_cpu->Read(); } else { + // create new EnergyState + const EnergyState newState{sensor_cpu->Read(), + static_cast<pmt::State>(0), + static_cast<double>(0), + static_cast<double>(0), + static_cast<double>(0), + static_cast<unsigned int>(0)}; + // insert the key and initialize the counters - state_cpu.insert({std::string{label}, - {sensor_cpu->Read(), - 0, - 0.0, 0.0, 0.0, 0 - }}); + state_cpu.insert(std::pair<std::string, EnergyState>(tag, newState)); } return; @@ -108,10 +114,12 @@ void Stop_PMT_CPU(const char *label) PMT_err(); return; } + + const std::string tag{std::string{label}}; // check if the label already exists // if not error - if (!state_cpu.count(std::string{label})) + if (!state_cpu.count(tag)) { PMT_ERROR = true; PMT_err(); @@ -119,23 +127,20 @@ void Stop_PMT_CPU(const char *label) } else { + // get the energy state + EnergyState &State = state_cpu[tag]; + // read the counter - state_cpu[std::string{label}].stop = sensor_cpu->Read(); + State.stop = sensor_cpu->Read(); // update quantities - state_cpu[std::string{label}].seconds += - sensor_cpu->seconds(state_cpu[std::string{label}].start, - state_cpu[std::string{label}].stop); + State.seconds += sensor_cpu->seconds(State.start, State.stop); - state_cpu[std::string{label}].joules += - sensor_cpu->joules(state_cpu[std::string{label}].start, - state_cpu[std::string{label}].stop); + State.joules += sensor_cpu->joules(State.start, State.stop); - state_cpu[std::string{label}].watts += - sensor_cpu->watts(state_cpu[std::string{label}].start, - state_cpu[std::string{label}].stop); + State.watts += sensor_cpu->watts(State.start, State.stop); - state_cpu[std::string{label}].count++; + State.count++; } return; @@ -143,17 +148,19 @@ void Stop_PMT_CPU(const char *label) void Show_PMT_CPU(const char *label) { - if (PMT_ERROR || !state_cpu.count(std::string{label})) + const std::string tag{std::string{label}}; + + if (PMT_ERROR || !state_cpu.count(tag)) { PMT_err(); return; } else { - std::cout << "\n\t CPU Kernel:" << std::string{label} << ":" << std::endl; - std::cout << "\t\t" << state_cpu[std::string{label}].seconds << " [S]" << std::endl; - std::cout << "\t\t" << state_cpu[std::string{label}].joules << " [J]" << std::endl; - std::cout << "\t\t" << state_cpu[std::string{label}].watts / state_cpu[std::string{label}].count << " [W]" << "\n" << std::endl; + std::cout << "\n\t CPU Kernel:" << tag << ":" << std::endl; + std::cout << "\t\t" << state_cpu[tag].seconds << " [S]" << std::endl; + std::cout << "\t\t" << state_cpu[tag].joules << " [J]" << std::endl; + std::cout << "\t\t" << state_cpu[tag].watts / state_cpu[tag].count << " [W]" << "\n" << std::endl; } return; @@ -174,23 +181,28 @@ void Start_PMT_GPU(const char *label, if (!state_gpu.count(devID)) { // insert devID - state_gpu.insert({devID, {}}); + state_gpu.insert(std::pair<int, std::map<std::string, EnergyState>>(devID, {})); } + + const std::string tag{std::string{label}}; + // check if the label already exists - if (state_gpu[devID].count(std::string{label})) + if (state_gpu[devID].count(tag)) { // read the sensor - state_gpu[devID][std::string{label}].start = sensor_gpu[devID]->Read(); + state_gpu[devID][tag].start = sensor_gpu[devID]->Read(); } else { // insert the label and initialize the counters - state_gpu[devID].insert({std::string{label}, - { - sensor_gpu[devID]->Read(), - 0, - 0.0, 0.0, 0.0, 0 - }}); + const EnergyState newState{sensor_gpu[devID]->Read(), + static_cast<pmt::State>(0), + static_cast<double>(0), + static_cast<double>(0), + static_cast<double>(0), + static_cast<unsigned int>(0)}; + + state_gpu[devID].insert(std::pair<std::string, EnergyState>(tag, newState)); } return; @@ -209,9 +221,11 @@ void Stop_PMT_GPU(const char *label, } else { + const std::string tag{std::string{label}}; + // check if the label already exists // if not error - if (!state_gpu[devID].count(std::string{label})) + if (!state_gpu[devID].count(tag)) { PMT_ERROR = true; PMT_err(); @@ -219,23 +233,25 @@ void Stop_PMT_GPU(const char *label, } else { + EnergyState &State = state_gpu[devID][tag]; + // read the counter - state_gpu[devID][std::string{label}].stop = sensor_gpu[devID]->Read(); + State.stop = sensor_gpu[devID]->Read(); // update quantities - state_gpu[devID][std::string{label}].seconds += - sensor_gpu[devID]->seconds(state_gpu[devID][std::string{label}].start, - state_gpu[devID][std::string{label}].stop); + State.seconds += + sensor_gpu[devID]->seconds(State.start, + State.stop); - state_gpu[devID][std::string{label}].joules += - sensor_gpu[devID]->joules(state_gpu[devID][std::string{label}].start, - state_gpu[devID][std::string{label}].stop); + State.joules += + sensor_gpu[devID]->joules(State.start, + State.stop); - state_gpu[devID][std::string{label}].watts += - sensor_gpu[devID]->watts(state_gpu[devID][std::string{label}].start, - state_gpu[devID][std::string{label}].stop); + State.watts += + sensor_gpu[devID]->watts(State.start, + State.stop); - state_gpu[devID][std::string{label}].count++; + State.count++; } } @@ -251,13 +267,17 @@ void Show_PMT_GPU(const char *label) } else { - // show quantities for all devices - for (const auto& [key, value]: state_gpu) + const std::string tag{std::string{label}}; + + for (const auto &[key, value]: state_gpu) { - std::cout << "\n\t GPU [" << key << "] kernel:" << std::string{label} << ":" << std::endl; - std::cout << "\t\t" << value.at(std::string{label}).seconds << " [s]" << std::endl; - std::cout << "\t\t" << value.at(std::string{label}).joules << " [J]" << std::endl; - std::cout << "\t\t" << value.at(std::string{label}).watts / value.at(std::string{label}).count << " [W]" << "\n" << std::endl; + if (value.count(tag)) + { + std::cout << "\n\t GPU [" << key << "] kernel:" << tag << ":" << std::endl; + std::cout << "\t\t" << value.at(tag).seconds << " [s]" << std::endl; + std::cout << "\t\t" << value.at(tag).joules << " [J]" << std::endl; + std::cout << "\t\t" << value.at(tag).watts / value.at(tag).count << " [W]" << "\n" << std::endl; + } } } diff --git a/cuda-omp/cuda/energy/mat_mult_block.cu b/cuda-omp/cuda/energy/mat_mult_block.cu index 5e47928c3a6de1b58567d847537909743bc2bc75..112b91c819d1b386ac1bb8eca6858b2fab31fc57 100644 --- a/cuda-omp/cuda/energy/mat_mult_block.cu +++ b/cuda-omp/cuda/energy/mat_mult_block.cu @@ -39,6 +39,7 @@ ////////////////////////////////////////////////////////////////////////////////////////////////// #include <stdio.h> +#include <string.h> #include <stdlib.h> #include <unistd.h> #include <time.h> @@ -51,20 +52,21 @@ #include "energy/energy_pmt.h" #define N 1024 -#define SIZE (N * N) // matrix size +#define SIZE ((N) * (N)) // matrix size typedef double MyData; // do not change -#define BLOCK 16 +#define BLOCK_ 16 +#define BLOCKSIZE ((BLOCK_) * (BLOCK_)) // sanity check -#if BLOCK * BLOCK > 1024 +#if BLOCKSIZE > 1024 #error BLOCKSIZE must be <= 1024 #endif -#if BLOCK * BLOCK > SIZE +#if BLOCKSIZE > SIZE #error BLOCKSIZE must be <= SIZE #endif -#define LOOP 100 +#define LOOP 3 #define NDEBUG double wall_time() @@ -94,20 +96,20 @@ void CPU_mat_mult_block(const MyData *const __restrict__ A, MyData *const __restrict__ C, const size_t size) { - const size_t Nblocks = (size / BLOCK); + const size_t Nblocks = (size / BLOCK_); // loop over blocks of matrix C for (size_t ib=0 ; ib<Nblocks ; ib++) { for (size_t jb=0 ; jb<Nblocks ; jb++) { - // loop over blocks of rows of A + // loop over blocks of rows of A for (size_t kb=0 ; kb<Nblocks ; kb++) { // Matrix multiplication within a block - for (size_t i=(ib * BLOCK) ; i<((ib + 1) * BLOCK) ; i++) - for (size_t j=(jb * BLOCK) ; j<((jb + 1) * BLOCK) ; j++) - for (size_t k=(kb * BLOCK) ; k<((kb + 1) * BLOCK) ; k++) + for (size_t i=(ib * BLOCK_) ; i<((ib + 1) * BLOCK_) ; i++) + for (size_t j=(jb * BLOCK_) ; j<((jb + 1) * BLOCK_) ; j++) + for (size_t k=(kb * BLOCK_) ; k<((kb + 1) * BLOCK_) ; k++) C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j]; } // kb } // jb @@ -127,25 +129,25 @@ __global__ void GPU_mat_mult_block(const MyData *const __restrict__ A, if (globalID >= size2) return; - const size_t Nblocks = size / BLOCK; // number of blocks to loop over A and B - const size_t ib = blockIdx.x / Nblocks; // indexes of starting matrix's block + const size_t Nblocks = size / BLOCK_; // number of blocks to loop over A and B + const size_t ib = blockIdx.x / Nblocks; // indexes of starting matrix's block const size_t jb = blockIdx.x % Nblocks; - const size_t i_local = threadIdx.x / BLOCK; // local matrix's indexes mapped to each CUDA thread - const size_t j_local = threadIdx.x % BLOCK; // within its own block - const size_t i_global = i_local + (ib * BLOCK); // global matrix's indexes mapped to each CUDA thread - const size_t j_global = j_local + (jb * BLOCK); // N.B. uncoalescent memory accesses to A and B matrices + const size_t i_local = threadIdx.x / BLOCK_; // local matrix's indexes mapped to each CUDA thread + const size_t j_local = threadIdx.x % BLOCK_; // within its own block + const size_t i_global = i_local + (ib * BLOCK_); // global matrix's indexes mapped to each CUDA thread + const size_t j_global = j_local + (jb * BLOCK_); // N.B. uncoalescent memory accesses to A and B matrices C[(i_global * size) + j_global] = (MyData)0; // loop over blocks for (size_t block=0 ; block<Nblocks ; block++) { - const size_t j_A = (block * BLOCK); - const size_t i_B = (block * BLOCK); + const size_t j_A = (block * BLOCK_); + const size_t i_B = (block * BLOCK_); // perform the matrix multiplication within the block MyData value = (MyData)0; - for (size_t k=0 ; k<BLOCK ; k++) + for (size_t k=0 ; k<BLOCK_ ; k++) value += A[(i_global * size) + k + j_A] * B[((k + i_B) * size) + j_global]; C[(i_global * size) + j_global] += value; @@ -159,54 +161,54 @@ __global__ void GPU_mat_mult_block_shared(const MyData *const __restrict__ A, MyData *const __restrict__ C, const size_t size) { - __shared__ MyData Ablock[BLOCK * BLOCK]; - __shared__ MyData Bblock[BLOCK * BLOCK]; - __shared__ MyData Cblock[BLOCK * BLOCK]; + __shared__ MyData Ablock[BLOCKSIZE]; + __shared__ MyData Bblock[BLOCKSIZE]; + __shared__ MyData Cblock[BLOCKSIZE]; const size_t globalID = threadIdx.x + (blockIdx.x * blockDim.x); const size_t size2 = (size * size); if (globalID >= size2) return; - const size_t Nblocks = size / BLOCK; // number of blocks to loop over - const size_t ib = blockIdx.x / Nblocks; // indexes of starting matrix's block + const size_t Nblocks = size / BLOCK_; // number of blocks to loop over + const size_t ib = blockIdx.x / Nblocks; // indexes of starting matrix's block const size_t jb = blockIdx.x % Nblocks; - const size_t i_local = threadIdx.x / BLOCK; // local matrix's indexes mapped to each CUDA thread - const size_t j_local = threadIdx.x % BLOCK; // within its own block - const size_t i_global = i_local + (ib * BLOCK); // global matrix's indexes mapped to each CUDA thread - const size_t j_global = j_local + (jb * BLOCK); // N.B. uncoalescent memory accesses to A and B matrices + const size_t i_local = threadIdx.x / BLOCK_; // local matrix's indexes mapped to each CUDA thread + const size_t j_local = threadIdx.x % BLOCK_; // within its own block + const size_t i_global = i_local + (ib * BLOCK_); // global matrix's indexes mapped to each CUDA thread + const size_t j_global = j_local + (jb * BLOCK_); // N.B. uncoalescent memory accesses to A and B matrices // Init Cblock - Cblock[(i_local * BLOCK) + j_local] = (MyData)0; + Cblock[(i_local * BLOCK_) + j_local] = (MyData)0; // loop over blocks for (size_t block=0 ; block<Nblocks ; block++) { - const size_t j_A = (block * BLOCK); - const size_t i_B = (block * BLOCK); + const size_t j_A = (block * BLOCK_); + const size_t i_B = (block * BLOCK_); // waits until all threads in the thread block have reached this point and shared memory accesses // made by these threads prior to __syncthreads() are visible to all threads in the block. __syncthreads(); // copy block of A into shared memory - Ablock[(i_local * BLOCK) + j_local] = A[(i_global * size) + j_local + j_A]; + Ablock[(i_local * BLOCK_) + j_local] = A[(i_global * size) + j_local + j_A]; // copy block of B into shared memory - Bblock[(i_local * BLOCK) + j_local] = B[((i_local + i_B) * size) + j_global]; + Bblock[(i_local * BLOCK_) + j_local] = B[((i_local + i_B) * size) + j_global]; // waits until all threads in the thread block have reached this point and shared memory accesses // made by these threads prior to __syncthreads() are visible to all threads in the block. __syncthreads(); // perform the matrix multiplication within the block MyData value = (MyData)0; - for (size_t k=0 ; k<BLOCK ; k++) - value += Ablock[(i_local * BLOCK) + k] * Bblock[(k * BLOCK) + j_local]; + for (size_t k=0 ; k<BLOCK_ ; k++) + value += Ablock[(i_local * BLOCK_) + k] * Bblock[(k * BLOCK_) + j_local]; // store the partial result in shared memory - Cblock[(i_local * BLOCK) + j_local] += value; + Cblock[(i_local * BLOCK_) + j_local] += value; } - C[(i_global * size) + j_global] = Cblock[(i_local * BLOCK) + j_local]; + C[(i_global * size) + j_global] = Cblock[(i_local * BLOCK_) + j_local]; return; } @@ -257,9 +259,13 @@ int main() PMT_CREATE(&devID, 1); ////////////////////////// CPU naive algorithm ////////////////////////////////////////// - PMT_CPU_START("CPU_mat_mult_block"); - CPU_mat_mult_block(A_CPU, B_CPU, C_CPU, N); - PMT_CPU_STOP("CPU_mat_mult_block"); + for (unsigned short int loop=0 ; loop<LOOP ; loop++) + { + PMT_CPU_START("CPU_mat_mult_block"); + memset(C_CPU, 0, SIZE * sizeof(MyData)); + CPU_mat_mult_block(A_CPU, B_CPU, C_CPU, N); + PMT_CPU_STOP("CPU_mat_mult_block"); + } ///////////////////////////////////////////////////////////////////////////////////////// // copy/alloc data to the GPU @@ -270,22 +276,22 @@ int main() MyData *const C_GPU = B_GPU + SIZE; cudaMemcpy(A_GPU, A_CPU, (2 * SIZE * sizeof(MyData)), cudaMemcpyHostToDevice); - const dim3 nblocks = {(SIZE + (BLOCK * BLOCK) -1)/(BLOCK * BLOCK), 1, 1}; - const dim3 block = {(BLOCK * BLOCK), 1, 1}; + const unsigned int nblocks = ((SIZE + (BLOCKSIZE) - 1) / BLOCKSIZE); + const unsigned int block = BLOCKSIZE; /////////////////////////// GPU naive block algorithm //////////////////////////////////////// GPU_mat_mult_block<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up cudaDeviceSynchronize(); time = 0.0; - PMT_GPU_START("GPU_mat_mult_block", 0); for (unsigned short int loop=0 ; loop<LOOP ; loop++) { + PMT_GPU_START("GPU_mat_mult_block", 0); const double start = wall_time(); GPU_mat_mult_block<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); cudaDeviceSynchronize(); time += (wall_time() - start); + PMT_GPU_STOP("GPU_mat_mult_block", 0); } - PMT_GPU_STOP("GPU_mat_mult_block", 0); cudaMemcpy(C_GPU_host, C_GPU, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost); check(C_CPU, C_GPU_host); @@ -296,15 +302,15 @@ int main() GPU_mat_mult_block_shared<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up cudaDeviceSynchronize(); time = 0.0; - PMT_GPU_START("GPU_mat_mult_block_shared", 0); for (unsigned short int loop=0 ; loop<LOOP ; loop++) { + PMT_GPU_START("GPU_mat_mult_block_shared", 0); const double start = wall_time(); GPU_mat_mult_block_shared<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); cudaDeviceSynchronize(); time += (wall_time() - start); + PMT_GPU_STOP("GPU_mat_mult_block_shared", 0); } - PMT_GPU_STOP("GPU_mat_mult_block_shared", 0); cudaMemcpy(C_GPU_host, C_GPU, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost); check(C_CPU, C_GPU_host); diff --git a/cuda-omp/cuda/energy/mat_mult_debug b/cuda-omp/cuda/energy/mat_mult_debug new file mode 100755 index 0000000000000000000000000000000000000000..9f6ea1f82cd4464f308f32796057caf125b5db37 Binary files /dev/null and b/cuda-omp/cuda/energy/mat_mult_debug differ diff --git a/cuda-omp/omp/energy/Makefile b/cuda-omp/omp/energy/Makefile index 56ef87ef1e4d251627b0ac6dad79d680582aaaa3..5e592b6bb2750e14624d3542da298ea45a5adb9f 100644 --- a/cuda-omp/omp/energy/Makefile +++ b/cuda-omp/omp/energy/Makefile @@ -9,30 +9,32 @@ OPTIONS += -D_ENERGY_RAPL_ OPTIONS += -D_ENERGY_NVIDIA_ # enable AMD OPTIONS += # -D_ENERGY_AMD_ + +PMT = /leonardo/home/userexternal/dgoz0000/lib/pmt/local +INC = -I$(PMT)/include +LIB = -L$(PMT)/lib64 -lpmt -lm + endif -CC = gcc -CPP = g++ -NVC = nvc++ -OPT = -Wall -Wextra -O3 -fopenmp -march=native -INC = /home/darkenergy/software/pmt/include -LIB = /home/darkenergy/software/pmt/lib -lpmt -lm +NVC = nvc +NVCPP = nvc++ -std=c++17 +CPP = g++ -std=c++17 +OPT = -O3 -mp=gpu,multicore -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v -all: prog +all: multiple_devices .PHONY: clean test -mat_mult_block.o: mat_mult_block.cu energy/energy_pmt.h Makefile - $(NVC) -O3 $(OPTIONS) $< -c $@ +energy_pmt_methods.o: energy/energy_pmt_methods.cpp energy/energy_pmt.h energy/energy_pmt_methods.h Makefile + $(CPP) $(OPTIONS) $(INC) -c $< -o $@ -energy_pmt_methods.o: energy/energy_pmt_methods.cpp energy/energy_pmt_methods.h energy/energy_pmt.h Makefile - $(CPP) $(OPTIONS) -I$(INC) $< -c $@ +multiple_devices.o: multiple_devices.c energy/energy_pmt.h Makefile + $(NVC) $(OPT) $(OPTIONS) -c $< -o $@ -mat_mult: mat_mult_block.o energy_pmt_methods.o - $(NVC) $(OPTIONS) $^ -o $@ -L$(LIB) - ldd mat_mult +multiple_devices: multiple_devices.o energy_pmt_methods.o + $(NVCPP) $(OPT) $(OPTIONS) $^ -o $@ $(LIB) -test: mat_mult - ./mat_mult +test: multiple_devices + ./multiple_devices clean: - rm -rf *.o mat_mult *~ energy/*~ energy/*.o + rm -rf *.o multiple_devices *~ ./energy/*~ diff --git a/cuda-omp/omp/energy/energy/energy_pmt.h b/cuda-omp/omp/energy/energy/energy_pmt.h index b59d1b14c647cd5898050474337a8dd60f9c1bd5..adf05e03ae38782ec2ef5f090473d3340a3002f6 100644 --- a/cuda-omp/omp/energy/energy/energy_pmt.h +++ b/cuda-omp/omp/energy/energy/energy_pmt.h @@ -3,9 +3,9 @@ #include "energy_pmt_methods.h" #if defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) - #define PMT_CREATE(numGPUs) Create_PMT((numGPUs)) + #define PMT_CREATE(devID, numGPUs) Create_PMT((devID), (numGPUs)) #else - #define PMT_CREATE(numGPUs) + #define PMT_CREATE(devID, numGPUs) #endif // defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) #if defined(_ENERGY_RAPL_) @@ -19,12 +19,11 @@ #endif // _ENERGY_RAPL_ #if defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) - #define PMT_GPU_START(string) Start_PMT_GPU((string)) - #define PMT_GPU_STOP(string) Stop_PMT_GPU((string)) - #define PMT_GPU_SHOW(string) Show_PMT_GPU((string)) + #define PMT_GPU_START(string, devID) Start_PMT_GPU((string), (devID)) + #define PMT_GPU_STOP(string, devID) Stop_PMT_GPU((string), (devID)) + #define PMT_GPU_SHOW(string) Show_PMT_GPU((string)) #else - #define PMT_GPU_START(string) - #define PMT_GPU_STOP(string) + #define PMT_GPU_START(string, devID) + #define PMT_GPU_STOP(string, devID) #define PMT_GPU_SHOW(string) #endif // defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) - diff --git a/cuda-omp/omp/energy/energy/energy_pmt_methods.cpp b/cuda-omp/omp/energy/energy/energy_pmt_methods.cpp index a5863426e8dcf1b4dd21b0edfe6f638f2cb25306..d7a82fa5ec77bac1f2026fb8963480a58e845089 100644 --- a/cuda-omp/omp/energy/energy/energy_pmt_methods.cpp +++ b/cuda-omp/omp/energy/energy/energy_pmt_methods.cpp @@ -32,35 +32,41 @@ void PMT_err(void) #endif // _ENERGY_RAPL_ #if defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) - static std::vector<std::unique_ptr<pmt::PMT>> sensor_gpu; - static std::vector<std::map<std::string, EnergyState>> state_gpu; + static std::map<int, std::unique_ptr<pmt::PMT>> sensor_gpu; + static std::map<int, std::map<std::string, EnergyState>> state_gpu; #endif // defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) -extern "C" -{ -#include "energy_pmt_methods.h" -} +#if !defined(__NVCC__) && !defined(__NVCOMPILER) + extern "C" + { + #include "energy_pmt_methods.h" + } +#endif // !defined(__NVCC__) && !defined(__NVCOMPILER) #if defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) -void Create_PMT(const unsigned int numGPUs) +void Create_PMT(int *devID, + const unsigned int numGPUs) { #if defined(_ENERGY_RAPL_) - - sensor_cpu = pmt::Create("rapl"); + + sensor_cpu = pmt::rapl::Rapl::Create(); #endif // _ENERGY_RAPL_ - if (numGPUs > 0) + if ((numGPUs > 0) && (devID != nullptr)) { for (unsigned int dev=0 ; dev<numGPUs ; dev++) { +#if defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) + sensor_gpu.insert({devID[dev], + #if defined(_ENERGY_NVIDIA_) - sensor_gpu.push_back(pmt::nvml::NVML::Create(dev)); -#endif // _ENERGY_NVIDIA_ + pmt::nvml::NVML::Create(devID[dev])}); +#elif defined(_ENERGY_AMD_) + pmt::rocm::AMD::Create(devID[dev])}); +#endif -#if defined(_ENERGY_AMD_) - sensor_gpu.push_back(pmt::rocm::AMD::Create(dev)); -#endif // _ENERGY_AMD_ +#endif // defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) } // numGPUs } @@ -77,19 +83,25 @@ void Start_PMT_CPU(const char *label) return; } + const std::string tag{std::string{label}}; + // check if the label already exists - if (state_cpu.count(std::string{label})) + if (state_cpu.count(tag)) { - state_cpu[std::string{label}].start = sensor_cpu->Read(); + state_cpu[tag].start = sensor_cpu->Read(); } else { + // create new EnergyState + const EnergyState newState{sensor_cpu->Read(), + static_cast<pmt::State>(0), + static_cast<double>(0), + static_cast<double>(0), + static_cast<double>(0), + static_cast<unsigned int>(0)}; + // insert the key and initialize the counters - state_cpu.insert({std::string{label}, - {sensor_cpu->Read(), - 0, - 0.0, 0.0, 0.0, 0 - }}); + state_cpu.insert(std::pair<std::string, EnergyState>(tag, newState)); } return; @@ -102,10 +114,12 @@ void Stop_PMT_CPU(const char *label) PMT_err(); return; } + + const std::string tag{std::string{label}}; // check if the label already exists // if not error - if (!state_cpu.count(std::string{label})) + if (!state_cpu.count(tag)) { PMT_ERROR = true; PMT_err(); @@ -113,23 +127,20 @@ void Stop_PMT_CPU(const char *label) } else { + // get the energy state + EnergyState &State = state_cpu[tag]; + // read the counter - state_cpu[std::string{label}].stop = sensor_cpu->Read(); + State.stop = sensor_cpu->Read(); // update quantities - state_cpu[std::string{label}].seconds += - sensor_cpu->seconds(state_cpu[std::string{label}].start, - state_cpu[std::string{label}].stop); + State.seconds += sensor_cpu->seconds(State.start, State.stop); - state_cpu[std::string{label}].joules += - sensor_cpu->joules(state_cpu[std::string{label}].start, - state_cpu[std::string{label}].stop); + State.joules += sensor_cpu->joules(State.start, State.stop); - state_cpu[std::string{label}].watts += - sensor_cpu->watts(state_cpu[std::string{label}].start, - state_cpu[std::string{label}].stop); + State.watts += sensor_cpu->watts(State.start, State.stop); - state_cpu[std::string{label}].count++; + State.count++; } return; @@ -137,21 +148,142 @@ void Stop_PMT_CPU(const char *label) void Show_PMT_CPU(const char *label) { - if (PMT_ERROR || !state_cpu.count(std::string{label})) + const std::string tag{std::string{label}}; + + if (PMT_ERROR || !state_cpu.count(tag)) { PMT_err(); return; } else { - std::cout << "\n\t Kernel:" << std::string{label} << ":" << std::endl; - std::cout << "\t\t" << state_cpu[std::string{label}].seconds << " [S]" << std::endl; - std::cout << "\t\t" << state_cpu[std::string{label}].joules << " [J]" << std::endl; - std::cout << "\t\t" << state_cpu[std::string{label}].watts / state_cpu[std::string{label}].count << " [W]" << "\n" << std::endl; + std::cout << "\n\t CPU Kernel:" << tag << ":" << std::endl; + std::cout << "\t\t" << state_cpu[tag].seconds << " [S]" << std::endl; + std::cout << "\t\t" << state_cpu[tag].joules << " [J]" << std::endl; + std::cout << "\t\t" << state_cpu[tag].watts / state_cpu[tag].count << " [W]" << "\n" << std::endl; } return; } #endif // _ENERGY_RAPL_ +#if defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) +void Start_PMT_GPU(const char *label, + const int devID) +{ + if (PMT_ERROR || !sensor_gpu.count(devID)) + { + PMT_err(); + return; + } + + // check if the devID already exists + if (!state_gpu.count(devID)) + { + // insert devID + state_gpu.insert(std::pair<int, std::map<std::string, EnergyState>>(devID, {})); + } + + const std::string tag{std::string{label}}; + + // check if the label already exists + if (state_gpu[devID].count(tag)) + { + // read the sensor + state_gpu[devID][tag].start = sensor_gpu[devID]->Read(); + } + else + { + // insert the label and initialize the counters + const EnergyState newState{sensor_gpu[devID]->Read(), + static_cast<pmt::State>(0), + static_cast<double>(0), + static_cast<double>(0), + static_cast<double>(0), + static_cast<unsigned int>(0)}; + + state_gpu[devID].insert(std::pair<std::string, EnergyState>(tag, newState)); + } + + return; +} + +void Stop_PMT_GPU(const char *label, + const int devID) +{ + // check if the devID already exists + // if not error + if (!state_gpu.count(devID) || PMT_ERROR || !sensor_gpu.count(devID)) + { + PMT_ERROR = true; + PMT_err(); + return; + } + else + { + const std::string tag{std::string{label}}; + + // check if the label already exists + // if not error + if (!state_gpu[devID].count(tag)) + { + PMT_ERROR = true; + PMT_err(); + return; + } + else + { + EnergyState &State = state_gpu[devID][tag]; + + // read the counter + State.stop = sensor_gpu[devID]->Read(); + + // update quantities + State.seconds += + sensor_gpu[devID]->seconds(State.start, + State.stop); + + State.joules += + sensor_gpu[devID]->joules(State.start, + State.stop); + + State.watts += + sensor_gpu[devID]->watts(State.start, + State.stop); + + State.count++; + } + } + + return; +} + +void Show_PMT_GPU(const char *label) +{ + if (PMT_ERROR) + { + PMT_err(); + return; + } + else + { + const std::string tag{std::string{label}}; + + for (const auto &[key, value]: state_gpu) + { + if (value.count(tag)) + { + std::cout << "\n\t GPU [" << key << "] kernel:" << tag << ":" << std::endl; + std::cout << "\t\t" << value.at(tag).seconds << " [s]" << std::endl; + std::cout << "\t\t" << value.at(tag).joules << " [J]" << std::endl; + std::cout << "\t\t" << value.at(tag).watts / value.at(tag).count << " [W]" << "\n" << std::endl; + } + } + } + + return; +} + +#endif // defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) + #endif // _ENERGY_PMT_ diff --git a/cuda-omp/omp/energy/energy/energy_pmt_methods.h b/cuda-omp/omp/energy/energy/energy_pmt_methods.h index a0479c644f11e93a4cb9d49a3826577d33bf1698..6b49d915972463850e076ab3aa063a7caa11174e 100644 --- a/cuda-omp/omp/energy/energy/energy_pmt_methods.h +++ b/cuda-omp/omp/energy/energy/energy_pmt_methods.h @@ -1,7 +1,7 @@ #pragma once #if defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) - void Create_PMT(const unsigned int numGPUs); + void Create_PMT(int *devID, const unsigned int numGPUs); #endif // defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) #if defined(_ENERGY_RAPL_) @@ -11,8 +11,8 @@ #endif // _ENERGY_RAPL_ #if defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) - void Start_PMT_GPU(const char *string); - void Stop_PMT_GPU(const char *string); + void Start_PMT_GPU(const char *string, const int); + void Stop_PMT_GPU(const char *string, const int); void Show_PMT_GPU(const char *string); #endif // defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_) diff --git a/cuda-omp/omp/energy/multiple_devices.c b/cuda-omp/omp/energy/multiple_devices.c index c8212b8ac0afbd84837a7c0d11db061ed3c5a9ee..f99dffd73146ba9dbb61e20b1b2c2b23e1181c5e 100644 --- a/cuda-omp/omp/energy/multiple_devices.c +++ b/cuda-omp/omp/energy/multiple_devices.c @@ -16,11 +16,12 @@ // $ ./multiple_devices_omp //////////////////////////////////////////////////////////////////////////////////////////////////// - +#include <unistd.h> #include <stdio.h> #include <stdlib.h> #include <omp.h> #include <assert.h> +#include "energy/energy_pmt.h" typedef int MyData; #define N_PER_DEV 1000000 @@ -39,6 +40,23 @@ typedef int MyData; #define NDEBUG +void check(const MyData *const restrict A, + const MyData *const restrict B, + const int size) +{ + int flag = 0; + + for (int i=0 ; i<size ; i++) + flag = ((A[i] != B[i]) ? 1 : flag); + + if (flag) + printf("\n\t Result wrong \n"); + else + printf("\n\t Result OK \n"); + + return; +} + void VectorAdd(const MyData *const restrict A, const MyData *const restrict B, MyData *const restrict C, @@ -84,6 +102,12 @@ int main() // get the number of the available devices const int NumDev = omp_get_num_devices(); + int devID[NumDev]; + for (int i=0 ; i<NumDev ; i++) + devID[i] = i; + + PMT_CREATE(devID, NumDev); + const int nblocks = ((N_PER_DEV + BLOCKSIZE - 1) / BLOCKSIZE); // global vector size @@ -97,6 +121,7 @@ int main() MyData *const restrict C_CPU = B + size; MyData *const restrict C_GPU = C_CPU + size; + PMT_CPU_START("HostVecAdd"); #pragma omp parallel for simd for (int i=0 ; i<size ; i++) { @@ -104,6 +129,7 @@ int main() B[i] = rand() % N_PER_DEV; C_CPU[i] = A[i] + B[i]; } + PMT_CPU_STOP("HostVecAdd"); // each device is managed by a single OMP thread #pragma omp parallel num_threads(NumDev) @@ -133,7 +159,12 @@ int main() for (int dev=0 ; dev<NumDev ; dev++) { const int offset = (dev * N_PER_DEV); + + PMT_GPU_START("GpuVecAdd", dev); + VectorAdd(A, B, C_GPU, offset, N_PER_DEV, dev, nblocks, TRUE); + + PMT_GPU_STOP("GpuVecAdd", dev); } // host-devices synchronization @@ -141,6 +172,9 @@ int main() check(C_CPU, C_GPU, size); free(buffer); + + PMT_CPU_SHOW("HostVecAdd"); + PMT_GPU_SHOW("GpuVecAdd"); return 0; } diff --git a/cuda-omp/omp/miscellaneous/multiple_devices.c b/cuda-omp/omp/miscellaneous/multiple_devices.c index c8212b8ac0afbd84837a7c0d11db061ed3c5a9ee..9f7121b15671d5db2c40a245f2a29528895802e3 100644 --- a/cuda-omp/omp/miscellaneous/multiple_devices.c +++ b/cuda-omp/omp/miscellaneous/multiple_devices.c @@ -126,7 +126,7 @@ int main() VectorAdd(A, B, C_GPU, offset, N_PER_DEV, tid, nblocks, FALSE); } // omp parallel - check(C_CPU, C_GPU, size); + // check(C_CPU, C_GPU, size); memset(C_GPU, 0, (size * sizeof(MyData))); // one OMP thread manages asynchronously all the devices @@ -138,7 +138,7 @@ int main() // host-devices synchronization #pragma omp taskwait - check(C_CPU, C_GPU, size); + // check(C_CPU, C_GPU, size); free(buffer);