diff --git a/cuda-omp/cuda/1/classwork_1.cu b/cuda-omp/cuda/1/classwork_1.cu index 446b30002a646dcef0603b77b6bb81d2a17ca83c..5fe7a232349caa913fc4b590c025e54778b6af5d 100644 --- a/cuda-omp/cuda/1/classwork_1.cu +++ b/cuda-omp/cuda/1/classwork_1.cu @@ -17,9 +17,10 @@ // Author: David Goz // mail : david.goz@inaf.it // date : 06.07.2024 +// code tested using nvhpc // // - Compile the code: -// $ nvcc classwork_1.cu -o classwork_1_cuda +// $ nvc++ classwork_1.cu -o classwork_1_cuda // - Run the code: // $ ./classwork_1_cuda // - Check the result: @@ -33,10 +34,23 @@ #define N 100 #define NThreads 1024 -__global__ void GPUkernel(const int size) +__global__ void GPUkernelSerial(const int size) { const int myID = threadIdx.x + (blockIdx.x * blockDim.x); - + + // C printf is supported on CUDA + // C++ cout class is not supported in CUDA + for (int i=0 ; i<size ; i++) + printf("Hello from CUDA thread: %d - result %d\n", myID, (i * i)); + + return; +} + +__global__ void GPUkernelParallel(const int size) +{ + const int myID = threadIdx.x + (blockIdx.x * blockDim.x); + + /* guard if the number of available threads is larger than size */ if (myID >= size) return; @@ -49,14 +63,17 @@ __global__ void GPUkernel(const int size) int main() { - printf("\n\t The host issues the kernel on the GPU \n"); - + printf("\n\t The host issues the kernel on the GPU in serial \n"); // kernel lunch - GPUkernel<<<1, NThreads>>>(N); - - printf("\n\t cudaDeviceSynchronize \n"); + GPUkernelSerial<<<1, 1>>>(N); // GPU synchronization cudaDeviceSynchronize(); + printf("\n\t The host issues the kernel on the GPU in parallel \n"); + // kernel lunch + GPUkernelParallel<<<1, NThreads>>>(N); + // GPU synchronization + cudaDeviceSynchronize(); + return 0; } diff --git a/cuda-omp/cuda/1/classwork_2.cu b/cuda-omp/cuda/1/classwork_2.cu index cae80988170dbcdbf3468ad0b93d5777397d8ba2..c086bad2bf994f0b24861156cb28e801a75bcb44 100644 --- a/cuda-omp/cuda/1/classwork_2.cu +++ b/cuda-omp/cuda/1/classwork_2.cu @@ -18,10 +18,11 @@ ////////////////////////////////////////////////////////////////////////////////////////////////// // Author: David Goz // mail : david.goz@inaf.it -// date : 17.11.2022 +// date : 06.07.2024 +// code tested using nvhpc // // - Compile the code: -// $ nvcc classwork_2.cu -o classwork_2 +// $ nvc++ classwork_2.cu -o classwork_2 // - Run the code: // $ ./classwork_2 // - Check the result: @@ -29,9 +30,10 @@ ////////////////////////////////////////////////////////////////////////////////////////////////// -#include <iostream> #include <stdlib.h> +#include <stdio.h> #include <cuda.h> +#include <assert.h> #define N 100 #define NThreads 1024 @@ -49,31 +51,35 @@ __global__ void GPUkernel( int *A, int main() { - // allocate array that allows direct access of both host and device - // CUDA is responsible - int *A; - const size_t size = (N * sizeof(int)); - cudaError error = cudaMallocManaged(&A, size); - if (!error) - std::cout << "Memory allocated for the host/device" << std::endl; - else - { - std::cout << "Cannot allocate memory for the host/device. CUDA error : " << error << " ... 
aborting" << std::endl; - exit(EXIT_FAILURE); - } + /* host array */ + int *h_A = (int *)malloc(N * sizeof(*h_A)); + assert(h_A != NULL); + + /* device array */ + int *d_A = NULL; + cudaMalloc((void **)&d_A, (N * sizeof(*d_A))); + assert(d_A != NULL); // kernel lunch - GPUkernel<<<1, NThreads>>>(A, N); + GPUkernel<<<1, NThreads>>>(d_A, N); // device synchronization cudaDeviceSynchronize(); + + // fetch device memory + cudaMemcpy(h_A, d_A, (sizeof(*h_A) * N), cudaMemcpyDeviceToHost); + + // free GPU memory + cudaFree(d_A); // check the result + printf("\n"); for (size_t i=0 ; i<N ; i++) - std::cout << "A[" << i << "] - Result: " << A[i] << std::endl; + printf("\t A[%d] = %d", i, h_A[i]); + printf("\n\n"); - // free the memory - cudaFree(A); + // free host memory + free(h_A); return 0; } diff --git a/cuda-omp/cuda/2/classwork.cu b/cuda-omp/cuda/2/classwork.cu new file mode 100644 index 0000000000000000000000000000000000000000..352a730f6cc7f206155c206fc8e97faf22bf8105 --- /dev/null +++ b/cuda-omp/cuda/2/classwork.cu @@ -0,0 +1,107 @@ +////////////////////////////////////////////////////////////////////////////////////////////////// +// Assigment : write a CUDA code corresponding to the +// following sequential C code +// +// #include <stdio.h> +// #define ROW 4 +// #define COL 8 +// int main() +// { +// int Matrix[ROW * COL]; +// +// for (int row=0 ; row<ROW ; row++) +// for (int col=0 ; col<COL ; col++) +// Matrix[(row * COL) + col] = ((row * COL) + col); +// +// return 0; +// } +////////////////////////////////////////////////////////////////////////////////////////////////// + +////////////////////////////////////////////////////////////////////////////////////////////////// +// Author: David Goz +// mail : david.goz@inaf.it +// date : 06.07.2024 +// code tested using nvhpc +// +// - Compile the code: +// $ nvc++ classwork.cu -o classwork +// - Run the code: +// $ ./classwork +////////////////////////////////////////////////////////////////////////////////////////////////// + +#include <iostream> +#include <stdlib.h> +#include <cuda.h> +#include <assert.h> + +#define BLOCKSIZE 32 +#define ROW 1089 +#define COL 1111 + +typedef int MyData; + +__global__ void GPUMatrix(MyData *Matrix, + const int size) +{ + /* global thread index */ + const int index = threadIdx.x + (blockIdx.x * blockDim.x); + + if (index < size) + Matrix[index] = index; + + return; +} + +int main() +{ + /* host matrix . 
1D mapping matrix[i][j] ---> matrix[(i * COL) + j] */ + MyData *h_matrix = (MyData *)malloc(ROW * COL * sizeof(*h_matrix)); + assert(h_matrix != NULL); + + /* device matrix */ + MyData *d_matrix = NULL; + cudaMalloc((void **)&d_matrix, ROW * COL * sizeof(*d_matrix)); + assert(d_matrix != NULL); + + /* 1D grid */ + const dim3 grid = {(((ROW * COL) + BLOCKSIZE - 1) / BLOCKSIZE), // number of blocks along X + 1, // number of blocks along Y + 1}; // number of blocks along Z + const dim3 block = {BLOCKSIZE, // number of threads per block along X + 1, // number of threads per block along Y + 1}; // number of threads per block along Z + + // kernel + GPUMatrix<<<grid, block>>>(d_matrix, (ROW * COL)); + + // device synchronization + cudaDeviceSynchronize(); + + /* fetch matrix from the device */ + cudaMemcpy(h_matrix, d_matrix, (ROW * COL * sizeof(*d_matrix)), cudaMemcpyDeviceToHost); + + /* free device memory */ + cudaFree(d_matrix); + + // check the result + int flag = 0; + for (size_t row=0 ; row<ROW ; row++) + { + const size_t i = (row * COL); + + for (size_t col=0 ; col<COL ; col++) + { + flag = ((h_matrix[i + col] != (i + col)) ? -1 : flag); + } /* col loop */ + } /* row loop */ + + // free host memory + free(h_matrix); + + if (flag) + printf("\n\t Result wrong! \n\n"); + else + printf("\n\t Result OK! \n\n"); + + return 0; +} diff --git a/cuda-omp/cuda/3/classwork.cu b/cuda-omp/cuda/3/classwork.cu new file mode 100644 index 0000000000000000000000000000000000000000..4cc27287f31d730cd89e2988b84aa7ad90384d9e --- /dev/null +++ b/cuda-omp/cuda/3/classwork.cu @@ -0,0 +1,145 @@ +////////////////////////////////////////////////////////////////////////////////////////////////// +// Assigment : write a CUDA code that does: +// - each thread generates a pair of points as (x, y) coordinates randomly distributed; +// - each thread computes the euclidean distance d = sqrt((x1 - x2)^2 + (y1 - y2)^2); +// - the kernel prints the maximum distance; +// - use one CUDA block and shared memory within the block +////////////////////////////////////////////////////////////////////////////////////////////////// + +////////////////////////////////////////////////////////////////////////////////////////////////// +// Author: David Goz +// mail : david.goz@inaf.it +// date : 06.07.2024 +// code tested using nvhpc +// +// - Compile the code: +// $ nvc++ classwork.cu -o classwork -lm +// - Run the code: +// $ ./classwork +////////////////////////////////////////////////////////////////////////////////////////////////// + +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <cuda.h> +#include <assert.h> + +#define NUMPOINTS 1024 +#define BLOCKSIZE NUMPOINTS + +#if NUMPOINTS != BLOCKSIZE +#error NUMPOINTS must be equal to BLOCKSIZE +#endif + +#if BLOCKSIZE > 1024 +#error BLOCKSIZE cannot be larger than 1024 +#endif + +#define SQUARE(A) ((A) * (A)) + +typedef double MyData; + +typedef struct PairPoints +{ + MyData x[2]; + MyData y[2]; +} PairPoints; + +__global__ void GPUDistance(const PairPoints *const points, + const int size) +{ + /* global thread's ID */ + const int globalID = (threadIdx.x + (blockIdx.x * blockDim.x)); + /* local (i.e. 
within the block) thread's ID */ + const int localID = threadIdx.x; + + if ((globalID >= size) || (globalID != localID)) + return; + + /* coalescent memory accesses */ + const PairPoints myPoints = points[localID]; + const MyData pair_distance_X2 = SQUARE(myPoints.x[0] - myPoints.x[1]); + const MyData pair_distance_Y2 = SQUARE(myPoints.y[0] - myPoints.y[1]); + const MyData pair_distance = sqrt(pair_distance_X2 + pair_distance_Y2); + + // shared-block memory statically allocated + __shared__ MyData distance[BLOCKSIZE]; + + /* store the distance in shared memory */ + distance[localID] = pair_distance; + // block level synchronization barrier + __syncthreads(); + + /* the master thread within the block gets the max */ + if (localID == 0) + { + MyData max_dis = -1.0; + for (size_t i=0 ; i<size ; i++) + max_dis = ((max_dis < distance[i]) ? distance[i] : max_dis); + + printf("\t GPU maximum distance: %lg\n", max_dis); + } + + return; +} + +void CPUMaxDistance(const PairPoints *const points, + const int size) +{ + MyData distance = -1.0; + for (size_t i=0 ; i<size ; i++) + { + const MyData pair_distance_X2 = SQUARE(points[i].x[0] - points[i].x[1]); + const MyData pair_distance_Y2 = SQUARE(points[i].y[0] - points[i].y[1]); + const MyData pair_distance = sqrt(pair_distance_X2 + pair_distance_Y2); + + distance = ((distance < pair_distance) ? pair_distance : distance); + } + + printf("\n\t CPU maximum distance: %lg\n", distance); + + return; +} + +int main() +{ + /* host allocation */ + PairPoints *h_points = (PairPoints *)malloc(NUMPOINTS * sizeof(*h_points)); + assert(h_points != NULL); + + /* device allocation */ + PairPoints *d_points = NULL; + cudaMalloc((void **)&d_points, (NUMPOINTS * sizeof(*d_points))); + + /* initialization */ + srand48(time(NULL)); + for(size_t i=0 ; i<NUMPOINTS ; i++) + { + h_points[i].x[0] = drand48(); + h_points[i].x[1] = drand48(); + h_points[i].y[0] = drand48(); + h_points[i].y[1] = drand48(); + } + + /* copy data to the device's memory */ + cudaMemcpy(d_points, h_points, (NUMPOINTS * sizeof(*d_points)), cudaMemcpyHostToDevice); + + const dim3 grid = {1, 1, 1}; + const dim3 block = {BLOCKSIZE, 1, 1}; + + // lunch kernel + GPUDistance<<< grid, block >>>(d_points, NUMPOINTS); + + // check the result on the host while the kernel is executing on the GPU + CPUMaxDistance(h_points, NUMPOINTS); + + free(h_points); + + // device synchronization + cudaDeviceSynchronize(); + + // free memory + cudaFree(d_points); + + return 0; +} diff --git a/cuda-omp/cuda/4/classwork.cu b/cuda-omp/cuda/4/classwork.cu new file mode 100644 index 0000000000000000000000000000000000000000..ac70f359fe8f45ac930c8c295e636140dd6aa949 --- /dev/null +++ b/cuda-omp/cuda/4/classwork.cu @@ -0,0 +1,178 @@ +////////////////////////////////////////////////////////////////////////////////////////////////// +// Assigment : write a CUDA code corresponding to the +// following sequential C code +// +// #include <stdio.h> +// int main() +// { +// /* Matrix multiplication: C = A X B */ +// +// int A[N][N], B[N][N], C[N][N]; +// +// for (int ii=0 ; ii<N ; ii++) +// for (int jj=0 ; jj<N ; jj++) +// for (int kk=0 ; kk<N ; kk++) +// C[ii][jj] += A[ii][kk] * B[kk][jj]; +// +// return 0; +// } +////////////////////////////////////////////////////////////////////////////////////////////////// + +////////////////////////////////////////////////////////////////////////////////////////////////// +// Author: David Goz +// mail : david.goz@inaf.it +// date : 08.07.2024 +// code tested using nvhpc +// +// - Compile the code: +// 
$ nvc++ classwork.cu -o classwork +// - Run the code: +// $ ./classwork <N> +////////////////////////////////////////////////////////////////////////////////////////////////// + +#include <stdio.h> +#include <stdlib.h> +#include <cuda.h> +#include <assert.h> + +#define BLOCKSIZE 32 +#define SQUARE(SIZE) ((SIZE) * (SIZE)) + +typedef long int MyData; + +void matrix_init( MyData *const AB, + const MyData N) +{ + MyData *const restrict A = AB; + MyData *const restrict B = AB + SQUARE(N); + + for (MyData ii=0 ; ii<N ; ii++) + for (MyData jj=0 ; jj<N ; jj++) + { + A[(ii * N) + jj] = (ii - jj); + B[(ii * N) + jj] = 1; + } + + return; +} + +__global__ void GPU_MM(const MyData *const __restrict__ A, + const MyData *const __restrict__ B, + MyData *const __restrict__ C, + const MyData N) +{ + const MyData IDx = threadIdx.x + (blockIdx.x * blockDim.x); + const MyData IDy = threadIdx.y + (blockIdx.y * blockDim.y); + + /* check out of boundaries */ + if ((IDx >= N) || (IDy >= N)) + return; + + /* each thread performs the calculation of one element */ + /* of the matrix, i.e. C[IDx][IDy] */ + + MyData sum = 0; + for (MyData kk=0 ; kk<N ; kk++) + sum += (A[(IDx * N) + kk] * B[(kk * N) + IDy]); + + C[(IDx * N) + IDy] = sum; + + return; +} + +void check(const MyData *const __restrict__ GPU_MM, + const MyData *const __restrict__ A, + const MyData N) +{ + int flag = 0; + + for (MyData ii=0 ; ii<N ; ii++) + { + MyData row_a = 0; + for (MyData kk=0 ; kk<N ; kk++) + row_a += A[(ii * N) + kk]; + + for (MyData jj=0 ; jj<N ; jj++) + if (GPU_MM[(ii * N) + jj] != row_a) + { + flag = 1; + break; + } + } + + if (flag) + printf("\n\t Result WRONG \n\n"); + else + printf("\n\t Result OK \n\n"); + + return; +} + +int main(int arg, char *argv[]) +{ + int N; + + if (arg < 2) + { + printf("\n\t Usage: $ ./classwork <number_of_matrix_elements> \n"); + exit(EXIT_FAILURE); + } + else + { + N = atoi(argv[1]); + if (N <= 0) + { + printf("\n\t Number of matrix elements must be greater than zero \n"); + exit(EXIT_FAILURE); + } + else + { + printf("\n\t Matrix size : %d x %d", N, N); + printf("\n\t CUDA block size: %d x %d", BLOCKSIZE, BLOCKSIZE); + } + } + + /* host memory allocation */ + MyData *h_buffer = (MyData *)malloc(3 * SQUARE(N) * sizeof(MyData)); + assert(h_buffer != NULL); + /* set-up host pointers */ + MyData *const restrict h_A = h_buffer; + MyData *const restrict h_B = h_A + SQUARE(N); + MyData *const restrict h_C = h_B + SQUARE(N); + + /* device memory allocation */ + MyData *d_buffer = NULL; + cudaMalloc((void **)&d_buffer, (3 * SQUARE(N) * sizeof(MyData))); + assert(d_buffer != NULL); + /* set-up device pointers */ + MyData *const restrict d_A = d_buffer; + MyData *const restrict d_B = d_A + SQUARE(N); + MyData *const restrict d_C = d_B + SQUARE(N); + + // matrix initialization + matrix_init(h_buffer, N); + + /* copy host data to device */ + cudaMemcpy(d_A, h_A, (2 * SQUARE(N) * sizeof(MyData)), cudaMemcpyHostToDevice); + + // kernel lunch on GPU + const size_t nblocks = ((N + BLOCKSIZE - 1) / BLOCKSIZE); + const dim3 grid = {nblocks, nblocks, 1}; + const dim3 block = {BLOCKSIZE, BLOCKSIZE, 1}; + GPU_MM<<< grid, block >>>(d_A, d_B, d_C, N); + + /* host-device data synchronization */ + /* N.B. 
cudaDeviceSynchronize() is not */ + /* necessary since cudaMemcpy() is blocking */ + cudaMemcpy(h_C, d_C, (SQUARE(N) * sizeof(MyData)), cudaMemcpyDeviceToHost); + + // free the GPU memory + cudaFree(d_buffer); + + // check the result + check(h_C, h_A, N); + + free(h_buffer); + + return 0; +} diff --git a/cuda-omp/cuda/5/aos_soa.cu b/cuda-omp/cuda/5/aos_soa.cu new file mode 100644 index 0000000000000000000000000000000000000000..74c024b74d5101b63547b877f75cc46e380dbaba --- /dev/null +++ b/cuda-omp/cuda/5/aos_soa.cu @@ -0,0 +1,239 @@ +//////////////////////////////////////////////////////////////////////////////////////////////// +// $ The program tests the SoA vs AoS approaches on GPU +// In computing, an array of structures (AoS), structure of arrays (SoA) or +// array of structures of arrays (AoSoA) are contrasting ways to arrange a sequence +// of records in memory, with regard to interleaving, and are of interest in SIMD +// and SIMT programming. +//////////////////////////////////////////////////////////////////////////////////////////////// + +////////////////////////////////////////////////////////////////////////////////////////////////// +// Author: David Goz +// mail : david.goz@inaf.it +// date : 12.07.2024 +// code tested using nvhpc +// +// - Compile the code: +// $ nvc++ aos_soa.cu -o aos_soa +// - Run the code: +// $ ./aos_soa +////////////////////////////////////////////////////////////////////////////////////////////////// + +#include <iostream> +#include <vector> +#include <unistd.h> +#include <time.h> +#include <cuda.h> + +#define Byte_to_MB (1.0 / (1024 * 1024)) +#define LOOP 100 +#define BLOCKSIZE 1024 +#if BLOCKSIZE > 1024 +#error BLOCKSIZE cannot be larger than 1024 +#endif + +// datatypes +struct nodeAoS +{ + double a; + float b; + long int c; + int d; + char e; +}; + +struct nodeSoA +{ + double *a; + float *b; + long int *c; + int *d; + char *e; +}; + +void GPU_alloc_check(const cudaError error) +{ + using namespace std; + + if (error) + { + cout << "\t cudaMalloc fails! ... aborting ..." 
<< endl; + exit(EXIT_FAILURE); + } + + return; +} + +double wall_time() +{ + struct timespec ts; + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); + const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9; + + return ret; +} + +__global__ void gpu_AoS(struct nodeAoS *const allnodeAoS, + const size_t size) +{ + const size_t ID = threadIdx.x + (blockIdx.x * blockDim.x); + + if (ID >= size) + return; + + // warp non-coalescing access ---> stride = sizeof(struct nodeAoS) + allnodeAoS[ID].a = (double)ID; + allnodeAoS[ID].b = (float)ID; + allnodeAoS[ID].c = (long int)ID; + allnodeAoS[ID].d = (int)ID; + allnodeAoS[ID].e = 'a'; + + return; +} + +__global__ void gpu_nodeAoS(struct nodeAoS *const allnodeAoS, + const size_t size) +{ + const size_t ID = threadIdx.x + (blockIdx.x * blockDim.x); + + const struct nodeAoS node = {(double)ID, (float)ID, (long int)ID, (int)ID, 'a'}; + + if (ID >= size) + return; + + // warp coalescing access ---> stride = 0 + allnodeAoS[ID] = node; + + return; +} + +__global__ void gpu_SoA(double *const __restrict__ a, + float *const __restrict__ b, + long int *const __restrict__ c, + int *const __restrict__ d, + char *const __restrict__ e, + const size_t size) +{ + const size_t ID = threadIdx.x + (blockIdx.x * blockDim.x); + + if (ID >= size) + return; + + // warp coalescing access ---> stride = 0 + a[ID] = (double)ID; + b[ID] = (float)ID; + c[ID] = (long int)ID; + d[ID] = (int)ID; + e[ID] = 'a'; + + return; +} + +size_t get_gpu_free_memory() +{ + using namespace std; + + int num_gpus{0}; + // get the number of available gpus on the system + cudaGetDeviceCount(&num_gpus); + + vector<size_t>free_memory(num_gpus); + vector<size_t>total_memory(num_gpus); + for (size_t gpu_id=0 ; gpu_id<num_gpus ; gpu_id++) + { + cudaSetDevice(gpu_id); + cudaMemGetInfo(&free_memory[gpu_id], &total_memory[gpu_id]); + cout << "\n\t GPU " << gpu_id << " - free memory: " << free_memory[gpu_id]*Byte_to_MB << "[MB] - total memory: " << total_memory[gpu_id]*Byte_to_MB << " [MB]" << endl; + } + + // return the free memory of the first GPU + return free_memory[0]; +} + +int main() +{ + using namespace std; + + // free gpu mem in MB + const size_t gpu_free_mem = get_gpu_free_memory(); + // number of elements in the AoS + const size_t N = (gpu_free_mem / 2 / sizeof(nodeAoS)); + + double start_time{0}; + double end_time{0}; + + const dim3 block{BLOCKSIZE, 1, 1}; + const dim3 nblocks{((N + BLOCKSIZE - 1) / BLOCKSIZE), 1, 1}; + + // allocate buffer for the GPU to store AoS and SoA + nodeAoS *gpu_allnodeAoS{nullptr}; + GPU_alloc_check(cudaMalloc((void **)&gpu_allnodeAoS, (N * sizeof(*gpu_allnodeAoS)))); + + nodeSoA gpu_allnodeSoA = {nullptr, nullptr, nullptr, nullptr, nullptr}; + GPU_alloc_check(cudaMalloc((void **)&gpu_allnodeSoA.a, (N * sizeof(*gpu_allnodeSoA.a)))); + GPU_alloc_check(cudaMalloc((void **)&gpu_allnodeSoA.b, (N * sizeof(*gpu_allnodeSoA.b)))); + GPU_alloc_check(cudaMalloc((void **)&gpu_allnodeSoA.c, (N * sizeof(*gpu_allnodeSoA.c)))); + GPU_alloc_check(cudaMalloc((void **)&gpu_allnodeSoA.d, (N * sizeof(*gpu_allnodeSoA.d)))); + GPU_alloc_check(cudaMalloc((void **)&gpu_allnodeSoA.e, (N * sizeof(*gpu_allnodeSoA.e)))); + + // gpu warm-up + gpu_AoS<<< nblocks, block >>>(gpu_allnodeAoS, N); + cudaDeviceSynchronize(); + + ////////////////////////// GPU AoS ///////////////////////////////////////////// + start_time = wall_time(); + for (int loop=0 ; loop<LOOP ; loop++) + gpu_AoS<<< nblocks, block >>>(gpu_allnodeAoS, N); + + cudaDeviceSynchronize(); + end_time = wall_time(); + cout << "\t 
GPU AoS time: " << ((end_time - start_time) / static_cast<double>(LOOP)) << " [s]" << endl; + //////////////////////////////////////////////////////////////////////////////// + + // gpu warm-up + gpu_nodeAoS<<< nblocks, block >>>(gpu_allnodeAoS, N); + cudaDeviceSynchronize(); + + ////////////////////////// GPU nodeAoS ///////////////////////////////////////// + start_time = wall_time(); + for (int loop=0 ; loop<LOOP ; loop++) + gpu_nodeAoS<<< nblocks, block >>>(gpu_allnodeAoS, N); + + cudaDeviceSynchronize(); + end_time = wall_time(); + cout << "\t GPU nodeAoS time: " << ((end_time - start_time) / static_cast<double>(LOOP)) << " [s]" << endl; + //////////////////////////////////////////////////////////////////////////////// + + // gpu warm-up + gpu_SoA<<< nblocks, block >>>(gpu_allnodeSoA.a, + gpu_allnodeSoA.b, + gpu_allnodeSoA.c, + gpu_allnodeSoA.d, + gpu_allnodeSoA.e, + N); + cudaDeviceSynchronize(); + + ////////////////////////// GPU SoA ///////////////////////////////////////////// + start_time = wall_time(); + for (int loop=0 ; loop<LOOP ; loop++) + gpu_SoA<<< nblocks, block >>>(gpu_allnodeSoA.a, + gpu_allnodeSoA.b, + gpu_allnodeSoA.c, + gpu_allnodeSoA.d, + gpu_allnodeSoA.e, + N); + + cudaDeviceSynchronize(); + end_time = wall_time(); + cout << "\t GPU SoA time: " << ((end_time - start_time) / static_cast<double>(LOOP)) << " [s]\n" << endl; + //////////////////////////////////////////////////////////////////////////////// + + // free GPU memory + cudaFree(gpu_allnodeAoS); + cudaFree(gpu_allnodeSoA.a); + cudaFree(gpu_allnodeSoA.b); + cudaFree(gpu_allnodeSoA.c); + cudaFree(gpu_allnodeSoA.d); + cudaFree(gpu_allnodeSoA.e); + + return EXIT_SUCCESS; +} diff --git a/cuda-omp/cuda/5/linked_list.cu b/cuda-omp/cuda/5/linked_list.cu new file mode 100644 index 0000000000000000000000000000000000000000..f7d277d71cace5b02bd01a832e7c6334b9d2e005 --- /dev/null +++ b/cuda-omp/cuda/5/linked_list.cu @@ -0,0 +1,208 @@ +//////////////////////////////////////////////////////////////////////////////////////////////// +// $ The program shows how to port a linked list, created by the CPU, onto the GPU +// +// nvcc compiler might issue the warning: Stack size for entry function … cannot be statically determined). +// The reason is the recursion function printLinkedList. +//////////////////////////////////////////////////////////////////////////////////////////////// + +////////////////////////////////////////////////////////////////////////////////////////////////// +// Author: David Goz +// mail : david.goz@inaf.it +// date : 12.07.2024 +// code tested using nvhpc +// +// - Compile the code: +// $ nvc++ linked_list.cu -o linked_list +// - Run the code: +// $ ./linked_list +////////////////////////////////////////////////////////////////////////////////////////////////// + +#include <stdio.h> +#include <iostream> +#include <unistd.h> +#include <time.h> +#include <cuda.h> +#include <new> +#include <cassert> +#include <vector> + +#define N 10 // linked list size +#define BLOCKSIZE 1024 +#define CPU 0 +#define GPU 1 +#if (BLOCKSIZE > 1024) +#error BLOCKSIZE cannot be larger than 1024 +#endif + +// datatypes +struct node +{ + struct node *next; + int data; +}; + +typedef struct node node; +typedef struct node * nodeptr; + +// container where to store node pointers on the GPU +static std::vector<nodeptr> GPUptr; + +void GPU_check(const cudaError error, + const char *cudaFunc) +{ + using namespace std; + + if (error) + { + cout << "\t " << cudaFunc << " fails! ... aborting ..." 
<< endl; + exit(EXIT_FAILURE); + } + + return; +} + +double wall_time() +{ + struct timespec ts; + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); + const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9; + + return ret; +} + +nodeptr createNode(const int data) +{ + nodeptr pn{nullptr}; + pn = new struct node; + + assert(pn != nullptr); + + pn->data = data; + pn->next = nullptr; + + return pn; +} + +nodeptr createLinkedList() +{ + nodeptr root{nullptr}; + + for (int size=N ; size>0 ; size--) + { + nodeptr pn = createNode(size); + pn->next = root; + root = pn; + } + + assert(root != nullptr); + + return root; +} + +void freeCpuLinkedList(nodeptr cpu_node) +{ + // free CPU linked list + if (cpu_node) + { + freeCpuLinkedList(cpu_node->next); + + delete cpu_node; + cpu_node = nullptr; + } + + return; +} + +void freeGpuLinkedList() +{ + // free GPU linked list + for (auto &el : GPUptr) + { + cudaFree(el); + el = nullptr; + } + + return; +} + +nodeptr copyNode(const nodeptr pn) +{ + assert(pn != nullptr); + + nodeptr gpu_node{nullptr}; + GPU_check(cudaMalloc((void **)&gpu_node, sizeof(*gpu_node)), "cudaMalloc"); + GPU_check(cudaMemcpy(gpu_node, pn, sizeof(*pn), cudaMemcpyHostToDevice), "cudaMemcpy"); + + GPUptr.push_back(gpu_node); + + return gpu_node; +} + +nodeptr copyList(const nodeptr pn) +{ + if (!pn) + return nullptr; + + node buffer; + buffer.next = copyList(pn->next); + buffer.data = pn->data; + + return copyNode(&buffer); +} + +// the function works on both CPU and GPU +__device__ __host__ void printLinkedList(const nodeptr root, + const int device) +{ + const char string[2][16] = {"\n\t CPU ---> ", "\n\t GPU ---> "}; + + if (root) + { + printf("%s", string[device]); + printf("Node->data %d", root->data); + printLinkedList(root->next, device); + } + else + printf("\n"); + + return; +} + +__global__ void GpuPrintLinkedList(const nodeptr root) +{ + const unsigned int ID = threadIdx.x + (blockIdx.x * blockDim.x); + + // master thread prints the linked list + if (!ID) + printLinkedList(root, GPU); + + return; +} + +int main() +{ + using namespace std; + + // create linked list on the CPU + nodeptr head = createLinkedList(); + + // print linked list on the CPU + printLinkedList(head, CPU); + + // copy linked list onto the GPU + nodeptr gpu_head = copyList(head); + + // print linked list on the GPU + GpuPrintLinkedList<<< 1, BLOCKSIZE >>>(gpu_head); + + // GPU syncronization + cudaDeviceSynchronize(); + + // free CPU memory + freeCpuLinkedList(head); + + // free GPU memory + freeGpuLinkedList(); + + return EXIT_SUCCESS; +} diff --git a/cuda-omp/cuda/6/classwork.cu b/cuda-omp/cuda/6/classwork.cu new file mode 100644 index 0000000000000000000000000000000000000000..d2559ffc50b4c6d225fa9e3c41f4ad9e6e007aa7 --- /dev/null +++ b/cuda-omp/cuda/6/classwork.cu @@ -0,0 +1,273 @@ +//////////////////////////////////////////////////////////////////////////////////////////////// +// - You are given 1024 x 1024 integer matrix M; +// - Each row is assigned to a thread-block; +// - Each thread is assigned a matrix element M[i][j]; +// - It changes M[i][j] to M[i][j] + M[i][j+1] (where possible); +// - Exploit shared-memory. 
+//////////////////////////////////////////////////////////////////////////////////////////////// + +////////////////////////////////////////////////////////////////////////////////////////////////// +// Author: David Goz +// mail : david.goz@inaf.it +// date : 12.07.2024 +// code tested using nvhpc +// +// - Compile the code: +// $ nvc++ classwork.cu -o classwork +// - Run the code: +// $ ./classwork.cu +////////////////////////////////////////////////////////////////////////////////////////////////// + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <time.h> +#include <math.h> +#include <assert.h> +#include <cuda.h> + +#define N 512 +#define SIZE (N * N) // matrix size +typedef int64_t MyData; // do not change +#define BLOCKSIZE N // number of threads per block + +// sanity check +#if BLOCKSIZE > 1024 +#error BLOCKSIZE cannot be larger than 1024 +#endif + +#if BLOCKSIZE != N +#error BLOCKSIZE must be equal to N +#endif + +#define LOOP 100 + +double wall_time() +{ + struct timespec ts; + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); + const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9; + + return ret; +} + +__global__ void gpu_copy(const MyData *const __restrict__ A, + MyData *const __restrict__ B) +{ + const size_t globalID = (threadIdx.x + (blockIdx.x * blockDim.x)); + + if (globalID >= SIZE) + return; + + B[globalID] = A[globalID]; + + return; +} + +__global__ void gpu_matrix( MyData *const __restrict__ matrix, + const MyData *const __restrict__ buffer) +{ + // global thread ID + const size_t globalID = (threadIdx.x + (blockIdx.x * blockDim.x)); + + if (globalID >= SIZE) + return; + + // matrix row + const size_t row = (globalID / N); + //matrix column + const size_t col = (globalID % N); + + MyData myValue = buffer[(row * N) + col]; + + if (col < (N - 1)) + myValue += buffer[(row * N) + col + 1]; + + matrix[globalID] = myValue; + + return; +} + +__global__ void gpu_matrix_shared(MyData *const __restrict__ matrix) +{ + // thread-id within the thread-block + const size_t localID = threadIdx.x; + // global-thread-id + const size_t globalID = (threadIdx.x + (blockIdx.x * blockDim.x)); + + if (globalID >= SIZE) + return; + + // shared memory buffer + __shared__ MyData buffer[BLOCKSIZE + 1]; + // shared memory initialization + for (size_t i=localID ; i<(BLOCKSIZE + 1) ; i+=blockDim.x) + buffer[i] = 0; + + const size_t row = (globalID / N); + const size_t col = (globalID % N); + const size_t index = ((row * N) + col); + + // load data into shared memory + buffer[localID] = matrix[index]; + + // block-thread synchronization + __syncthreads(); + + // perform the calculation + const MyData value = (buffer[localID] + buffer[localID + 1]); + + // store data + matrix[index] = value; + + return; +} + +void GPU_matrix(MyData *const __restrict__ matrix, + MyData *const __restrict__ buffer, + const dim3 nblocks, + const dim3 block) +{ + // gpu_copy: gpu_A ---> gpu_buffer + gpu_copy<<< nblocks, block >>>(matrix, buffer); + + // perform the actual calculation M[i][j] += M[i][j+1] + gpu_matrix<<< nblocks, block >>>(matrix, buffer); + + // device synchronization + cudaDeviceSynchronize(); + + return; +} + +void GPU_matrix_shared(MyData *const __restrict__ matrix, + const dim3 nblocks, + const dim3 block) +{ + gpu_matrix_shared<<< nblocks, block >>>(matrix); + + // device synchronization + cudaDeviceSynchronize(); + + return; +} + +void CPU_matrix(MyData *const matrix) +{ + for (size_t i=0 ; i<N ; i++) + for (size_t j=0 ; j<N-1 ; j++) + matrix[(i * N) + j] += 
matrix[(i * N) + j + 1];
+
+  return;
+}
+
+void check(const MyData *const __restrict__ cpu_matrix,
+           const MyData *const __restrict__ gpu_matrix,
+           const char   *const __restrict__ msg)
+{
+  int flag = 0;
+  for (size_t i=0 ; i<SIZE ; i++)
+    flag = ((cpu_matrix[i] != gpu_matrix[i]) ? 1 : flag);
+
+  if (!flag)
+    printf("\n\t Result OK %s", msg);
+  else
+    printf("\n\t Result wrong %s", msg);
+
+  return;
+}
+
+int main()
+{
+  double time;
+  // host reference matrix A
+  MyData *cpu_A = (MyData *)malloc(SIZE * sizeof(MyData));
+  assert(cpu_A != NULL);
+  for (size_t i=0 ; i<SIZE ; i++)
+    cpu_A[i] = (lrand48() % SIZE);
+
+  // device reference matrix A
+  MyData *gpu_A = NULL;
+  cudaMalloc((void **)&gpu_A, (SIZE * sizeof(MyData)));
+  assert(gpu_A != NULL);
+  cudaMemcpy(gpu_A, cpu_A, (SIZE * sizeof(MyData)), cudaMemcpyHostToDevice);
+
+  ////////////////////////// CPU MATRIX //////////////////////////////////////////
+  // allocate a second matrix in which to perform the actual computation
+  MyData *cpu_matrix = (MyData *)malloc(SIZE * sizeof(MyData));
+  assert(cpu_matrix != NULL);
+
+  // perform the actual computation
+  time = 0.0;
+  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
+    {
+      memcpy(cpu_matrix, cpu_A, (SIZE * sizeof(*cpu_matrix)));
+
+      const double start = wall_time();
+      CPU_matrix(cpu_matrix);
+      time += (wall_time() - start);
+    }
+  printf("\n\t CPU time %lg [s]\n", (time / LOOP));
+  ////////////////////////////////////////////////////////////////////////////////
+
+  ///////////////////////////// GPU MATRIX ///////////////////////////////////////
+  // allocate a second matrix in which to perform the actual computation
+  MyData *gpu_matrix = NULL;
+  cudaMalloc((void **)&gpu_matrix, (SIZE * sizeof(MyData)));
+
+  // allocate a buffer matrix used to copy the matrix before the computation
+  // in order to avoid thread race conditions
+  MyData *gpu_buffer = NULL;
+  cudaMalloc((void **)&gpu_buffer, (SIZE * sizeof(MyData)));
+
+  const dim3 block   = {BLOCKSIZE, 1, 1};
+  const dim3 nblocks = {((SIZE + BLOCKSIZE - 1) / BLOCKSIZE), 1, 1};
+
+  time = 0.0;
+  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
+    {
+      gpu_copy<<< nblocks, block >>>(gpu_A, gpu_matrix);
+      cudaDeviceSynchronize();
+
+      double start = wall_time();
+      GPU_matrix(gpu_matrix, gpu_buffer, nblocks, block);
+      time += (wall_time() - start);
+    }
+
+  // copy gpu_matrix into cpu_A
+  cudaMemcpy(cpu_A, gpu_matrix, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);
+
+  check(cpu_matrix, cpu_A, "\t global memory implementation --->");
+  printf("\n\t GPU time %lg [s]\n", (time / LOOP));
+  //////////////////////////////////////////////////////////////////////////////////////////
+
+  ////////////////////////// GPU MATRIX shared memory //////////////////////////////////////
+  // gpu_buffer is no longer required
+  cudaFree(gpu_buffer);
+
+  time = 0.0;
+  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
+    {
+      // restore data into gpu_matrix
+      gpu_copy<<< nblocks, block >>>(gpu_A, gpu_matrix);
+      cudaDeviceSynchronize();
+
+      double start = wall_time();
+      GPU_matrix_shared(gpu_matrix, nblocks, block);
+      time += (wall_time() - start);
+    }
+
+  // copy gpu_matrix into cpu_A
+  cudaMemcpy(cpu_A, gpu_matrix, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);
+  check(cpu_matrix, cpu_A, "\t shared memory implementation --->");
+  printf("\n\t GPU time %lg [s]\n", (time / LOOP));
+  //////////////////////////////////////////////////////////////////////////////////////////
+
+  // free CPU-GPU memory
+  free(cpu_matrix);
+  free(cpu_A);
+  cudaFree(gpu_A);
+  
cudaFree(gpu_matrix);
+
+  return EXIT_SUCCESS;
+}
diff --git a/cuda-omp/cuda/miscellaneous/coalescing.cu b/cuda-omp/cuda/miscellaneous/coalescing.cu
new file mode 100644
index 0000000000000000000000000000000000000000..94b2e9e1b7f1e3b679ae103cd26e7cf30de1ff10
--- /dev/null
+++ b/cuda-omp/cuda/miscellaneous/coalescing.cu
@@ -0,0 +1,148 @@
+//////////////////////////////////////////////////////////////////////////////////////////////////
+// Memory coalescing is a technique which allows optimal usage of the global memory bandwidth.
+// That is, when parallel threads running the same instruction access consecutive locations
+// in the global memory, the most favorable access pattern is achieved.
+//
+// Assignment : write a kernel to vary the degree of coalescing from 1 to 32 based on an input
+//              argument
+//
+// Hint: the Degree of Coalescing (DoC) is 33 minus the number of memory transactions
+//       required for a warp to execute an instruction.
+//       Example:
+//       - buffer[threadIdx.x] has DoC of 32;
+//       - buffer[rand()] has DoC of 1 in the worst case.
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 12.07.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//   $ nvc++ coalescing.cu -o coalescing
+// - Run the code:
+//   $ ./coalescing
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <stdlib.h>
+#include <cuda.h>
+#include <stdio.h>
+
+typedef double MyData;
+#define N     1024
+#define WARP  32
+#define BLOCK 1024
+
+__global__ void GPUDoC(const MyData *const __restrict__ source,
+                             MyData *const __restrict__ sink,
+                       const size_t DoC)
+{
+  const size_t myID = (threadIdx.x + (blockIdx.x * blockDim.x));
+
+  if (myID >= WARP)
+    return;
+
+  // load data using a given DoC
+  const size_t stride = ((WARP + 1) - DoC);
+
+  // stride 1  (DoC=32) means 32 threads accessing one block of 32 contiguous words, so one memory transaction;
+  // stride 2  (DoC=31) means 32 threads accessing 32 words spread across a length of 64, so 2 memory transactions;
+  // stride 3  (DoC=30) means 32 threads accessing 32 words spread across a length of 96, so 3 memory transactions;
+  // stride 4  (DoC=29) means 32 threads accessing 32 words spread across a length of 128, so 4 memory transactions;
+  // ...
+  // stride 32 (DoC=1)  means 32 threads accessing 32 words spread across a length of 1024, so 32 memory transactions.
+
+  sink[myID] = source[myID * stride];
+
+  return;
+}
+
+void CPU_GPU_memory_copy(const cudaError error)
+{
+  if (error)
+    {
+      printf("\n\t cudaMemcpy fails! ... aborting ...");
+      exit(EXIT_FAILURE);
+    }
+
+  return;
+}
+
+void GPU_alloc_check(const cudaError error)
+{
+  if (error)
+    {
+      printf("\n\t cudaMalloc fails! ... aborting ... 
\n"); + exit(EXIT_FAILURE); + } + + return; +} + +void GPU_DoC(const MyData *const points, + const size_t DoC) +{ + //****************** GPU memory allocation ***************************************************// + MyData *source=NULL, *sink=NULL; + GPU_alloc_check(cudaMalloc((void **)&source, ((N + WARP) * sizeof(MyData)))); + sink = source + N; + //********************************************************************************************// + + //***************** Host to GPU memory copy **************************************************// + CPU_GPU_memory_copy(cudaMemcpy(source, points, (N * sizeof(MyData)), + cudaMemcpyHostToDevice)); + //********************************************************************************************// + + //***************** Kernel lunch *************************************************************// + // kernel is run using WARP threads + const size_t nblocks = ((WARP + BLOCK - 1) / BLOCK); + + printf("\n\t DoC: %ld", DoC); + printf("\n\t GPU_DoC<<< %ld, %ld >>>(source, sink, %ld)\n\n", nblocks, BLOCK, DoC); + + GPUDoC<<< nblocks, BLOCK >>>(source, sink, DoC); + + // GPU synchronization + cudaDeviceSynchronize(); + + // Free GPU memory + cudaFree(source); + + //********************************************************************************************// + + return; +} + +int main(int argv, char *argc[]) +{ + size_t DoC = 0; + + if (argv < 2) + { + printf("\n\t Usage: ./coalescing <degree of coalescing>\n\n"); + exit(EXIT_FAILURE); + } + else + { + DoC = atoi(argc[1]); + + if ((DoC < 1) || (DoC > 32)) + { + printf("\n\t 1 <= <degree of coalescing> <= 32 ... aborting ... \n\n"); + exit(EXIT_FAILURE); + } + } + + srand48(time(NULL)); + MyData *points = (MyData *)malloc(N * sizeof(*points)); + for (size_t i=0 ; i<N ; i++) + points[i] = drand48(); + + // lunch kernel + GPU_DoC(points, DoC); + + free(points); + + return EXIT_SUCCESS; +} diff --git a/cuda-omp/omp/1/classwork_1.c b/cuda-omp/omp/1/classwork_1.c index 6ff9cdffa4c67879983bdccb494e89f6e57ca086..f9ece471b3a05ce901ecfae7bf029e5dbe21fdc5 100644 --- a/cuda-omp/omp/1/classwork_1.c +++ b/cuda-omp/omp/1/classwork_1.c @@ -2,7 +2,7 @@ // // OpenMP GPU Offload is available only on systems with NVIDIA GPUs with compute capability '>= cc70' // -// Assigment : write a OMP-GPU code corresponding to the +// Assigment : write an OMP-GPU code corresponding to the // following sequential C code // // #include <stdio.h> @@ -20,9 +20,10 @@ // Author: David Goz // mail : david.goz@inaf.it // date : 06.07.2024 +// code tested using nvhpc // // - Compile the code to run on : -// $ nvcc classwork_1.c -o classwork_1_omp +// $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork_1.c -o classwork_1_omp // - Run the code: // $ ./classwork_1_omp // - Check the result: @@ -36,27 +37,73 @@ #define N 100 #define NThreads 1024 +#define NDEBUG + void GPUkernelSerial(const int size) { #pragma omp target { +#if !defined(NDEBUG) + if (!omp_is_initial_device()) printf("\n\t GPU is executing GPUkernelSerial\n" ); else printf("\n\t CPU is executing GPUkernelSerial\n" ); + +#endif /* NDEBUG */ + + const int whoAmI = omp_get_thread_num(); + + for (int i=0 ; i<size ; i++) + printf("Hello from OMP-GPU thread: %d - result %d\n", whoAmI, (i * i)); + } /* omp target region - implicit barrier */ + + return; +} + +void GPUkernelParallel(const int size) +{ +#pragma omp target + { +#if !defined(NDEBUG) + if (!omp_is_initial_device()) + printf("\n\t GPU is executing GPUkernelSerial\n" ); + else + printf("\n\t CPU is 
executing GPUkernelSerial\n" ); + +#endif /* NDEBUG */ + + #pragma omp teams distribute parallel for for (int i=0 ; i<size ; i++) - printf("Hello from OMP-GPU thread: %d - result %d\n", i, (i * i)); - } + { + /* get CUDA blockIdx.x */ + const int team = omp_get_team_num(); + + /* get CUDA threadIdx.x */ + const int tid = omp_get_thread_num(); + + /* get CUDA blockDim.x */ + const int nthr = omp_get_num_threads(); + + const int whoAmI = tid + (team * nthr); + + printf("Hello from OMP-GPU thread: %d - result %d\n", whoAmI, (i * i)); + } + } /* omp target region - implicit barrier */ + return; } int main() { - printf("\n\t The host issues the kernel on the GPU \n"); - + printf("\n\t The host issues the kernel on the GPU in serial\n"); /* kernel lunch using one GPU thread */ GPUkernelSerial(N); + printf("\n\t The host issues the kernel on the GPU in parallel\n"); + /* kernel lunch using N GPU threads */ + GPUkernelParallel(N); + return 0; } diff --git a/cuda-omp/omp/1/classwork_2.c b/cuda-omp/omp/1/classwork_2.c new file mode 100644 index 0000000000000000000000000000000000000000..9842ebfe062b9fa10e622606a02fdcdd62fa2b86 --- /dev/null +++ b/cuda-omp/omp/1/classwork_2.c @@ -0,0 +1,112 @@ +////////////////////////////////////////////////////////////////////////////////////////////////// +// +// OpenMP GPU Offload is available only on systems with NVIDIA GPUs with compute capability '>= cc70' +// +// Assigment : write an OMP-GPU code corresponding to the +// following sequential C code +// +// #include <stdio.h> +// #define N 100 +// int main() +// { +// int A[N]; +// +// for (int i=0 ; i<N ; i++) +// A[i] = (i * i); +// +// return 0; +// } +////////////////////////////////////////////////////////////////////////////////////////////////// + +////////////////////////////////////////////////////////////////////////////////////////////////// +// Author: David Goz +// mail : david.goz@inaf.it +// date : 06.07.2024 +// code tested using nvhpc +// +// - Compile the code: +// $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork_2.c -o classwork_2_omp +// - Run the code: +// $ ./classwork_2_omp + +////////////////////////////////////////////////////////////////////////////////////////////////// + +#include <stdlib.h> +#include <stdio.h> +#include <assert.h> +#include <omp.h> + +#define N 100 +#define NDEBUG + +void GPUkernel( int *A, + const int size) +{ + /* map A to the address space of the accelerator */ +#pragma omp target data map(from: A[0:size]) + { + /* kernel to be executed on the accelerator */ + #pragma omp target + { + /* create a bunch of teams (CUDA blocks) */ + #pragma omp teams + { + /* distribute the teams over index iterations and */ + /* spawn a bunch of threads within each team */ + #pragma omp distribute parallel for + for (int i=0 ; i<N ; i++) + { +#if !defined(NDEBUG) + + const int team = omp_get_team_num(); + const int nteam = omp_get_num_teams(); + const int tid = omp_get_thread_num(); + const int nthr = omp_get_num_threads(); + + const int whoAmI = tid + (team * nthr); + + if (!omp_is_initial_device()) + { + if (whoAmI == 0) + { + printf("\n\t GPU is executing GPUkernel\n" ); + printf("\n\t team : %d", team); + printf("\n\t nteam: %d", nteam); + printf("\n\t tid : %d", tid); + printf("\n\t nthr : %d\n", nthr); + } + } + else + printf("\n\t CPU is executing GPUkernel\n" ); + +#endif /* NDEBUG */ + + A[i] = (i * i); + } /* omp parallel for */ + } /* omp teams */ + } /* omp target */ + } /* omp target data */ + + return; +} + +int main() +{ + /* host array 
*/ + int *A = (int *)malloc(N * sizeof(*A)); + assert(A != NULL); + + // kernel lunch + GPUkernel(A, N); + + // check the result + printf("\n"); + for (size_t i=0 ; i<N ; i++) + printf("\t A[%d] = %d", i, A[i]); + printf("\n\n"); + + // free host memory + free(A); + + return 0; +} diff --git a/cuda-omp/omp/2/classwork.c b/cuda-omp/omp/2/classwork.c new file mode 100644 index 0000000000000000000000000000000000000000..e568fee02df39a6e4c9938f3e419c10421381397 --- /dev/null +++ b/cuda-omp/omp/2/classwork.c @@ -0,0 +1,114 @@ +////////////////////////////////////////////////////////////////////////////////////////////////// +// +// OpenMP GPU Offload is available only on systems with NVIDIA GPUs with compute capability '>= cc70' +// +// Assigment : write a OMP-GPU code corresponding to the +// following sequential C code +// +// #include <stdio.h> +// #define ROW 4 +// #define COL 8 +// int main() +// { +// int Matrix[ROW * COL]; +// +// for (int row=0 ; row<ROW ; row++) +// for (int col=0 ; col<COL ; col++) +// Matrix[(row * COL) + col] = ((row * COL) + col); +// +// return 0; +// } +////////////////////////////////////////////////////////////////////////////////////////////////// + +////////////////////////////////////////////////////////////////////////////////////////////////// +// Author: David Goz +// mail : david.goz@inaf.it +// date : 08.07.2024 +// code tested using nvhpc +// +// - Compile the code: +// $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork.c -o classwork_omp +// - Run the code: +// $ ./classwork_omp +////////////////////////////////////////////////////////////////////////////////////////////////// + +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> +#include <omp.h> + +#define BLOCKSIZE 32 +#define ROW 1089 +#define COL 1111 + +#define NDEBUG + +typedef int MyData; + +void GPUMatrix(MyData *Matrix, + const int dim) +{ + /* exploit synchronously the GPU */ + /* allocate a buffer of size 'dim' and copy it to host memory at the end of kernel execution */ + /* create a given number of teams and distribute them across iterations */ + /* create a given number of threads per each team */ + +#pragma omp target \ + teams distribute num_teams(((ROW * COL) + BLOCKSIZE - 1) / BLOCKSIZE) \ + parallel for num_threads(BLOCKSIZE) \ + map(from: Matrix[0:dim]) + for (size_t index=0 ; index<dim ; index++) + { +#if !defined(NDEBUG) + + const int nteam = omp_get_num_teams(); + const int team = omp_get_team_num(); + const int tid = omp_get_thread_num(); + const int nthr = omp_get_num_threads(); + const int whoAmI = tid + (team * nthr); + + if (!omp_is_initial_device() && !whoAmI) + { + printf("\n\t GPU is running:"); + printf("\n\t\t Number of teams: %d - Number of threads per team: %d\n\n", nteam, nthr); + } + +#endif /* NDEBUG */ + + Matrix[index] = index; + } /* end-target region */ + + return; +} + +int main() +{ + /* host matrix . 1D mapping matrix[i][j] ---> matrix[(i * COL) + j] */ + MyData *matrix = (MyData *)malloc(ROW * COL * sizeof(*matrix)); + assert(matrix != NULL); + + // kernel + GPUMatrix(matrix, (ROW * COL)); + + // check the result + int flag = 0; + for (size_t row=0 ; row<ROW ; row++) + { + const size_t i = (row * COL); + + for (size_t col=0 ; col<COL ; col++) + { + flag = ((matrix[i + col] != (i + col)) ? -1 : flag); + } /* col loop */ + } /* row loop */ + + // free host memory + free(matrix); + + if (flag) + printf("\n\t Result wrong! \n\n"); + else + printf("\n\t Result OK! 
\n\n");
+
+  return 0;
+}
diff --git a/cuda-omp/omp/3/classwork.c b/cuda-omp/omp/3/classwork.c
new file mode 100644
index 0000000000000000000000000000000000000000..6f069f560cde7688f54655edccae6a3da81ddf25
--- /dev/null
+++ b/cuda-omp/omp/3/classwork.c
@@ -0,0 +1,131 @@
+//////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// OpenMP GPU Offload is available only on systems with NVIDIA GPUs with compute capability '>= cc70'
+//
+// Assignment : write an OMP-GPU code that does:
+//              - each thread generates a pair of points as (x, y) coordinates randomly distributed;
+//              - each thread computes the Euclidean distance d = sqrt((x1 - x2)^2 + (y1 - y2)^2);
+//              - the kernel prints the maximum distance.
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 06.07.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork.c -o classwork_omp -lm
+// - Run the code:
+//   $ ./classwork_omp
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <omp.h>
+#include <time.h>
+#include <assert.h>
+
+#define NUMPOINTS 1024
+#define SQUARE(A) ((A) * (A))
+
+#define NDEBUG
+
+typedef double MyData;
+
+typedef struct PairPoints
+{
+  MyData x[2];
+  MyData y[2];
+} PairPoints;
+
+void GPUMaxDistance(const PairPoints *const points,
+                    const int size)
+{
+  /* use the GPU synchronously */
+  /* - 'distance' must be mapped tofrom on the GPU (i.e. allocated in the GPU global  */
+  /*   memory), otherwise it is firstprivate by default (i.e. private to each thread) */
+  /*   and the host cannot access a thread's private memory.                          */
+
+  MyData distance = -1.0; /* the original value takes part in the max reduction */
+
+#pragma omp target                                \
+        teams distribute                          \
+        parallel for reduction(max: distance)     \
+        map(to: points[0:size]) map(tofrom: distance)
+  for (size_t i=0 ; i<size ; i++)
+    {
+#if !defined(NDEBUG)
+
+      const int nteam  = omp_get_num_teams();
+      const int team   = omp_get_team_num();
+      const int tid    = omp_get_thread_num();
+      const int nthr   = omp_get_num_threads();
+      const int whoAmI = tid + (team * nthr);
+
+      if (!omp_is_initial_device() && !whoAmI)
+        {
+          printf("\n\t GPU is running:");
+          printf("\n\t\t Number of teams: %d - Number of threads per team: %d\n\n", nteam, nthr);
+        }
+
+#endif /* NDEBUG */
+
+      const PairPoints myPoint = points[i];
+      const MyData pair_distance_X2 = SQUARE(myPoint.x[0] - myPoint.x[1]);
+      const MyData pair_distance_Y2 = SQUARE(myPoint.y[0] - myPoint.y[1]);
+      const MyData pair_distance    = sqrt(pair_distance_X2 + pair_distance_Y2);
+
+      distance = ((distance < pair_distance) ? pair_distance : distance);
+    }
+
+  printf("\n\t GPU maximum distance: %lg\n", distance);
+
+  return;
+}
+
+void CPUMaxDistance(const PairPoints *const points,
+                    const int size)
+{
+  MyData distance = -1.0;
+  for (size_t i=0 ; i<size ; i++)
+    {
+      const MyData pair_distance_X2 = SQUARE(points[i].x[0] - points[i].x[1]);
+      const MyData pair_distance_Y2 = SQUARE(points[i].y[0] - points[i].y[1]);
+      const MyData pair_distance    = sqrt(pair_distance_X2 + pair_distance_Y2);
+
+      distance = ((distance < pair_distance) ? 
pair_distance : distance); + } + + printf("\n\t CPU maximum distance: %lg\n", distance); + + return; +} + +int main() +{ + /* host allocation */ + PairPoints *points = (PairPoints *)malloc(NUMPOINTS * sizeof(*points)); + assert(points != NULL); + + /* initialization */ + srand48(time(NULL)); + for(size_t i=0 ; i<NUMPOINTS ; i++) + { + points[i].x[0] = drand48(); + points[i].x[1] = drand48(); + points[i].y[0] = drand48(); + points[i].y[1] = drand48(); + } + + // lunch the kernel synchrounusly + GPUMaxDistance(points, NUMPOINTS); + + // check the result on the host + CPUMaxDistance(points, NUMPOINTS); + + free(points); + + return 0; +} diff --git a/cuda-omp/omp/3/classwork_async.c b/cuda-omp/omp/3/classwork_async.c new file mode 100644 index 0000000000000000000000000000000000000000..61c5f4e17f1a439687f5c7ef9d4d08c92bad2ca2 --- /dev/null +++ b/cuda-omp/omp/3/classwork_async.c @@ -0,0 +1,145 @@ +////////////////////////////////////////////////////////////////////////////////////////////////// +// +// OpenMP GPU Offload is available only on systems with NVIDIA GPUs with compute capability '>= cc70' +// +// Assigment : write a OMP-GPU code that does: +// - each thread generates a pair of points as (x, y) coordinates randomly distributed; +// - each thread computes the euclidean distance d = sqrt((x1 - x2)^2 + (y1 - y2)^2); +// - the kernel prints the maximum distance; +// - host manages asynchronously the GPU. +////////////////////////////////////////////////////////////////////////////////////////////////// + +////////////////////////////////////////////////////////////////////////////////////////////////// +// Author: David Goz +// mail : david.goz@inaf.it +// date : 06.07.2024 +// code tested using nvhpc +// +// - Compile the code: +// $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork_async.c -o classwork_async_omp -lm +// - Run the code: +// $ ./classwork_async_omp +////////////////////////////////////////////////////////////////////////////////////////////////// + +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <omp.h> +#include <time.h> +#include <assert.h> + +#define NUMPOINTS 65536 +#define SQUARE(A) ((A) * (A)) + +#define NDEBUG + +typedef double MyData; + +typedef struct PairPoints +{ + MyData x[2]; + MyData y[2]; +} PairPoints; + +/* global host-device variables (i.e. on both global memories) */ +/* a copy of 'GPUMaxDistance' is created in the global memory */ +/* of the GPU from the beginning of the program till its end. 
*/ +#pragma omp declare target +MyData DeviceMaxDistance; +#pragma omp end declare target + +void GPUMaxDistance(const PairPoints *const points, + const int size) +{ +#pragma omp target nowait \ + teams distribute \ + parallel for reduction(max: DeviceMaxDistance) \ + map(to: points[0:size]) + for (size_t i=0 ; i<size ; i++) + { +#if !defined(NDEBUG) + + const int nteam = omp_get_num_teams(); + const int team = omp_get_team_num(); + const int tid = omp_get_thread_num(); + const int nthr = omp_get_num_threads(); + const int whoAmI = tid + (team * nthr); + + if (!omp_is_initial_device() && !whoAmI) + { + printf("\n\t GPU is running:"); + printf("\n\t\t Number of teams: %d - Number of threads per team: %d\n\n", nteam, nthr); + } + +#endif /* NDEBUG */ + + const PairPoints myPoint = points[i]; + const MyData pair_distance_X2 = SQUARE(myPoint.x[0] - myPoint.x[1]); + const MyData pair_distance_Y2 = SQUARE(myPoint.y[0] - myPoint.y[1]); + const MyData pair_distance = sqrt(pair_distance_X2 + pair_distance_Y2); + + DeviceMaxDistance = ((DeviceMaxDistance < pair_distance) ? pair_distance : DeviceMaxDistance); + } /* end-target region */ + + /* data synchronization host-device, the host fetches data from the GPU */ + /* By default, the host must wait for the completion of the fetching, */ + /* however the 'nowait' clause enables the target update construct to */ + /* execute asynchronously with respect to the encountering host thread. */ + #pragma omp target update from(DeviceMaxDistance) nowait + + return; +} + +void CPUMaxDistance(const PairPoints *const points, + const int size) +{ + MyData distance = -1.0; + for (size_t i=0 ; i<size ; i++) + { + const MyData pair_distance_X2 = SQUARE(points[i].x[0] - points[i].x[1]); + const MyData pair_distance_Y2 = SQUARE(points[i].y[0] - points[i].y[1]); + const MyData pair_distance = sqrt(pair_distance_X2 + pair_distance_Y2); + + distance = ((distance < pair_distance) ? pair_distance : distance); + } + + printf("\n\t CPU maximum distance: %lg\n", distance); + + return; +} + +int main() +{ + /* host allocation */ + PairPoints *points = (PairPoints *)malloc(NUMPOINTS * sizeof(*points)); + assert(points != NULL); + + /* initialization */ + srand48(time(NULL)); + for(size_t i=0 ; i<NUMPOINTS ; i++) + { + points[i].x[0] = drand48(); + points[i].x[1] = drand48(); + points[i].y[0] = drand48(); + points[i].y[1] = drand48(); + } + + /* lunch the kernel asynchrounusly (i.e. the host */ + /* does not wait the completion of GPU execution) */ + GPUMaxDistance(points, NUMPOINTS); + + // check the result on the host + CPUMaxDistance(points, NUMPOINTS); + + /* host-device synchronization in order to ensure that the */ + /* kernel has been executed on the GPU and the global */ + /* variable 'DeviceMaxDistance' is synchronized between */ + /* host and device data environment. 
+  #pragma omp taskwait
+
+  printf("\n\t GPU maximum distance: %lg\n", DeviceMaxDistance);
+
+  free(points);
+
+  return 0;
+}
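For contrast with the asynchronous version above, a minimal synchronous sketch (added, not part of
the patch) that reduces into a mapped local scalar instead of the 'declare target' global; it reuses
the PairPoints/MyData/SQUARE definitions of classwork_async.c and the function name is illustrative:

  #include <math.h>

  MyData MaxDistanceSync(const PairPoints *const points, const int size)
  {
    MyData dmax = -1.0;

    /* synchronous combined construct: the reduction scalar is mapped tofrom */
    #pragma omp target teams distribute parallel for            \
            reduction(max: dmax) map(to: points[0:size]) map(tofrom: dmax)
    for (int i=0 ; i<size ; i++)
      {
        const MyData dx = points[i].x[0] - points[i].x[1];
        const MyData dy = points[i].y[0] - points[i].y[1];
        const MyData d  = sqrt(SQUARE(dx) + SQUARE(dy));

        dmax = ((dmax < d) ? d : dmax);
      }

    /* the construct is synchronous, so dmax already holds the result here */
    return dmax;
  }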
diff --git a/cuda-omp/omp/4/classwork.c b/cuda-omp/omp/4/classwork.c
new file mode 100644
index 0000000000000000000000000000000000000000..6fc63e79f3e28a0d6c72bbe492ea69b35cbcde7a
--- /dev/null
+++ b/cuda-omp/omp/4/classwork.c
@@ -0,0 +1,183 @@
+//////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// OpenMP GPU Offload is available only on systems with NVIDIA GPUs with compute capability '>= cc70'
+//
+// Assignment : write an OMP-GPU code corresponding to the
+//              following sequential C code
+//
+// #include <stdio.h>
+// int main()
+// {
+//   /* Matrix multiplication: C = A X B */
+//
+//   int A[N][N], B[N][N], C[N][N];
+//
+//   for (int ii=0 ; ii<N ; ii++)
+//     for (int jj=0 ; jj<N ; jj++)
+//       for (int kk=0 ; kk<N ; kk++)
+//         C[ii][jj] += A[ii][kk] * B[kk][jj];
+//
+//   return 0;
+// }
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 08.07.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork.c -o classwork_omp
+// - Run the code:
+//   $ ./classwork_omp <N>
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+#include <assert.h>
+//#define NDEBUG
+
+#define BLOCKSIZE 32
+#define SQUARE(SIZE) ((SIZE) * (SIZE))
+
+typedef long int MyData;
+
+void matrix_init(      MyData *const AB,
+                 const MyData N)
+{
+  MyData *const restrict A = AB;
+  MyData *const restrict B = AB + SQUARE(N);
+
+  for (MyData ii=0 ; ii<N ; ii++)
+    for (MyData jj=0 ; jj<N ; jj++)
+      {
+        A[(ii * N) + jj] = (ii - jj);
+        B[(ii * N) + jj] = 1;
+      }
+
+  return;
+}
+
+void GPU_MM(const MyData *const restrict A,
+            const MyData *const restrict B,
+                  MyData *const restrict C,
+            const MyData N)
+{
+  /* spawning N^2 threads                                                        */
+  /* no mapping is specified here because we rely on the 'omp target enter/exit  */
+  /* data' constructs issued by the caller                                       */
+
+#pragma omp target                                                          \
+        teams distribute num_teams((SQUARE(N) + BLOCKSIZE - 1)/BLOCKSIZE)   \
+        parallel for collapse(2) num_threads(BLOCKSIZE)
+  for (MyData ii=0 ; ii<N ; ii++)
+    {
+      for (MyData jj=0 ; jj<N ; jj++)
+        {
+#if !defined(NDEBUG)
+
+          /* ID within the team (CUDA thread ID within the block) */
+          const MyData localID = omp_get_thread_num();
+          /* Team ID (CUDA block ID) */
+          const MyData team = omp_get_team_num();
+          /* Team size (CUDA block size) */
+          const MyData nthr = omp_get_num_threads();
+          /* global thread index */
+          const MyData globalID = (localID + (team * nthr));
+
+          printf("\n\t globalID: %ld - ii: %ld - jj: %ld\n", globalID, ii, jj);
+
+#endif /* NDEBUG */
+
+          MyData sum = 0;
+          for (MyData kk=0 ; kk<N ; kk++)
+            {
+              sum += (A[(ii * N) + kk] * B[(kk * N) + jj]);
+            } /* kk loop */
+          C[(ii * N) + jj] = sum;
+        } /* jj loop */
+    } /* ii loop */
+
+  return;
+}
+
+void check(const MyData *const __restrict__ C_GPU,
+           const MyData *const __restrict__ A,
+           const MyData N)
+{
+  int flag = 0;
+
+  /* B is initialized to all ones, so every element of row ii of C */
+  /* must equal the sum of the elements of row ii of A             */
+  for (MyData ii=0 ; ii<N ; ii++)
+    {
+      MyData row_a = 0;
+      for (MyData kk=0 ; kk<N ; kk++)
+        row_a += A[(ii * N) + kk];
+
+      for (MyData jj=0 ; jj<N ; jj++)
+        if (C_GPU[(ii * N) + jj] != row_a)
+          {
+            flag = 1;
+            break;
+          }
+    }
+
+  if (flag)
+    printf("\n\t Result WRONG \n\n");
+  else
+    printf("\n\t Result OK \n\n");
+
+  return;
+}
+
+int main(int argc, char *argv[])
+{
+  MyData N;
+
+  if (argc < 2)
+    {
+      printf("\n\t Usage: $ ./classwork_omp <N> \n");
+      exit(EXIT_FAILURE);
+    }
+  else
+    {
+      N = atol(argv[1]);
+      if (N <= 0)
+        {
+          printf("\n\t The matrix size N must be greater than zero \n");
+          exit(EXIT_FAILURE);
+        }
+      else
+        {
+          printf("\n\t Matrix size : %ld x %ld", N, N);
+        }
+    }
+
+  /* host memory allocation */
+  MyData *buffer = (MyData *)malloc(3 * SQUARE(N) * sizeof(MyData));
+  assert(buffer != NULL);
+  /* set-up host pointers */
+  MyData *const restrict A = buffer;
+  MyData *const restrict B = A + SQUARE(N);
+  MyData *const restrict C = B + SQUARE(N);
+
+  // matrix initialization
+  matrix_init(buffer, N);
+
+  /* device memory allocation and copy */
+#pragma omp target enter data map(to   : A[0:SQUARE(N)])  \
+                              map(to   : B[0:SQUARE(N)])  \
+                              map(alloc: C[0:SQUARE(N)])
+
+  GPU_MM(A, B, C, N);
+
+  /* host-device data synchronization */
+#pragma omp target exit data map(from: C[0:SQUARE(N)])
+
+  // check the result
+  check(C, A, N);
+
+  free(buffer);
+
+  return 0;
+}
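As a point of comparison with the 'target enter/exit data' strategy above, a minimal sketch (added,
not part of the patch) of the same offload with the mapping expressed directly on the construct; it
reuses MyData/BLOCKSIZE/SQUARE from classwork.c and the function name is illustrative:

  void GPU_MM_mapped(const MyData *const restrict A,
                     const MyData *const restrict B,
                           MyData *const restrict C,
                     const MyData N)
  {
    /* data movement is tied to this single construct:     */
    /* A and B are copied in, C is copied out on exit      */
  #pragma omp target teams distribute parallel for collapse(2)               \
          num_teams((SQUARE(N) + BLOCKSIZE - 1)/BLOCKSIZE)                   \
          num_threads(BLOCKSIZE)                                             \
          map(to: A[0:SQUARE(N)], B[0:SQUARE(N)]) map(from: C[0:SQUARE(N)])
    for (MyData ii=0 ; ii<N ; ii++)
      for (MyData jj=0 ; jj<N ; jj++)
        {
          MyData sum = 0;
          for (MyData kk=0 ; kk<N ; kk++)
            sum += (A[(ii * N) + kk] * B[(kk * N) + jj]);

          C[(ii * N) + jj] = sum;
        }

    return;
  }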
diff --git a/cuda-omp/omp/6/classwork.c b/cuda-omp/omp/6/classwork.c
new file mode 100644
index 0000000000000000000000000000000000000000..a7605b6813feb2a1e909bef627c03f58b85aeb30
--- /dev/null
+++ b/cuda-omp/omp/6/classwork.c
@@ -0,0 +1,325 @@
+////////////////////////////////////////////////////////////////////////////////////////////////
+// - You are given a 512 x 512 integer matrix M;
+// - Each row is assigned to a thread-block;
+// - Each thread is assigned a matrix element M[i][j];
+// - Each thread changes M[i][j] to M[i][j] + M[i][j+1] (where possible);
+// - Exploit shared-memory.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 14.07.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork.c -o classwork_omp
+// - Run the code:
+//   $ ./classwork_omp
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <time.h>
+#include <assert.h>
+#include <omp.h>
+#include <string.h>
+
+#define N         512
+#define SIZE      (N * N)  // matrix size
+typedef int64_t MyData;    // do not change
+#define BLOCKSIZE N        // number of threads per block
+
+// sanity check
+#if BLOCKSIZE > 1024
+#error BLOCKSIZE cannot be larger than 1024
+#endif
+
+#if BLOCKSIZE != N
+#error BLOCKSIZE must be equal to N
+#endif
+
+#define LOOP 100
+#define NDEBUG
+
+double wall_time()
+{
+  /* CLOCK_MONOTONIC measures wall-clock time, which is what we want when */
+  /* timing offloaded regions (a CPU-time clock would not be meaningful). */
+  struct timespec ts;
+  clock_gettime(CLOCK_MONOTONIC, &ts);
+  const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;
+
+  return ret;
+}
+
+void CPU_matrix(MyData *const matrix)
+{
+  for (size_t i=0 ; i<N ; i++)
+    for (size_t j=0 ; j<N-1 ; j++)
+      matrix[(i * N) + j] += matrix[(i * N) + j + 1];
+
+  return;
+}
+
+void GPU_matrix_loops(      MyData *const restrict matrix,
+                      const MyData *const restrict buffer)
+{
+#pragma omp target teams distribute num_teams(N)
+  for (size_t i=0 ; i<N ; i++)
+    {
+      #pragma omp parallel for num_threads(N)
+      for (size_t j=0 ; j<N ; j++)
+        {
+#if !defined(NDEBUG)
+
+          const int nteam  = omp_get_num_teams();
+          const int team   = omp_get_team_num();
+          const int tid    = omp_get_thread_num();
+          const int nthr   = omp_get_num_threads();
+          const int whoAmI = tid + (team * nthr);
+
+          if (!omp_is_initial_device() && !tid)
+            printf("\n\t\t Team: %d - nthr: %d - whoAmI: %d - i: %zu - j: %zu\n\n",
+                   team, nthr, whoAmI, i, j);
+
+#endif /* NDEBUG */
+
+          matrix[(i * N) + j] = ((j < N-1) ?
+                                 (buffer[(i * N) + j] + buffer[(i * N) + j + 1]) :
+                                 buffer[(i * N) + j]);
+        } /* j loop distributed across threads within a team */
+    } /* i loop distributed across teams */
+
+  return;
+}
+
+void GPU_matrix_no_loops(      MyData *const restrict matrix,
+                         const MyData *const restrict buffer)
+{
+  #pragma omp target
+  {
+    #pragma omp teams num_teams(N)
+    {
+      const size_t team = omp_get_team_num();
+
+      #pragma omp parallel firstprivate(matrix, buffer, team) num_threads(N)
+      {
+        const size_t tid      = omp_get_thread_num();
+        const size_t globalID = ((team * N) + tid);
+
+        matrix[globalID] = ((tid < (N - 1)) ?
+                            (buffer[globalID] + buffer[globalID + 1]) :
+                            buffer[globalID]);
+      } /* omp parallel */
+    } /* omp teams */
+  } /* omp target */
+
+  return;
+}
+
+void GPU_matrix_shared_loops(MyData *const restrict matrix)
+{
+  #pragma omp target teams distribute num_teams(N)
+  for (size_t i=0 ; i<N ; i++)
+    {
+      /* team-local buffer, i.e. the analogue of CUDA shared memory */
+      MyData buffer[BLOCKSIZE + 1];
+
+      #pragma omp parallel for shared(i, buffer) num_threads(N)
+      for (size_t j=0 ; j<N ; j++)
+        {
+#if !defined(NDEBUG)
+
+          const int nteam  = omp_get_num_teams();
+          const int team   = omp_get_team_num();
+          const int tid    = omp_get_thread_num();
+          const int nthr   = omp_get_num_threads();
+          const int whoAmI = tid + (team * nthr);
+
+          if (!omp_is_initial_device() && !tid)
+            printf("\n\t\t Team: %d - nthr: %d - whoAmI: %d - i: %zu - j: %zu\n\n",
+                   team, nthr, whoAmI, i, j);
+
+#endif /* NDEBUG */
+
+          /* shared memory initialization */
+          for (size_t item=j ; item<(BLOCKSIZE + 1) ; item+=BLOCKSIZE)
+            buffer[item] = (MyData)0;
+
+          /* load data into shared memory */
+          const size_t globalID = ((i * N) + j);
+
+          buffer[j] = matrix[globalID];
+        } /* implicit thread-block synchronization */
+
+      #pragma omp parallel for shared(i, buffer) num_threads(N)
+      for (size_t j=0 ; j<N ; j++)
+        {
+          const size_t globalID = ((i * N) + j);
+
+          matrix[globalID] = (buffer[j] + buffer[j + 1]);
+        } /* implicit thread-block synchronization */
+    } /* i loop distributed across teams */
+
+  return;
+}
+
+void GPU_matrix_shared_no_loops(MyData *const restrict matrix)
+{
+  #pragma omp target
+  {
+    #pragma omp teams num_teams(N)
+    {
+      /* team-local buffer, i.e. the analogue of CUDA shared memory */
+      MyData buffer[BLOCKSIZE + 1];
+
+      const size_t team_N = (N * omp_get_team_num());
+
+      #pragma omp parallel shared(buffer) firstprivate(team_N, matrix) num_threads(N)
+      {
+        const size_t tid = omp_get_thread_num();
+
+        for (size_t item=tid ; item<(BLOCKSIZE + 1) ; item+=BLOCKSIZE)
+          buffer[item] = (MyData)0;
+
+        buffer[tid] = matrix[team_N + tid];
+      }
+
+      #pragma omp parallel shared(buffer) firstprivate(team_N, matrix) num_threads(N)
+      {
+        const size_t tid = omp_get_thread_num();
+
+        matrix[team_N + tid] = (buffer[tid] + buffer[tid + 1]);
+      }
+    } /* omp teams */
+  } /* omp target */
+
+  return;
+}
+
+void check(const MyData *const __restrict__ cpu_matrix,
+           const MyData *const __restrict__ gpu_matrix,
+           const char   *const __restrict__ msg)
+{
+  int flag = 0;
+  for (size_t i=0 ; i<SIZE ; i++)
+    flag |= (cpu_matrix[i] != gpu_matrix[i]);
+
+  if (!flag)
+    printf("\n\t Result OK %s", msg);
+  else
+    printf("\n\t Result WRONG %s", msg);
+
+  return;
+}
+
+int main()
+{
+  double time;
+  // host reference matrix A
+  MyData *A = (MyData *)malloc(SIZE * sizeof(MyData));
+  assert(A != NULL);
+  for (size_t i=0 ; i<SIZE ; i++)
+    A[i] = (lrand48() % SIZE);
+
+  ////////////////////////// CPU MATRIX //////////////////////////////////////////
+  // allocate a second matrix in which to perform the actual computation
+  MyData *matrix = (MyData *)malloc(SIZE * sizeof(MyData));
+  assert(matrix != NULL);
+
+  // perform the actual computation
+  time = 0.0;
+  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
+    {
+      memcpy(matrix, A, (SIZE * sizeof(MyData)));
+
+      const double start = wall_time();
+      CPU_matrix(matrix);
+      time += (wall_time() - start);
+    }
+  printf("\n\t CPU time %lg [s]\n", (time / LOOP));
+  ////////////////////////////////////////////////////////////////////////////////
+
+  ////////////////////////////// GPU MATRIX //////////////////////////////////////
+  MyData *gpu_matrix = (MyData *)malloc(SIZE * sizeof(MyData));
+  assert(gpu_matrix != NULL);
+
+#pragma omp target data map(to: A[0:SIZE]) map(from: gpu_matrix[0:SIZE])
+  {
+    time = 0.0;
+    for (unsigned short int loop=0 ; loop<LOOP ; loop++)
+      {
+        double start = wall_time();
+        GPU_matrix_loops(gpu_matrix, A);
+        time += (wall_time() - start);
+      }
+  } /* omp target data */
+
+  check(matrix, gpu_matrix, "\t global memory implementation with loops --->");
+  printf("\n\t GPU time %lg [s]\n", (time / LOOP));
+  //////////////////////////////////////////////////////////////////////////////////////////
+
+  ////////////////////////////// GPU MATRIX //////////////////////////////////////
+
+#pragma omp target data map(to: A[0:SIZE]) map(from: gpu_matrix[0:SIZE])
+  {
+    time = 0.0;
+    for (unsigned short int loop=0 ; loop<LOOP ; loop++)
+      {
+        double start = wall_time();
+        GPU_matrix_no_loops(gpu_matrix, A);
+        time += (wall_time() - start);
+      }
+  } /* omp target data */
+
+  check(matrix, gpu_matrix, "\t global memory implementation without loops --->");
+  printf("\n\t GPU time %lg [s]\n", (time / LOOP));
+  //////////////////////////////////////////////////////////////////////////////////////////
+
+  ////////////////////////// GPU MATRIX shared memory //////////////////////////////////////
+  memcpy(gpu_matrix, A, (SIZE * sizeof(MyData)));
+  /* gpu_matrix is allocated on the GPU */
+  #pragma omp target enter data map(alloc: gpu_matrix[0:SIZE])
+
+  time = 0.0;
+  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
+    {
+      // restore data into gpu_matrix
+      #pragma omp target update to(gpu_matrix[0:SIZE])
+
+      double start = wall_time();
+      GPU_matrix_shared_loops(gpu_matrix);
+      time += (wall_time() - start);
+    }
+
+  #pragma omp target exit data map(from: gpu_matrix[0:SIZE])
+  check(matrix, gpu_matrix, "\t shared memory implementation with loops --->");
+  printf("\n\t GPU time %lg [s]\n", (time / LOOP));
+  //////////////////////////////////////////////////////////////////////////////////////////
+
+  ////////////////////////// GPU MATRIX shared memory //////////////////////////////////////
+  memcpy(gpu_matrix, A, (SIZE * sizeof(MyData)));
+  /* gpu_matrix is allocated on the GPU */
+  #pragma omp target enter data map(alloc: gpu_matrix[0:SIZE])
+
+  time = 0.0;
+  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
+    {
+      // restore data into gpu_matrix
+      #pragma omp target update to(gpu_matrix[0:SIZE])
+
+      double start = wall_time();
+      GPU_matrix_shared_no_loops(gpu_matrix);
+      time += (wall_time() - start);
+    }
+
+  #pragma omp target exit data map(from: gpu_matrix[0:SIZE])
+  check(matrix, gpu_matrix, "\t shared memory implementation without loops --->");
+  printf("\n\t GPU time %lg [s]\n", (time / LOOP));
+  //////////////////////////////////////////////////////////////////////////////////////////
+
+  // free CPU memory
+  free(gpu_matrix);
+  free(matrix);
+  free(A);
+
+  return EXIT_SUCCESS;
+}
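A small aside (added, not part of the patch): OpenMP already provides a portable wall-clock timer,
so the clock_gettime()-based helper above could equivalently be written with omp_get_wtime(), e.g.:

  #include <omp.h>

  const double start   = omp_get_wtime();    /* wall-clock seconds          */
  GPU_matrix_loops(gpu_matrix, A);           /* ... region being timed ...  */
  const double elapsed = omp_get_wtime() - start;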
diff --git a/cuda-omp/omp/6/classwork_omp b/cuda-omp/omp/6/classwork_omp
new file mode 100755
index 0000000000000000000000000000000000000000..6766c7bdf174daf8fcd104c4bc67d02ba4885384
Binary files /dev/null and b/cuda-omp/omp/6/classwork_omp differ
diff --git a/cuda-omp/omp/miscellaneous/map_type_modifier.c b/cuda-omp/omp/miscellaneous/map_type_modifier.c
new file mode 100644
index 0000000000000000000000000000000000000000..65396c9473bf51f774493ae8246652f41ccdbe29
--- /dev/null
+++ b/cuda-omp/omp/miscellaneous/map_type_modifier.c
@@ -0,0 +1,69 @@
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 29.07.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v map_type_modifier.c -o map_type_modifier_omp
+// - Run the code:
+//   $ export OMP_TARGET_OFFLOAD=mandatory
+//   $ ./map_type_modifier_omp
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <omp.h>
+
+#define N 8
+typedef float MyData;
+
+void allocate(      MyData **buffer,
+              const size_t   size)
+{
+  /* allocate the buffer on the host memory */
+  MyData *tmp = malloc(size * sizeof(**buffer));
+  assert(tmp != NULL);
+  *buffer = tmp;
+
+  /* allocate the corresponding buffer on the device memory */
+#pragma omp target enter data map(alloc: tmp[0:size])
+
+  return;
+}
+
+void print(const MyData *const A,
+           const int size)
+{
+  printf("\n");
+  for (size_t i=0 ; i<size ; i++)
+    printf("\n\t array[%zu] = %lg", i, A[i]);
+  printf("\n\n");
+
+  return;
+}
+
+int main()
+{
+  MyData *array = NULL;
+  allocate(&array, N);
+
+  /* init on the GPU */
+  #pragma omp target
+  {
+    for (size_t i=0 ; i<N ; i++)
+      array[i] = (MyData)(i);
+  }
+
+  /* fetch the device data back to the host */
+  #pragma omp target update from(array[0:N])
+  print(array, N);
+
+  /* free the device's memory */
+  #pragma omp target exit data map(delete: array[0:N])
+
+  /* free the host's memory */
+  free(array);
+
+  return 0;
+}
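A short note (added, not part of the patch) on the exit-data map types used above: 'release' only
decrements the reference count of the mapped buffer, whereas 'delete' forces the device copy to be
removed regardless of the current reference count, e.g.:

  /* both pair with a previous '#pragma omp target enter data map(alloc|to: array[0:N])' */
  #pragma omp target exit data map(release: array[0:N])  /* decrement the reference count    */
  #pragma omp target exit data map(delete:  array[0:N])  /* force removal of the device copy */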
diff --git a/cuda-omp/omp/miscellaneous/structure.c b/cuda-omp/omp/miscellaneous/structure.c
new file mode 100644
index 0000000000000000000000000000000000000000..c12b212f98c7a4d84a3c80829dcd2c305b47295e
--- /dev/null
+++ b/cuda-omp/omp/miscellaneous/structure.c
@@ -0,0 +1,82 @@
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 31.07.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v structure.c -o structure_omp
+// - Run the code:
+//   $ export OMP_TARGET_OFFLOAD=mandatory
+//   $ ./structure_omp
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <omp.h>
+
+#define N 8
+typedef double MyData;
+
+typedef struct my_span
+{
+  size_t  N;
+  MyData *ptr;
+} span;
+
+void allocate(      span  *my_struct,
+              const size_t size)
+{
+  span tmp;
+  /* allocate the buffer on the host memory */
+  tmp.ptr = (MyData *)malloc(size * sizeof(MyData));
+  assert(tmp.ptr != NULL);
+  tmp.N = size;
+
+  /* declare how an object of type 'span' has to be mapped on the device: */
+  /* both the struct itself and the buffer it points to (deep copy)       */
+  #pragma omp declare mapper(span tmp) map(from: tmp, tmp.ptr[0: tmp.N])
+
+  my_struct->ptr = tmp.ptr;
+  my_struct->N   = tmp.N;
+
+  return;
+}
+
+void print(const span *const A)
+{
+  printf("\n");
+  for (size_t i=0 ; i<A->N ; i++)
+    printf("\n\t array[%zu] = %lg", i, A->ptr[i]);
+  printf("\n\n");
+
+  return;
+}
+
+int main()
+{
+  span A, B;
+
+  allocate(&A, N);
+  allocate(&B, N);
+
+  /* init on the GPU */
+  #pragma omp target
+  {
+    for (size_t i=0 ; i<N ; i++)
+      {
+        A.ptr[i] = (MyData)(i);
+        B.ptr[i] = (MyData)(2 * i);
+      }
+  }
+
+  print(&A);
+  print(&B);
+
+  /* free the host's memory */
+  free(A.ptr);
+  free(B.ptr);
+
+  return 0;
+}
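For comparison with the 'declare mapper' approach above, a minimal sketch (added, not part of the
patch) of the same deep copy written with explicit map clauses; the file name 'manual_deep_copy.c'
and the compile line are illustrative and assume the same nvc -mp=gpu toolchain used above:

  /* manual_deep_copy.c                                       */
  /* $ nvc -mp=gpu manual_deep_copy.c -o manual_deep_copy     */
  #include <stdio.h>
  #include <stdlib.h>
  #include <assert.h>
  #include <omp.h>

  #define N 8
  typedef double MyData;

  typedef struct my_span
  {
    size_t  N;
    MyData *ptr;
  } span;

  int main()
  {
    span A;
    A.N   = N;
    A.ptr = (MyData *)malloc(N * sizeof(MyData));
    assert(A.ptr != NULL);

    /* map the struct itself ('to') and its buffer ('tofrom'):      */
    /* mapping both lets the runtime attach A.ptr on the device     */
    #pragma omp target map(to: A) map(tofrom: A.ptr[0:A.N])
    {
      for (size_t i=0 ; i<A.N ; i++)
        A.ptr[i] = (MyData)i;
    }

    /* the buffer has been copied back by the 'tofrom' map */
    for (size_t i=0 ; i<A.N ; i++)
      printf("\n\t array[%zu] = %lg", i, A.ptr[i]);
    printf("\n");

    free(A.ptr);

    return 0;
  }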