diff --git a/cuda-omp/omp/7/mat_mult.c b/cuda-omp/omp/7/mat_mult.c new file mode 100644 index 0000000000000000000000000000000000000000..8ad7fcd4c5ca812149749ffdfe12cbb98ab7ab32 --- /dev/null +++ b/cuda-omp/omp/7/mat_mult.c @@ -0,0 +1,185 @@ +//////////////////////////////////////////////////////////////////////////////////////////////// +// - Naive matrix multiplication algorithm +// for (size_t i=0 ; i<N ; i++) +// for (size_t j=0 ; j<N ; j++) +// for (size_t k=0 ; k<_N ; k++) +// C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j]; +// +//////////////////////////////////////////////////////////////////////////////////////////////// + +////////////////////////////////////////////////////////////////////////////////////////////////// +// Author: David Goz +// mail : david.goz@inaf.it +// date : 31.07.2024 +// code tested using nvhpc +// +// - Compile the code: +// $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork.c -o classwork_omp +// - Run the code: +// $ ./classwork_omp +////////////////////////////////////////////////////////////////////////////////////////////////// + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <time.h> +#include <assert.h> +#include <omp.h> +#include <string.h> + +#define N 512 +#define SIZE (N * N) // matrix size +typedef double MyData; // do not change + +#define LOOP 100 +#define NDEBUG + +double wall_time() +{ + struct timespec ts; + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); + const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9; + + return ret; +} + +void CPU_mat_mult(const MyData *const restrict A, + const MyData *const restrict B, + MyData *const restrict C, + const size_t size) +{ + for (size_t i=0 ; i<size ; i++) + for (size_t j=0 ; j<size ; j++) + for (size_t k=0 ; k<size ; k++) + C[(i * size) + j] += (A[(i * size) + k] * B[(k * size) + j]); + + return; +} + +void GPU_mat_mult(const MyData *const restrict A, + const MyData *const restrict B, + MyData *const 
restrict C, + const size_t size) +{ + #pragma omp target + { + #pragma omp teams distribute num_teams(size) + for (size_t i=0 ; i<size ; i++) + { + #pragma omp parallel for num_threads(size) + for (size_t j=0 ; j<size ; j++) + { + MyData value = (MyData)0; + for (size_t k=0 ; k<size ; k++) + value += (A[(i * size) + k] * B[(k * size) + j]); + + C[(i * size) + j] = value; + } // omp thread + } // omp teams + } // omp target + + return; +} + +void GPU_mat_mult_no_loops(const MyData *const restrict A, + const MyData *const restrict B, + MyData *const restrict C, + const size_t size) +{ + #pragma omp target + { + #pragma omp teams num_teams(size) + { + const size_t team_size = (size * omp_get_team_num()); + + #pragma omp parallel firstprivate(team_size) num_threads(size) + { + const size_t tid = omp_get_thread_num(); + MyData value = (MyData)0; + for (size_t k=0 ; k<size ; k++) + value += (A[team_size + k] * B[(k * size) + tid]); + + C[team_size + tid] = value; + } // omp threads + } // omp teams + } // omp target + + return; +} + +void check(const MyData *const __restrict__ cpu_matrix, + const MyData *const __restrict__ gpu_matrix) +{ + int flag; + for (size_t i=0 ; i<SIZE ; i++) + flag = ((cpu_matrix[i] != gpu_matrix[i]) ? 
1 : 0); + + if (!flag) + printf("\n\t Result OK"); + else + printf("\n\t Result wrong"); + + return; +} + +int main() +{ + double time; + MyData *buffer = (MyData *)calloc(4 * SIZE, sizeof(MyData)); + assert(buffer != NULL); + + // host reference matrix A + MyData *const restrict A = buffer; + MyData *const restrict B = A + SIZE; + MyData *const restrict C_CPU = B + SIZE; + MyData *const restrict C_GPU = C_CPU + SIZE; + for (size_t i=0 ; i<SIZE ; i++) + { + A[i] = drand48(); + B[i] = drand48(); + } + + ////////////////////////// CPU naive algorithm ////////////////////////////////////////// + CPU_mat_mult(A, B, C_CPU, N); + ///////////////////////////////////////////////////////////////////////////////////////// + + // copy/alloc data to the GPU + #pragma omp target enter data map(to: A[0:SIZE], B[0:SIZE]) map(alloc: C_GPU[0:SIZE]) + + /////////////////////////// GPU naive algorithm //////////////////////////////////////// + time = 0.0; + for (unsigned short int loop=0 ; loop<LOOP ; loop++) + { + const double start = wall_time(); + GPU_mat_mult(A, B, C_GPU, N); + time += (wall_time() - start); + } + + #pragma omp target update from(C_GPU[0:SIZE]) + check(C_CPU, C_GPU); + printf("\n\t GPU naive time %lg [s]\n", (time / LOOP)); + //////////////////////////////////////////////////////////////////////////////// + + /////////////////////////// GPU naive no loops algorithm //////////////////////////// + time = 0.0; + for (unsigned short int loop=0 ; loop<LOOP ; loop++) + { + const double start = wall_time(); + GPU_mat_mult_no_loops(A, B, C_GPU, N); + time += (wall_time() - start); + } + + #pragma omp target update from(C_GPU[0:SIZE]) + check(C_CPU, C_GPU); + printf("\n\t GPU naive no loops time %lg [s]\n", (time / LOOP)); + //////////////////////////////////////////////////////////////////////////////// + + // free CPU memory + free(buffer); + // free GPU memory + #pragma omp target exit data map(delete: A[0:SIZE], B[0:SIZE], C_CPU[0:SIZE]) + + printf("\n"); + + return 
EXIT_SUCCESS; +} diff --git a/cuda-omp/omp/7/mat_mult_block.c b/cuda-omp/omp/7/mat_mult_block.c new file mode 100644 index 0000000000000000000000000000000000000000..cd84ad1ee00f8f85afc523f058ceb3b6f29cb922 --- /dev/null +++ b/cuda-omp/omp/7/mat_mult_block.c @@ -0,0 +1,208 @@ +//////////////////////////////////////////////////////////////////////////////////////////////// +// - Block matrix multiplication algorithm +// +// const size_t Nblocks = (N / Bsize); +// +// // loop over blocks of matrix C +// for (size_t ib=0 ; ib<Nblocks ; ib++) +// { +// for (size_t jb=0 ; jb<Nblocks ; jb++) +// { +// +// // loop over blocks of rows of A +// for (size_t kb=0 ; kb<Nblocks ; kb++) +// +// +// +// for (size_t i=0 ; i<N ; i++) +// for (size_t j=0 ; j<N ; j++) +// for (size_t k=0 ; k<_N ; k++) +// C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j]; +// +// } // jb +// } // ib +// - Exploit shared-memory. +//////////////////////////////////////////////////////////////////////////////////////////////// + +////////////////////////////////////////////////////////////////////////////////////////////////// +// Author: David Goz +// mail : david.goz@inaf.it +// date : 31.07.2024 +// code tested using nvhpc +// +// - Compile the code: +// $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork.c -o classwork_omp +// - Run the code: +// $ ./classwork_omp +////////////////////////////////////////////////////////////////////////////////////////////////// + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <time.h> +#include <assert.h> +#include <omp.h> +#include <string.h> + +#define N 512 +#define SIZE (N * N) // matrix size +typedef double MyData; // do not change +#define BLOCKSIZE 32 // number of threads per block + +// sanity check +#if BLOCKSIZE > 1024 +#error BLOCKSIZE cannot be larger than 1024 +#endif + +#define LOOP 100 +#define NDEBUG + +double wall_time() +{ + struct timespec ts; + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); + 
const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9; + + return ret; +} + +void CPU_mat_mult(const MyData *const restrict A, + const MyData *const restrict B, + MyData *const restrict C, + const size_t size) +{ + for (size_t i=0 ; i<size ; i++) + for (size_t j=0 ; j<size ; j++) + for (size_t k=0 ; k<size ; k++) + C[(i * size) + j] += (A[(i * size) + k] * B[(k * size) + j]); + + return; +} + +void GPU_mat_mult(const MyData *const restrict A, + const MyData *const restrict B, + MyData *const restrict C, + const size_t size) +{ + #pragma omp target + { + #pragma omp teams distribute num_teams(size) + for (size_t i=0 ; i<size ; i++) + { + #pragma omp parallel for num_threads(size) + for (size_t j=0 ; j<size ; j++) + { + MyData value = (MyData)0; + for (size_t k=0 ; k<size ; k++) + value += (A[(i * size) + k] * B[(k * size) + j]); + + C[(i * size) + j] = value; + } // omp thread + } // omp teams + } // omp target + + return; +} + +void GPU_mat_mult_no_loops(const MyData *const restrict A, + const MyData *const restrict B, + MyData *const restrict C, + const size_t size) +{ + #pragma omp target + { + #pragma omp teams num_teams(size) + { + const size_t team_size = (size * omp_get_team_num()); + + #pragma omp parallel firstprivate(team_size) num_threads(size) + { + const size_t tid = omp_get_thread_num(); + MyData value = (MyData)0; + for (size_t k=0 ; k<size ; k++) + value += (A[team_size + k] * B[(k * size) + tid]); + + C[team_size + tid] = value; + } // omp threads + } // omp teams + } // omp target + + return; +} + +void check(const MyData *const __restrict__ cpu_matrix, + const MyData *const __restrict__ gpu_matrix) +{ + int flag; + for (size_t i=0 ; i<SIZE ; i++) + flag = ((cpu_matrix[i] != gpu_matrix[i]) ? 
1 : 0); + + if (!flag) + printf("\n\t Result OK"); + else + printf("\n\t Result wrong"); + + return; +} + +int main() +{ + double time; + MyData *buffer = (MyData *)calloc(4 * SIZE, sizeof(MyData)); + assert(buffer != NULL); + + // host reference matrix A + MyData *const restrict A = buffer; + MyData *const restrict B = A + SIZE; + MyData *const restrict C_CPU = B + SIZE; + MyData *const restrict C_GPU = C_CPU + SIZE; + for (size_t i=0 ; i<SIZE ; i++) + { + A[i] = drand48(); + B[i] = drand48(); + } + + ////////////////////////// CPU naive algorithm ////////////////////////////////////////// + CPU_mat_mult(A, B, C_CPU, N); + ///////////////////////////////////////////////////////////////////////////////////////// + + // copy/alloc data to the GPU + #pragma omp target enter data map(to: A[0:SIZE], B[0:SIZE]) map(alloc: C_GPU[0:SIZE]) + + /////////////////////////// GPU naive algorithm //////////////////////////////////////// + time = 0.0; + for (unsigned short int loop=0 ; loop<LOOP ; loop++) + { + const double start = wall_time(); + GPU_mat_mult(A, B, C_GPU, N); + time += (wall_time() - start); + } + + #pragma omp target update from(C_GPU[0:SIZE]) + check(C_CPU, C_GPU); + printf("\n\t GPU naive time %lg [s]\n", (time / LOOP)); + //////////////////////////////////////////////////////////////////////////////// + + /////////////////////////// GPU naive no loops algorithm //////////////////////////// + time = 0.0; + for (unsigned short int loop=0 ; loop<LOOP ; loop++) + { + const double start = wall_time(); + GPU_mat_mult_no_loops(A, B, C_GPU, N); + time += (wall_time() - start); + } + + #pragma omp target update from(C_GPU[0:SIZE]) + check(C_CPU, C_GPU); + printf("\n\t GPU naive no loops time %lg [s]\n", (time / LOOP)); + //////////////////////////////////////////////////////////////////////////////// + + // free CPU memory + free(buffer); + // free GPU memory + #pragma omp target exit data map(delete: A[0:SIZE], B[0:SIZE], C_CPU[0:SIZE]) + + printf("\n"); + + return 
EXIT_SUCCESS; +} diff --git a/cuda-omp/omp/miscellaneous/structure.c b/cuda-omp/omp/miscellaneous/structure.c index c12b212f98c7a4d84a3c80829dcd2c305b47295e..fc0041c02e55eb438685ab113ca2da41b7b658a0 100644 --- a/cuda-omp/omp/miscellaneous/structure.c +++ b/cuda-omp/omp/miscellaneous/structure.c @@ -17,7 +17,8 @@ #include <assert.h> #include <omp.h> -#define N 8 +#define SIZE 8 +#define SIZE_2 (SIZE / 2) typedef double MyData; typedef struct my_span @@ -29,26 +30,20 @@ typedef struct my_span void allocate( span *my_struct, const size_t size) { - span tmp; /* allocate the buffer on the host memory */ - tmp.ptr = (MyData *)malloc(size * sizeof(MyData)); - assert(tmp.ptr != NULL); - tmp.N = size; - - /* declare how the object 'span' has to be mapped on the device */ - #pragma omp declare mapper(span tmp) map(from: tmp, tmp.ptr[0: tmp.N]) - - my_struct->ptr = tmp.ptr; - my_struct->N = tmp.N; + my_struct->ptr = (MyData *)calloc(size, sizeof(MyData)); + assert(my_struct->ptr != NULL); + my_struct->N = size; return; } -void print(const span *const A) +void print(const span *const A, + const char *const string) { printf("\n"); - for (size_t i=0 ; i<A->N ; i++) - printf("\n\t array[%i] = %lg", i, A->ptr[i]); + for (int i=0 ; i<A->N ; i++) + printf("\n\t %s[%d] = %lg", string, i, A->ptr[i]); printf("\n\n"); return; @@ -56,27 +51,41 @@ void print(const span *const A) int main() { - span A, B; + span A, B, C; - allocate(&A, N); - allocate(&B, N); + allocate(&A, SIZE); + allocate(&B, SIZE); + allocate(&C, SIZE); + + /* declare how the object 'span' has to be mapped on the device */ + #pragma omp declare mapper(all : span S) map(to: S) map(from: S.ptr[0 : S.N]) + #pragma omp declare mapper(left : span S) map(to: S) map(from: S.ptr[0 : S.N/2]) + #pragma omp declare mapper(right: span S) map(to: S) map(from: S.ptr[S.N/2: S.N/2]) /* init on the GPU */ - #pragma omp target +#pragma omp target map(mapper(all): A) map(mapper(left): B) map(mapper(right): C) { - for (size_t i=0 ; i<N ; 
i++) - { - A.ptr[i] = (MyData)(i); - B.ptr[i] = (MyData)(2 * i); - } + #pragma omp loop + for (size_t i=0 ; i<SIZE ; i++) + A.ptr[i] = (MyData)(i); + + #pragma omp loop + for (size_t ii=0 ; ii<SIZE_2 ; ii++) + B.ptr[ii] = (MyData)(ii); + + #pragma omp loop + for (size_t iii=SIZE_2 ; iii<SIZE ; iii++) + C.ptr[iii] = (MyData)(iii); } - print(&A); - print(&B); + print(&A, "A"); + print(&B, "B"); + print(&C, "C"); /* free the host's memory */ free(A.ptr); free(B.ptr); + free(C.ptr); return 0; } diff --git a/jacobi/serial/not_opt/jacobi_serial_not_opt_len14 b/jacobi/serial/not_opt/jacobi_serial_not_opt_len14 new file mode 100755 index 0000000000000000000000000000000000000000..f546329f4a6d492f2c2a4495e5a65dbeb9ea452d Binary files /dev/null and b/jacobi/serial/not_opt/jacobi_serial_not_opt_len14 differ
// NOTE(review): the structure.c hunk above is a patch against a pre-existing
// file (context lines only partially visible), so it is left byte-identical.
// Review observations on its post-image:
// - print() now iterates with 'int i' but compares against 'A->N' (a size_t
//   field) -- harmless for SIZE 8, but a signed/unsigned comparison warning;
//   presumably 'size_t' was intended. TODO confirm.
// - The three named 'declare mapper' directives (all/left/right) are placed
//   inside main(); confirm the OpenMP version targeted by nvc accepts named
//   mappers declared at block scope before relying on this.
// - The 'right' mapper transfers only S.ptr[S.N/2 : S.N/2] (upper half) back
//   from the device; the lower half of C stays at its calloc'd zeros on the
//   host -- looks intentional, verify against the expected output.
// NOTE(review): the final entry commits a compiled executable
// (jacobi/serial/not_opt/jacobi_serial_not_opt_len14) into version control;
// build artifacts should be removed and .gitignore'd, not committed.