/* Vector addition distributed across every available OpenMP offload device.
 *
 * Each device computes one N_PER_DEV-element slice of C = A + B; the host
 * computes a reference result and check() validates the device output.
 * One host thread per device drives the offload (one target region each). */

#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <assert.h>

typedef int MyData;
#define N_PER_DEV 1000000   /* elements processed by each device            */
#define BLOCKSIZE 256       /* threads per team (CUDA thread-block analogue) */

#if (BLOCKSIZE < 32) || (BLOCKSIZE > 1024)
#error "32 <= BLOCKSIZE <= 1024"
#endif

#if (N_PER_DEV < BLOCKSIZE)
#error "N_PER_DEV >= BLOCKSIZE is required"
#endif

/* NDEBUG gates only the verbose printf traces below.  NOTE(review): it is
 * defined AFTER <assert.h> has been included, so assert() remains active;
 * only the "#if !defined(NDEBUG)" blocks are disabled.  Comment this out
 * to enable the per-element / per-kernel debug output. */
#define NDEBUG

/* Compare the host reference against the device result, element by element,
 * and print a verdict.
 *   vector_cpu : reference result computed on the host
 *   vector_gpu : result produced by the devices
 *   size       : number of elements to compare                              */
void check(const MyData *const restrict vector_cpu,
           const MyData *const restrict vector_gpu,
           const size_t size)
{
  int flag = 0;
  for (size_t i = 0; i < size; i++)
    {
#if !defined(NDEBUG)
      printf("\n\t vector_cpu[%zu] = %d - vector_gpu[%zu] = %d",
             i, vector_cpu[i], i, vector_gpu[i]);
#endif
      /* latch any mismatch but keep scanning, so the debug trace is complete */
      flag = ((vector_cpu[i] != vector_gpu[i]) ? 1 : flag);
    }

  if (flag)
    printf("\n\t Result wrong \n");
  else
    printf("\n\t Result OK \n");

  return;
}

/* Offload C[offset : offset+size] = A[...] + B[...] to device 'dev',
 * requesting 'nblocks' teams of BLOCKSIZE threads.
 *
 * num_teams/num_threads are only requests: the runtime may grant fewer.
 * The team-stride outer loop plus the clamped inner loop therefore
 * (a) cover every element even with fewer teams/threads than requested and
 * (b) never index past the mapped [offset:size] window when 'size' is not a
 * multiple of BLOCKSIZE.  (The original fixed per-team window made the last
 * team run past the end of the mapped region.)                              */
void VectorAdd(const MyData *const restrict A,
               const MyData *const restrict B,
               MyData *const restrict C,
               const int offset,
               const int size,
               const int dev,
               const int nblocks)
{
  #pragma omp target                                                  \
    teams num_teams(nblocks) thread_limit(BLOCKSIZE)                  \
    map(to: A[offset:size], B[offset:size]) map(from: C[offset:size]) \
    device(dev)
  {
    const int team   = omp_get_team_num();
    const int nteams = omp_get_num_teams();   /* actual, may be < nblocks */

    #pragma omp parallel num_threads(BLOCKSIZE)
    {
      const int localID  = omp_get_thread_num();
      const int nthreads = omp_get_num_threads(); /* actual, may be < BLOCKSIZE */

      /* Each team walks BLOCKSIZE-wide chunks with stride nteams*BLOCKSIZE;
       * within a chunk, threads stride by the team's actual thread count.
       * The chunk partition depends only on team/nteams, so it stays correct
       * even if teams are granted different thread counts. */
      for (int chunk = (team * BLOCKSIZE); chunk < size; chunk += (nteams * BLOCKSIZE))
        {
          /* clamp: last chunk may be partial */
          const int chunk_end = (((chunk + BLOCKSIZE) < size) ? (chunk + BLOCKSIZE) : size);

          for (int i = (chunk + localID); i < chunk_end; i += nthreads)
            C[offset + i] = A[offset + i] + B[offset + i];
        }

#if !defined(NDEBUG)
      if ((localID == 0) && (team == 0))
        printf("\n\t Device: %d - Teams: %d [requested: %d] - Thread per team: %d [requested: %d]",
               dev, nteams, nblocks, nthreads, BLOCKSIZE);
#endif
    } /* omp parallel */
  }   /* omp target */

  return;
}

int main()
{
  /* one N_PER_DEV-element slice per available offload device */
  const int NumDev = omp_get_num_devices();
  if (NumDev <= 0)
    {
      fprintf(stderr, "\n\t No offload device available \n");
      return EXIT_FAILURE;
    }

  /* global vector size */
  const int size = (NumDev * N_PER_DEV);

  /* single backing allocation for A, B, C_CPU (host reference) and C_GPU;
   * size_t arithmetic avoids int overflow when many devices are present */
  MyData *buffer = (MyData *)malloc((size_t)4 * (size_t)size * sizeof(MyData));
  assert(buffer != NULL);
  MyData *const restrict A     = buffer;
  MyData *const restrict B     = A + size;
  MyData *const restrict C_CPU = B + size;
  MyData *const restrict C_GPU = C_CPU + size;

  /* Serial initialization: rand() updates hidden global state and is not
   * thread-safe, so the original "#pragma omp parallel for simd" here was a
   * data race. */
  for (int i = 0; i < size; i++)
    {
      A[i]     = rand() % N_PER_DEV;
      B[i]     = rand() % N_PER_DEV;
      C_CPU[i] = A[i] + B[i];     /* host reference result */
    }

  /* one host thread drives each device */
  #pragma omp parallel num_threads(NumDev)
  {
    /* num_threads is only a request: bail out unless every device got a
     * dedicated driver thread */
    #pragma omp single
    {
      if (NumDev != omp_get_num_threads())
        exit(EXIT_FAILURE);
      else
        {
          printf("\n\t Using %d GPUs \n", NumDev);
          fflush(stdout);
        }
    } /* implicit barrier */

    const int tid     = omp_get_thread_num();
    const int offset  = (tid * N_PER_DEV);                         /* this device's slice */
    const int nblocks = ((N_PER_DEV + BLOCKSIZE - 1) / BLOCKSIZE); /* ceil-div */

    VectorAdd(A, B, C_GPU, offset, N_PER_DEV, tid, nblocks);
  } /* omp parallel */

  check(C_CPU, C_GPU, (size_t)size);

  free(buffer);

  return 0;
}