Skip to content
Snippets Groups Projects
Commit 31d1aa2e authored by David Goz
Browse files

omp/miscellaneous/multiple_device example

parent aa704d3b
Branches
No related tags found
No related merge requests found
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <assert.h>
// Element type of the vectors that are added.
typedef int MyData;
// Number of vector elements assigned to each device.
#define N_PER_DEV 1000000
// Number of consecutive elements handled by one team; also the value passed
// to thread_limit() and num_threads() in VectorAdd.
#define BLOCKSIZE 256
// Compile-time sanity checks on the configuration above.
#if (BLOCKSIZE < 32) || (BLOCKSIZE > 1024)
#error "32 <= BLOCKSIZE <= 1024"
#endif
#if (N_PER_DEV < BLOCKSIZE)
#error "N_PER_DEV < BLOCKSIZE"
#endif
// NOTE: NDEBUG is defined after <assert.h> has already been included, so it
// does not disable assert(). In this file it only compiles out the
// "#if !defined(NDEBUG)" debug printf blocks; remove the define to enable them.
#define NDEBUG
/*
 * Compare the host reference result against the result computed on the
 * devices and report the outcome on stdout.
 *
 * vector_cpu : reference values.
 * vector_gpu : values to validate.
 * size       : number of elements to compare in both vectors.
 *
 * Prints "Result OK" when every element matches, "Result wrong" otherwise.
 * All elements are always visited, so that the optional debug dump (enabled
 * when NDEBUG is not defined) lists the complete vectors.
 */
void check(const MyData *const restrict vector_cpu,
           const MyData *const restrict vector_gpu,
           const size_t size)
{
  int mismatch = 0;
  size_t idx = 0;

  while (idx < size)
    {
#if !defined(NDEBUG)
      printf("\n\t vector_cpu[%zu] = %d - vector_gpu[%zu] = %d",
             idx, vector_cpu[idx], idx, vector_gpu[idx]);
#endif

      // sticky flag: once a difference is seen it stays set
      if (vector_cpu[idx] != vector_gpu[idx])
        mismatch = 1;

      ++idx;
    }

  fputs(mismatch ? "\n\t Result wrong \n" : "\n\t Result OK \n", stdout);
}
/*
 * Offload C[i] = A[i] + B[i] for i in [offset, offset + size) to one device.
 *
 * A, B    : input vectors; only the slice [offset, offset + size) is mapped
 *           to the device.
 * C       : output vector; the same slice is mapped back from the device.
 * offset  : index of the first element handled by this call.
 * size    : number of elements handled by this call.
 * dev     : device number given to the device() clause.
 * nblocks : number of teams requested; every team handles one chunk of
 *           BLOCKSIZE consecutive elements.
 *
 * NOTE(review): each team processes exactly one chunk, so covering the whole
 * slice relies on the runtime creating the `nblocks` teams that are requested.
 * The debug print below reports requested versus actual team count.
 */
void VectorAdd(const MyData *const restrict A,
               const MyData *const restrict B,
               MyData *const restrict C,
               const int offset,
               const int size,
               const int dev,
               const int nblocks)
{
#pragma omp target \
teams num_teams(nblocks) thread_limit(BLOCKSIZE) \
map(to: A[offset:size], B[offset:size]) map(from: C[offset:size]) \
device(dev)
  {
    // one past the last element of the mapped slice
    const int slice_end = offset + size;

    const int team = omp_get_team_num();
    const int team_start_index = (team * BLOCKSIZE) + offset;
    const int team_chunk_end = team_start_index + BLOCKSIZE;
    // When size is not a multiple of BLOCKSIZE the last chunk is partial:
    // clamp it so that no thread touches memory outside the mapped slice.
    const int team_end_index = ((team_chunk_end < slice_end) ? team_chunk_end : slice_end);

#pragma omp parallel num_threads(BLOCKSIZE)
    {
      const int localID = omp_get_thread_num();
      // actual number of threads in the team (may be lower than BLOCKSIZE);
      // used as stride so the chunk is still fully covered
      const int block = omp_get_num_threads();

      const int globalID = team_start_index + localID;
      for (int index=globalID ; index<team_end_index ; index+=block)
        C[index] = A[index] + B[index];

#if !defined(NDEBUG)
      if ((localID == 0) && (team == 0))
        printf("\n\t Device: %d - Teams: %d [requested: %d]- Thread per team: %d [requested: %d]",
               dev, omp_get_num_teams(), nblocks, block, BLOCKSIZE);
#endif
    } // omp parallel
  } // omp target

  return;
}
/*
 * Driver of the multi-device example.
 *
 * - queries the number of available devices;
 * - allocates one buffer holding four vectors (A, B, C_CPU, C_GPU) of
 *   NumDev * N_PER_DEV elements each;
 * - fills A and B with pseudo-random values and computes the reference
 *   sum C_CPU on the host;
 * - spawns one host thread per device, each one offloading its own
 *   N_PER_DEV-wide slice through VectorAdd();
 * - compares C_GPU against C_CPU with check().
 *
 * Returns EXIT_FAILURE when no device is available, when the allocation
 * fails or when the host team does not have one thread per device;
 * 0 otherwise (the outcome of the comparison is only printed).
 */
int main(void)
{
  // get the number of the available devices
  const int NumDev = omp_get_num_devices();
  if (NumDev <= 0)
    {
      fprintf(stderr, "\n\t No target device available \n");
      return EXIT_FAILURE;
    }

  // global vector size
  const int size = (NumDev * N_PER_DEV);

  // the element count is computed in size_t to avoid an int overflow in '4 * size'
  MyData *buffer = malloc((size_t)4 * (size_t)size * sizeof(*buffer));
  if (buffer == NULL)
    {
      fprintf(stderr, "\n\t Cannot allocate the host buffer \n");
      return EXIT_FAILURE;
    }

  MyData *const restrict A = buffer;
  MyData *const restrict B = A + size;
  MyData *const restrict C_CPU = B + size;
  MyData *const restrict C_GPU = C_CPU + size;

  // rand() relies on hidden global state and is not required to be free of
  // data races, hence the inputs are generated by a single thread
  for (int i=0 ; i<size ; i++)
    {
      A[i] = rand() % N_PER_DEV;
      B[i] = rand() % N_PER_DEV;
      C_CPU[i] = A[i] + B[i];
    }

  // set to 0 when the runtime does not provide one host thread per device
  int team_ok = 1;

#pragma omp parallel num_threads(NumDev)
  {
    // check
#pragma omp single
    {
      if (NumDev != omp_get_num_threads())
        team_ok = 0;
      else
        {
          printf("\n\t Using %d GPUs \n", NumDev);
          fflush(stdout);
        }
    } // implicit barrier: team_ok is visible to every thread from here on

    if (team_ok)
      {
        // thread 'tid' drives device 'tid' and owns the slice starting at 'offset'
        const int tid = omp_get_thread_num();
        const int offset = (tid * N_PER_DEV);
        const int nblocks = ((N_PER_DEV + BLOCKSIZE - 1) / BLOCKSIZE);
        VectorAdd(A, B, C_GPU, offset, N_PER_DEV, tid, nblocks);
      }
  } // omp parallel

  if (!team_ok)
    {
      // bail out outside the parallel region, after releasing the buffer
      fprintf(stderr, "\n\t Cannot spawn one host thread per device \n");
      free(buffer);
      return EXIT_FAILURE;
    }

  check(C_CPU, C_GPU, size);
  free(buffer);
  return 0;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment