Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
HPC_SCHOOL_2024
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
HPC_SCHOOL
HPC_SCHOOL_2024
Commits
31d1aa2e
Commit
31d1aa2e
authored
11 months ago
by
David Goz
Browse files
Options
Downloads
Patches
Plain Diff
omp/miscellaneous/multiple_device example
parent
aa704d3b
Branches
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
cuda-omp/omp/miscellaneous/multiple_devices.c
+132
-0
132 additions, 0 deletions
cuda-omp/omp/miscellaneous/multiple_devices.c
with
132 additions
and
0 deletions
cuda-omp/omp/miscellaneous/multiple_devices.c
0 → 100644
+
132
−
0
View file @
31d1aa2e
#include
<stdio.h>
#include
<stdlib.h>
#include
<omp.h>
#include
<assert.h>
/* Element type of the vectors added on the host and on the devices.
 * The debug output in this file prints elements with %d. */
typedef int MyData;
/* Number of vector elements handled by each device. */
#define N_PER_DEV 1000000

/* Number of threads requested per team; also the chunk of elements a team
 * processes at a time. */
#define BLOCKSIZE 256

/* Compile-time sanity checks on the configuration above. */
#if !((BLOCKSIZE >= 32) && (BLOCKSIZE <= 1024))
#error "32 <= BLOCKSIZE <= 1024"
#endif

#if !(N_PER_DEV >= BLOCKSIZE)
#error "N_PER_DEV < BLOCKSIZE"
#endif

/* Gates the debug printf sections guarded by `#if !defined(NDEBUG)`.
 * It is defined after <assert.h> was included, so it does not turn
 * assert() off. */
#define NDEBUG
/*
 * Compare the host reference result with the result computed on the devices.
 *
 * vector_cpu : reference values
 * vector_gpu : values to validate
 * size       : number of elements in both vectors
 *
 * Prints "Result OK" when every element matches, "Result wrong" otherwise.
 * Unless NDEBUG is defined, every pair of elements is printed as well, so
 * the scan always walks the whole range.
 */
void check(const MyData *const restrict vector_cpu,
           const MyData *const restrict vector_gpu,
           const size_t                 size)
{
  const MyData *cpu = vector_cpu;
  const MyData *gpu = vector_gpu;
  int mismatch = 0;

  for (size_t pos = 0; pos != size; ++pos, ++cpu, ++gpu)
    {
#if !defined(NDEBUG)
      printf("\n\tvector_cpu[%zu] = %d - vector_gpu[%zu] = %d",
             pos, *cpu, pos, *gpu);
#endif

      /* sticky flag: once a difference is seen it stays set */
      mismatch |= (*cpu != *gpu);
    }

  fputs(mismatch ? "\n\tResult wrong\n" : "\n\tResult OK\n", stdout);
}
/*
 * Element-wise vector addition C = A + B on one target device.
 *
 * A, B    : input vectors (host pointers); only [offset, offset + size) is mapped
 * C       : output vector (host pointer);  only [offset, offset + size) is mapped
 * offset  : index of the first element handled by this call
 * size    : number of elements handled by this call
 * dev     : OpenMP device number the target region is offloaded to
 * nblocks : number of teams requested
 *
 * Each team processes chunks of BLOCKSIZE consecutive elements. Chunks are
 * clamped to offset + size, so the last (partial) chunk never touches memory
 * outside the mapped array sections, and they are distributed with a stride
 * of the number of teams actually granted, so every element is computed even
 * if the runtime provides fewer teams or threads than requested.
 */
void VectorAdd(const MyData *const restrict A,
               const MyData *const restrict B,
                     MyData *const restrict C,
               const int                    offset,
               const int                    size,
               const int                    dev,
               const int                    nblocks)
{
  /* nothing to do; also avoids a non-positive num_teams() argument */
  if ((size <= 0) || (nblocks <= 0))
    return;

#pragma omp target                                                  \
  teams num_teams(nblocks) thread_limit(BLOCKSIZE)                  \
  map(to: A[offset:size], B[offset:size]) map(from: C[offset:size]) \
  device(dev)
  {
    const int team        = omp_get_team_num();
    const int nteams      = omp_get_num_teams();
    /* one past the last index that is mapped on the device */
    const int end_index   = offset + size;
    /* distance between two consecutive chunks of the same team */
    const int team_stride = nteams * BLOCKSIZE;

#pragma omp parallel num_threads(BLOCKSIZE)
    {
      const int localID = omp_get_thread_num();
      const int block   = omp_get_num_threads();

      for (int chunk_start = offset + (team * BLOCKSIZE);
           chunk_start < end_index;
           chunk_start += team_stride)
        {
          /* clamp the chunk to the mapped range */
          const int chunk_limit = chunk_start + BLOCKSIZE;
          const int chunk_end   = (chunk_limit < end_index) ? chunk_limit : end_index;

          /* stride by the granted thread count, which may be < BLOCKSIZE */
          for (int index = chunk_start + localID; index < chunk_end; index += block)
            C[index] = A[index] + B[index];
        }

#if !defined(NDEBUG)
      if ((localID == 0) && (team == 0))
        printf("\n\tDevice: %d - Teams: %d [requested: %d]- Thread per team: %d [requested: %d]",
               dev, nteams, nblocks, block, BLOCKSIZE);
#endif
    } // omp parallel
  } // omp target

  return;
}
/*
 * Adds two vectors using every available OpenMP target device.
 *
 * The global vectors hold N_PER_DEV elements per device. One host thread is
 * started per device; thread `tid` offloads the slice
 * [tid * N_PER_DEV, (tid + 1) * N_PER_DEV) to device `tid`. The device result
 * is then compared with a reference computed on the host.
 *
 * Returns 0 on completion, EXIT_FAILURE when no device is available, the
 * allocation fails, or the requested number of host threads is not granted.
 */
int main(void)
{
  // get the number of the available devices
  const int NumDev = omp_get_num_devices();
  if (NumDev <= 0)
    {
      fprintf(stderr, "\n\tNo target device available\n");
      return EXIT_FAILURE;
    }

  // global vector size
  const int size = (NumDev * N_PER_DEV);

  // single allocation holding A | B | C_CPU | C_GPU;
  // the byte count is computed in size_t to avoid int overflow
  MyData *buffer = malloc((size_t)4 * (size_t)size * sizeof(*buffer));
  if (buffer == NULL)
    {
      fprintf(stderr, "\n\tCannot allocate the host buffer\n");
      return EXIT_FAILURE;
    }

  MyData *const restrict A     = buffer;
  MyData *const restrict B     = A + size;
  MyData *const restrict C_CPU = B + size;
  MyData *const restrict C_GPU = C_CPU + size;

  // rand() is not required to be thread-safe: fill the inputs serially
  for (int i = 0; i < size; i++)
    {
      A[i] = rand() % N_PER_DEV;
      B[i] = rand() % N_PER_DEV;
    }

  // host reference result
#pragma omp parallel for simd
  for (int i = 0; i < size; i++)
    C_CPU[i] = A[i] + B[i];

#pragma omp parallel num_threads(NumDev)
  {
    // check that one host thread per device has been granted
#pragma omp single
    {
      if (NumDev != omp_get_num_threads())
        {
          fprintf(stderr, "\n\tCannot start one host thread per device\n");
          exit(EXIT_FAILURE);
        }
      else
        {
          printf("\n\tUsing %d GPUs\n", NumDev);
          fflush(stdout);
        }
    } // implicit barrier

    // the host thread ID is also used as the device number
    const int tid     = omp_get_thread_num();
    const int offset  = (tid * N_PER_DEV);
    // teams needed to cover N_PER_DEV elements (rounded up)
    const int nblocks = ((N_PER_DEV + BLOCKSIZE - 1) / BLOCKSIZE);

    VectorAdd(A, B, C_GPU, offset, N_PER_DEV, tid, nblocks);
  } // omp parallel

  check(C_CPU, C_GPU, size);

  free(buffer);

  return 0;
}
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment