Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
H
HPC_Imaging
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Claudio Gheller
HPC_Imaging
Commits
5f3d8132
Commit
5f3d8132
authored
Mar 30, 2022
by
Nandhana Sakhtivel
Browse files
Options
Downloads
Patches
Plain Diff
Adding numa files
parent
21f66668
No related branches found
No related tags found
No related merge requests found
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
numa.c
+393
-0
393 additions, 0 deletions
numa.c
numa.h
+52
-0
52 additions, 0 deletions
numa.h
with
445 additions
and
0 deletions
numa.c
0 → 100644
+
393
−
0
View file @
5f3d8132
#include
"allvars.h"
#include
"proto.h"
#include
<stdio.h>
#include
<unistd.h>
#include
<limits.h>
// Per-process NUMA/topology map of the calling task (filled by init_numa()).
map_t Me;

// One communicator per hierarchy level (NUMA, ISLAND, myHOST, HOSTS, WORLD).
MPI_Comm COMM[HLEVELS];

// Printable names for the hierarchy levels, indexed by the level macros.
char *LEVEL_NAMES[HLEVELS] = { "NUMA", "ISLAND", "myHOST", "HOSTS", "WORLD" };

// Size (in bytes) of the shared window owned by each host master;
// set in init_numa().
// FIX: the original contained a second, redundant tentative definition
// of this variable; it has been removed.
MPI_Aint win_host_master_size = 0;

// NOTE(review): the double underscore looks like a typo for "win_host_size"
// and this variable is never referenced in this file nor declared in
// numa.h -- kept for link compatibility, TODO confirm it can be removed.
MPI_Aint win_host__size = 0;

// Shared-memory window exposed by the master task of each host,
// plus the addressing info returned by MPI_Win_shared_query.
MPI_Win  win_host_master;
int      win_host_master_disp;   // displacement unit of the host-master window
void    *win_host_master_ptr;    // local base address of the host-master window

int build_numa_mapping(int Rank, int Size, MPI_Comm *MYWORLD, map_t *Me);
int map_hostnames(MPI_Comm *MY_WORLD, int Rank, int Ntasks, map_t *me);
int get_cpu_id(void);
int compare_string_int_int(const void *A, const void *B);
/*
 * Set up the NUMA/host topology for the calling task and create the
 * persistent shared-memory windows.
 *
 * Rank, Size : rank and size of the calling task in MYWORLD
 * MYWORLD    : the top-level communicator (usually MPI_COMM_WORLD)
 * Me         : per-task topology map to be filled
 *
 * Returns 0 on success, -1 on allocation failure.
 */
int init_numa(int Rank, int Size, MPI_Comm *MYWORLD, map_t *Me)
{
  // build up the numa hierarchy
  // FIX: the original called build_numa_mapping(rank, size, ...) with
  // undeclared lowercase names instead of the Rank/Size parameters
  build_numa_mapping(Rank, Size, MYWORLD, Me);

  // initialize sizes for the persistent shared windows (defaults are in MB)
  win_host_master_size = (MPI_Aint)WIN_HOST_MASTER_SIZE_DFLT * 1024 * 1024;
  MPI_Aint win_host_size = (MPI_Aint)WIN_HOST_SIZE_DFLT * 1024 * 1024;

  // initialize the persistent shared windows on the shared-memory level
  int SHMEMl = Me->SHMEMl;

  MPI_Info winfo;
  MPI_Info_create(&winfo);
  MPI_Info_set(winfo, "alloc_shared_noncontig", "true");

  Me->win.size = win_host_size;
  MPI_Win_allocate_shared(Me->win.size, 1, winfo, *Me->COMM[SHMEMl],
                          &(Me->win.ptr), &(Me->win.win));

  // only the host master (rank 0 at the shared-memory level) actually
  // contributes memory to the host-master window
  MPI_Aint size = (Me->Rank[SHMEMl] == 0 ? win_host_master_size : 0);
  MPI_Win_allocate_shared(size, 1, winfo, *Me->COMM[SHMEMl],
                          &win_host_master_ptr, &win_host_master);

  MPI_Info_free(&winfo);   // FIX: the info object was leaked

  Me->swins = malloc(Me->Ntasks[SHMEMl] * sizeof *Me->swins);
  if (Me->swins == NULL)   // FIX: the allocation result was not checked
    return -1;

  // get the addresses of all the windows from my siblings
  // at my shared-memory level
  for (int t = 0; t < Me->Ntasks[SHMEMl]; t++)
    if (t != Me->Rank[SHMEMl])
      MPI_Win_shared_query(Me->win.win, t,
                           &(Me->swins[t].size),
                           &(Me->swins[t].disp),
                           &(Me->swins[t].ptr));

  // non-master tasks retrieve the base address of the host-master window
  if (Me->Rank[SHMEMl] != 0)
    MPI_Win_shared_query(win_host_master, 0,
                         &(win_host_master_size),
                         &win_host_master_disp,
                         &win_host_master_ptr);

  return 0;   // FIX: the function had no return statement
}
/*
 * Release every resource acquired by init_numa().
 * Window frees are collective over the tasks that created them.
 *
 * Returns 0 on success.
 */
int shutdown_numa(int Rank, int Size, MPI_Comm *MYWORLD, map_t *Me)
{
  // free every shared memory and window
  MPI_Win_free(&(Me->win.win));
  MPI_Win_free(&win_host_master);   // FIX: this window was never released

  // free all the structures if needed; NULL the pointers so a stray
  // later use fails loudly instead of touching freed memory
  free(Me->Ranks_to_host);
  Me->Ranks_to_host = NULL;

  free(Me->swins);
  Me->swins = NULL;

  return 0;   // FIX: missing return in a non-void function
}
/*
 * Build the communicator hierarchy (WORLD -> HOSTS / myHOST -> NUMA)
 * and fill the map_t structure of the calling task.
 *
 * Rank, Size : rank and size of the calling task in MYWORLD
 * MYWORLD    : the top-level communicator
 * Me         : per-task topology map to be filled
 *
 * Returns 0 on success, -1 (on every task) if the shared-memory level
 * is not the same across all tasks.
 */
int build_numa_mapping(int Rank, int Size, MPI_Comm *MYWORLD, map_t *Me)
{
  COMM[WORLD]       = *MYWORLD;
  Me->Ntasks[WORLD] = Size;
  Me->Rank[WORLD]   = Rank;
  Me->COMM[WORLD]   = &COMM[WORLD];
  Me->mycpu         = get_cpu_id();

  // --- find how many hosts we are running on;
  //     that is needed to build the communicator
  //     among the masters of each host
  map_hostnames(&COMM[WORLD], Rank, Size, Me);

  Me->MAXl = (Me->Nhosts > 1 ? HOSTS : myHOST);

  // --- create the communicator for each host
  MPI_Comm_split(COMM[WORLD], Me->myhost, Me->Rank[WORLD], &COMM[myHOST]);
  // NOTE: from here on the local Rank / Size variables are reused and
  //       refer to the host-level communicator, not to WORLD
  MPI_Comm_size(COMM[myHOST], &Size);
  MPI_Comm_rank(COMM[myHOST], &Rank);
  Me->COMM[myHOST]   = &COMM[myHOST];
  Me->Rank[myHOST]   = Rank;
  Me->Ntasks[myHOST] = Size;

  // --- create the communicator for the masters of each host
  int Im_host_master = (Me->Rank[myHOST] == 0);
  MPI_Comm_split(COMM[WORLD], Im_host_master, Me->Rank[WORLD], &COMM[HOSTS]);
  //
  // NOTE: by default, the Rank 0 in WORLD is also Rank 0 in HOSTS
  //
  if (Im_host_master)
    {
      Me->COMM[HOSTS]   = &COMM[HOSTS];
      Me->Ntasks[HOSTS] = Me->Nhosts;
      MPI_Comm_rank(COMM[HOSTS], &(Me->Rank[HOSTS]));
    }
  else
    {
      Me->COMM[HOSTS]   = NULL;
      Me->Ntasks[HOSTS] = 0;
      Me->Rank[HOSTS]   = -1;
    }

  // --- create the communicator for the numa node
  MPI_Comm_split_type(COMM[myHOST], MPI_COMM_TYPE_SHARED, Me->Rank[myHOST],
                      MPI_INFO_NULL, &COMM[NUMA]);
  Me->COMM[NUMA] = &COMM[NUMA];
  MPI_Comm_size(COMM[NUMA], &(Me->Ntasks[NUMA]));
  MPI_Comm_rank(COMM[NUMA], &(Me->Rank[NUMA]));

  // check whether NUMA == myHOST and determine the maximum level
  // of shared memory in the topology
  if (Me->Ntasks[NUMA] == Me->Ntasks[myHOST])
    {
      // collapse levels from NUMA to myHOST;
      // equating to NUMA as we know the rank better via MPI_SHARED
      Me->Ntasks[ISLAND] = Me->Ntasks[NUMA];
      Me->Rank[ISLAND]   = Me->Rank[NUMA];
      Me->COMM[ISLAND]   = Me->COMM[NUMA];
      Me->Rank[myHOST]   = Me->Rank[NUMA];
      Me->COMM[myHOST]   = Me->COMM[NUMA];
      Me->SHMEMl         = myHOST;
    }
  else
    {
      // actually we do not care for this case at this moment
      // FIX: report the WORLD rank, not the (reused) host-level Rank
      printf(">>> It seems that rank %d belongs to a node for which "
             " the node topology does not coincide\n", Me->Rank[WORLD]);
      Me->SHMEMl = NUMA;
    }

  // verify that every task agrees on the shared-memory level
  int check_SHMEM_level = 1;
  int globalcheck_SHMEM_level;
  int globalmax_SHMEM_level;

  MPI_Allreduce(&(Me->SHMEMl), &globalmax_SHMEM_level, 1,
                MPI_INT, MPI_MAX, *MYWORLD);

  check_SHMEM_level = ((Me->SHMEMl == myHOST) &&
                       (globalmax_SHMEM_level == Me->SHMEMl));

  // FIX: use MPI_MIN so the check fails if ANY task disagrees;
  // with MPI_MAX a single conforming task masked every failure
  MPI_Allreduce(&check_SHMEM_level, &globalcheck_SHMEM_level, 1,
                MPI_INT, MPI_MIN, *MYWORLD);

  if (globalcheck_SHMEM_level < 1)
    {
      if (Me->Rank[WORLD] == 0)
        printf("There was an error in determining the topology hierarchy, "
               "SHMEM level is different for different MPI tasks\n");
      // FIX: every task must report the failure, not only rank 0
      return -1;
    }

  return 0;
}
int map_hostnames(MPI_Comm *MY_WORLD,  // the communicator to refer to
                  int Rank,            // the initial rank of the calling process in MY_WORLD
                  int Ntasks,          // the number of tasks in MY_WORLD
                  map_t *me)           // address of the info structure for the calling task
// Discover how many distinct hosts the job runs on, assign each task the
// 0-based index of its host (me->myhost) and fill me->Ranks_to_host so
// that the host of every original rank can be looked up.
// Returns the number of distinct hosts (also stored in me->Nhosts).
{
  // --------------------------------------------------
  // --- init some global vars
  // NOTE(review): the allocation result is not checked -- TODO confirm policy
  me->Ranks_to_host = (int*)malloc(Ntasks*sizeof(int));
  me->Nhosts = 0;
  me->myhost = -1;

  // --------------------------------------------------
  // --- find how many hosts we are using
  char myhostname[HOST_NAME_MAX+1];
  gethostname(myhostname, HOST_NAME_MAX+1);

  // determine how much space to book for hostnames: the longest hostname
  // (including its terminating NUL) across all tasks
  int myhostlen = strlen(myhostname)+1;
  int maxhostlen = 0;
  MPI_Allreduce(&myhostlen, &maxhostlen, 1, MPI_INT, MPI_MAX, *MY_WORLD);

  // collect hostnames
  //
  // NOTE(review): a variable-length array inside a struct is a GCC
  // extension, not standard C -- TODO confirm the supported compilers
  typedef struct
  {
    char hostname[maxhostlen];
    int rank;
  } hostname_rank_t;

  hostname_rank_t mydata;
  hostname_rank_t *alldata = (hostname_rank_t*)calloc(Ntasks, sizeof(hostname_rank_t));

  mydata.rank = Rank;
  sprintf(mydata.hostname, "%s", myhostname);
  MPI_Allgather(&mydata, sizeof(hostname_rank_t), MPI_BYTE,
                alldata, sizeof(hostname_rank_t), MPI_BYTE, *MY_WORLD);

  // sort the hostnames
  // 1) set the length of string for comparison
  //    NOTE(review): the comparator will read its integer key at byte
  //    offset maxhostlen+1, while the "rank" member sits at the (possibly
  //    padded) struct offset -- the two coincide only for some values of
  //    maxhostlen; verify against compare_string_int_int() / use offsetof
  int dummy = maxhostlen;
  compare_string_int_int(NULL, &dummy);
  // 2) actually sort
  qsort(alldata, Ntasks, sizeof(hostname_rank_t), compare_string_int_int);

  // now the array alldata is sorted by hostname, and inside each hostname the processes
  // running on each host are sorted by their node, and for each node they are sorted
  // by ht.
  // As a direct consequence, the running index on the alldata array can be considered
  // as the new global rank of each process

  // --- count how many diverse hosts we have, and register each rank to its host, so that
  //     we can always find all the tasks with their original rank
  char *prev = alldata[0].hostname;
  for(int R = 0; R < Ntasks; R++)
    {
      if(strcmp(alldata[R].hostname, prev) != 0)
        {
          me->Nhosts++;
          prev = alldata[R].hostname;
        }
      if(alldata[R].rank == Rank)  // it's me
        me->myhost = me->Nhosts;   // remember my host (0-based index)
    }
  me->Nhosts++;

  // with the following gathering we build-up the mapping Ranks_to_hosts, so that
  // we know which host each mpi rank (meaning the original rank) belongs to
  //
  MPI_Allgather(&me->myhost, sizeof(me->myhost), MPI_BYTE,
                me->Ranks_to_host, sizeof(me->myhost), MPI_BYTE, *MY_WORLD);

  free(alldata);
  return me->Nhosts;
}
int compare_string_int_int(const void *A, const void *B)
// Comparator for qsort over records laid out as
//   { char s[LEN]; int k1; int k2; }
// Ordering is hierarchical: by the string s first, then k1, then k2.
//
// The byte offset of the integer keys is configured once by calling
//   compare_string_int_int(NULL, &len)
// before the sort; the stored offset is len+1 bytes.
// When the offset was never configured (str_len == 0) the comparison
// degrades gracefully to plain string ordering.
{
  static int str_len = 0;

  if (A == NULL)
    {
      // configuration call: remember where the integer keys live
      str_len = *(int *)B + 1;
      return 0;
    }

  const char *rec_a = (const char *)A;
  const char *rec_b = (const char *)B;

  int order = strcmp(rec_a, rec_b);
  if ((order != 0) || (str_len == 0))
    return order;

  // strings are equal: fall back to the first integer key
  const int *key_a = (const int *)(rec_a + str_len);
  const int *key_b = (const int *)(rec_b + str_len);

  order = key_a[0] - key_b[0];
  if (order == 0)
    // still tied: decide on the second integer key
    order = key_a[1] - key_b[1];

  return order;
}
// 1-based index of the "processor" field in /proc/self/stat
#define CPU_ID_ENTRY_IN_PROCSTAT 39

int read_proc__self_stat(int field, int *ret_val);

/*
 * Return the id of the core (hw thread) the calling task is running on,
 * or -1 on failure.
 * Uses sched_getcpu() when available, otherwise the getcpu syscall,
 * otherwise falls back to parsing /proc/self/stat.
 */
int get_cpu_id(void)
{
#if defined(_GNU_SOURCE)                          // GNU SOURCE ------------
  return sched_getcpu();
#else
#ifdef SYS_getcpu                                 // direct sys call -------
  int cpuid;
  if (syscall(SYS_getcpu, &cpuid, NULL, NULL) == -1)
    return -1;
  else
    return cpuid;
#else                                             // parse /proc -----------
  // FIX: val was declared "unsigned", but read_proc__self_stat()
  // takes an int* -- incompatible pointer types
  int val;
  if (read_proc__self_stat(CPU_ID_ENTRY_IN_PROCSTAT, &val) == -1)
    return -1;
  return val;
#endif                                            // -----------------------
#endif
}
int read_proc__self_stat(int field, int *ret_val)
/*
 * Read the field-th (1-based, as in "man proc") entry of /proc/self/stat
 * into *ret_val via atoi().
 * Returns 0 on success, -1 on failure (field < 1, /proc unavailable,
 * or fewer fields than requested).
 *
 * Other interesting fields (0-based index; pass index+1 as "field"):
 *   pid:0  father:1  utime:13  cutime:14  nthreads:18  rss:22  cpuid:38
 * read the man /proc page for fully detailed infos.
 *
 * FIX vs. original:
 *  - an uninitialized size_t was passed to getline() (UB per POSIX when
 *    *lineptr == NULL); replaced by a bounded fgets() into a local buffer
 *  - field <= 1 caused an endless strtok_r loop; now field 1 works and
 *    field < 1 returns -1
 *  - a line with fewer tokens than "field" reached atoi(NULL); now -1
 *  - fields after comm are located from the last ')', so a command name
 *    containing spaces no longer shifts the field numbering
 */
{
  *ret_val = 0;
  if (field < 1)
    return -1;

  FILE *file = fopen("/proc/self/stat", "r");
  if (file == NULL)
    return -1;

  char line[4096];   // a stat line is well below this size
  char *got = fgets(line, sizeof line, file);
  fclose(file);
  if (got == NULL)
    return -1;

  char *p = line;    // fields 1 (pid) and 2 (comm) start at the beginning
  if (field >= 3)
    {
      // the comm field (field 2) is parenthesized and may itself contain
      // spaces: count fields only after its closing ')'
      p = strrchr(line, ')');
      if (p == NULL)
        return -1;
      p++;   // now at the separator before field 3
      for (int f = 3; f <= field; f++)
        {
          p = strchr(p, ' ');
          if (p == NULL)
            return -1;
          p++;
        }
    }

  *ret_val = atoi(p);
  return 0;
}
This diff is collapsed.
Click to expand it.
numa.h
0 → 100644
+
52
−
0
View file @
5f3d8132
#define NUMA 0 // my NUMA node communicator, includes all the sibling tasks that share memory
#define ISLAND 1 // something between the host and the NUMA nodes, if present
#define myHOST 2 // my host communicator, includes all the sibling tasks running on the same hosts
#define HOSTS 3 // the communicator that includes only the masters of the hosts
#define WORLD 4 // everybody is in (i.e. this is MPI_COMM_WORLD)
#define HLEVELS 5
extern
char
*
LEVEL_NAMES
[
HLEVELS
];
// A shared-memory window together with the info needed to address it.
typedef struct
{
  MPI_Win win;     // the MPI window object
  MPI_Aint size;   // size of the window in bytes
  void *ptr;       // local base address of the window memory
  int disp;        // displacement unit (as reported by MPI_Win_shared_query)
} win_t;
// Per-task description of the process placement in the NUMA/host
// hierarchy (one instance per MPI task, filled by init_numa()).
typedef struct
{
  int mycpu;               // the core (hwthread) on which i'm running
  int nthreads;            // how many (omp) thread do i have
  int myhost;              // the host on which i'm running
  int Nhosts;              // how many distinct hosts the job spans
  int Ntasks[HLEVELS];     // number of tasks at each hierarchy level
  int *Ranks_to_host;      // host index of every original rank -- check if it is needed
  int Rank[HLEVELS];       // my rank at each hierarchy level
  int MAXl;                // the maximum level of the hierarchy
  int SHMEMl;              // the maximum hierarchy level that is in shared memory
  MPI_Comm *COMM[HLEVELS]; // communicator for each level (NULL where i do not belong)
  // -----------------------
  // not yet used
  // int mynode;            // the numa node on which i'm running
  // int ntasks_in_my_node;
  win_t win;               // my shared-memory window
  win_t *swins;            // the shared-memory windows of the other tasks in my host
} map_t;
// The map of the calling task and the per-level communicators
// (defined in numa.c).
extern map_t Me;
extern MPI_Comm COMM[HLEVELS];

// Default sizes of the persistent shared windows.
#define WIN_HOST_SIZE_DFLT 100         // in MB
#define WIN_HOST_MASTER_SIZE_DFLT 100  // in MB

// Host-master shared window and its addressing info (defined in numa.c).
extern MPI_Aint win_host_master_size;
extern MPI_Win win_host_master;
extern int win_host_master_disp;
extern void *win_host_master_ptr;
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment