Commit 5f3d8132 authored by Nandhana Sakhtivel

Adding numa files

parent 21f66668
numa.c 0 → 100644
#include "allvars.h"
#include "proto.h"
#include <stdio.h>
#include <unistd.h>
#include <limits.h>
map_t Me;
MPI_Comm COMM[HLEVELS];
char *LEVEL_NAMES[HLEVELS] = {"NUMA", "ISLAND", "myHOST", "HOSTS", "WORLD"};
MPI_Aint win_host_master_size = 0;
MPI_Aint win_host_size = 0;
MPI_Win win_host_master;
int win_host_master_disp;
void *win_host_master_ptr;
int build_numa_mapping( int, int, MPI_Comm *, map_t *);
int map_hostnames( MPI_Comm *, int, int, map_t *);
int get_cpu_id( void );
int compare_string_int_int( const void *, const void * );
int init_numa( int Rank, int Size, MPI_Comm *MYWORLD, map_t *Me )
{
// build up the numa hierarchy
//
build_numa_mapping( Rank, Size, MYWORLD, Me );
// initialize sizes for the persistent
// shared windows
win_host_master_size = WIN_HOST_MASTER_SIZE_DFLT*1024*1024;
win_host_size = WIN_HOST_SIZE_DFLT*1024*1024;
// initialize the persistent shared windows
//
int SHMEMl = Me->SHMEMl;
MPI_Info winfo;
MPI_Info_create(&winfo);
MPI_Info_set(winfo, "alloc_shared_noncontig", "true");
Me->win.size = win_host_size;
MPI_Win_allocate_shared(Me->win.size, 1, winfo, *Me->COMM[SHMEMl], &(Me->win.ptr), &(Me->win.win));
MPI_Aint size = ( Me->Rank[SHMEMl] == 0 ? win_host_master_size : 0);
MPI_Win_allocate_shared(size, 1, winfo, *Me->COMM[SHMEMl], &win_host_master_ptr, &win_host_master);
MPI_Info_free(&winfo);
Me->swins = (win_t*)malloc(Me->Ntasks[SHMEMl]*sizeof(win_t) );
// get the addresses of all the windows from my siblings
// at my shared-memory level
//
for( int t = 0; t < Me->Ntasks[SHMEMl]; t++ )
if( t != Me->Rank[SHMEMl] )
MPI_Win_shared_query( Me->win.win, t, &(Me->swins[t].size), &(Me->swins[t].disp), &(Me->swins[t].ptr) );
if( Me->Rank[SHMEMl] != 0 )
MPI_Win_shared_query( win_host_master, 0, &win_host_master_size, &win_host_master_disp, &win_host_master_ptr );
return 0;
}
int shutdown_numa( int Rank, int Size, MPI_Comm *MYWORLD, map_t *Me )
{
// free every shared memory and window
//
MPI_Win_free(&(Me->win.win));
MPI_Win_free(&win_host_master);
// free all the structures if needed
//
free(Me->Ranks_to_host);
free(Me->swins);
// anything else
//
// ...
return 0;
}
int build_numa_mapping( int Rank, int Size, MPI_Comm *MYWORLD, map_t *Me )
{
COMM[WORLD] = *MYWORLD;
Me->Ntasks[WORLD] = Size;
Me->Rank[WORLD] = Rank;
Me->COMM[WORLD] = &COMM[WORLD];
Me->mycpu = get_cpu_id();
// --- find how many hosts we are running on;
// that is needed to build the communicator
// among the masters of each host
//
map_hostnames( &COMM[WORLD], Rank, Size, Me );
Me->MAXl = ( Me->Nhosts > 1 ? HOSTS : myHOST );
// --- create the communicator for each host
//
MPI_Comm_split( COMM[WORLD], Me->myhost, Me->Rank[WORLD], &COMM[myHOST]);
MPI_Comm_size( COMM[myHOST], &Size );
MPI_Comm_rank( COMM[myHOST], &Rank );
Me->COMM[myHOST] = &COMM[myHOST];
Me->Rank[myHOST] = Rank;
Me->Ntasks[myHOST] = Size;
// --- create the communicator for the
// masters of each host
//
int Im_host_master = ( Me->Rank[myHOST] == 0 );
MPI_Comm_split( COMM[WORLD], Im_host_master, Me->Rank[WORLD], &COMM[HOSTS]);
//
// NOTE: by default, the Rank 0 in WORLD is also Rank 0 in HOSTS
//
if (Im_host_master)
{
Me->COMM[HOSTS] = &COMM[HOSTS];
Me->Ntasks[HOSTS] = Me->Nhosts;
MPI_Comm_rank( COMM[HOSTS], &(Me->Rank[HOSTS]));
}
else
{
Me->COMM[HOSTS] = NULL;
Me->Ntasks[HOSTS] = 0;
Me->Rank[HOSTS] = -1;
}
// --- create the communicator for the
// numa node
//
MPI_Comm_split_type( COMM[myHOST], MPI_COMM_TYPE_SHARED, Me->Rank[myHOST], MPI_INFO_NULL, &COMM[NUMA]);
Me->COMM[NUMA] = &COMM[NUMA];
MPI_Comm_size( COMM[NUMA], &(Me->Ntasks[NUMA]));
MPI_Comm_rank( COMM[NUMA], &(Me->Rank[NUMA]));
// check whether NUMA == myHOST and determine
// the maximum level of shared memory in the
// topology
//
if ( Me->Ntasks[NUMA] == Me->Ntasks[myHOST] )
{
// collapse levels from NUMA to myHOST
//
Me->Ntasks[ISLAND] = Me->Ntasks[NUMA]; // use the NUMA values, which come directly from MPI_COMM_TYPE_SHARED
Me->Rank[ISLAND] = Me->Rank[NUMA];
Me->COMM[ISLAND] = Me->COMM[NUMA];
Me->Rank[myHOST] = Me->Rank[NUMA];
Me->COMM[myHOST] = Me->COMM[NUMA];
Me->SHMEMl = myHOST;
}
else
{
// we do not handle this case for now
//
printf(">>> It seems that rank %d belongs to a node for which "
"the NUMA topology does not coincide with the host topology\n", Rank );
Me->SHMEMl = NUMA;
int check_SHMEM_level = 1;
int globalcheck_SHMEM_level;
int globalmax_SHMEM_level;
MPI_Allreduce( &(Me->SHMEMl), &globalmax_SHMEM_level, 1, MPI_INT, MPI_MAX, *MYWORLD );
check_SHMEM_level = ( (Me->SHMEMl == myHOST) && (globalmax_SHMEM_level == Me->SHMEMl) );
MPI_Allreduce( &check_SHMEM_level, &globalcheck_SHMEM_level, 1, MPI_INT, MPI_MIN, *MYWORLD ); // MPI_MIN: the check must hold on every task
if( globalcheck_SHMEM_level < 1 )
{
if( Rank == 0 )
printf("There was an error in determining the topology hierarchy, "
"SHMEM level is different for different MPI tasks\n");
return -1;
}
return 0;
}
int map_hostnames( MPI_Comm *MY_WORLD, // the communicator to refer to
int Rank, // the initial rank of the calling process in MYWORLD
int Ntasks, // the number of tasks in MY_WORLD
map_t *me) // address of the info structure for the calling task
{
// --------------------------------------------------
// --- init some global vars
me->Ranks_to_host = (int*)malloc(Ntasks*sizeof(int));
me->Nhosts = 0;
me->myhost = -1;
// --------------------------------------------------
// --- find how many hosts we are using
char myhostname[HOST_NAME_MAX+1];
gethostname( myhostname, HOST_NAME_MAX+1 );
// determine how much space to book for hostnames
int myhostlen = strlen(myhostname)+1;
int maxhostlen = 0;
MPI_Allreduce ( &myhostlen, &maxhostlen, 1, MPI_INT, MPI_MAX, *MY_WORLD );
// collect hostnames
//
// NOTE: a variable-length array inside a struct is a GCC
// extension (variably modified type)
typedef struct {
char hostname[maxhostlen];
int rank;
} hostname_rank_t;
hostname_rank_t mydata;
hostname_rank_t *alldata = (hostname_rank_t*)calloc( Ntasks, sizeof(hostname_rank_t) );
mydata.rank = Rank;
sprintf( mydata.hostname, "%s", myhostname);
MPI_Allgather( &mydata, sizeof(hostname_rank_t), MPI_BYTE, alldata, sizeof(hostname_rank_t), MPI_BYTE, *MY_WORLD );
// sort the hostnames
// 1) set the offset of the int fields for comparison
// (computed from mydata so that it accounts for any
// padding after .hostname)
int dummy = (int)( (char*)&mydata.rank - (char*)&mydata );
compare_string_int_int( NULL, &dummy );
// 2) actually sort
qsort( alldata, Ntasks, sizeof(hostname_rank_t), compare_string_int_int );
// now the array alldata is sorted by hostname and, within each hostname,
// by rank.
// As a direct consequence, the running index on the alldata array can be
// considered as the new global rank of each process.
// --- count how many distinct hosts we have, and register each rank to its
// host, so that we can always find all the tasks with their original rank
char *prev = alldata[0].hostname;
for ( int R = 0; R < Ntasks; R++ )
{
if ( strcmp(alldata[R].hostname, prev) != 0 ) {
me->Nhosts++; prev = alldata[R].hostname; }
if ( alldata[R].rank == Rank ) // it's me
me->myhost = me->Nhosts; // remember my host
}
me->Nhosts++;
// with the following gathering we build up the mapping Ranks_to_host, so that
// we know which host each MPI rank (meaning the original rank) belongs to
//
MPI_Allgather( &me->myhost, sizeof(me->myhost), MPI_BYTE,
me->Ranks_to_host, sizeof(me->myhost), MPI_BYTE, *MY_WORLD );
free( alldata );
return me->Nhosts;
}
int compare_string_int_int( const void *A, const void *B )
// used to sort structures made as
// { char s[LEN];
// int b;
// int c; }
// The sorting is hierarchical: by s first, then by b,
// then by c if necessary.
// The offset of b inside the structure is set by calling
// compare_string_int_int( NULL, &offset )
// before using this routine in qsort-like calls
{
static int str_len = 0;
if ( A == NULL )
{
// initialization call: B carries the offset of the first
// int field past the string
str_len = *(int*)B;
return 0;
}
// we use strcmp instead of strncmp so that, with str_len = 0
// (i.e. without the initialization call), this function can
// still be used to sort on the strings alone
int order = strcmp( (char*)A, (char*)B );
if ( str_len && (!order) )
{
int a = *(int*)((char*)A + str_len);
int b = *(int*)((char*)B + str_len);
order = a - b;
if( !order )
{
// compare a possible second int field; for hostname_rank_t
// the ranks are unique, so this branch is never taken there
int a = *((int*)((char*)A + str_len)+1);
int b = *((int*)((char*)B + str_len)+1);
order = a - b;
}
}
return order;
}
#define CPU_ID_ENTRY_IN_PROCSTAT 39
int read_proc__self_stat( int, int * );
int get_cpu_id( void )
{
#if defined(_GNU_SOURCE) // GNU SOURCE ------------
return sched_getcpu( );
#else
#ifdef SYS_getcpu // direct sys call ---
int cpuid;
if ( syscall( SYS_getcpu, &cpuid, NULL, NULL ) == -1 )
return -1;
else
return cpuid;
#else
int val;
if ( read_proc__self_stat( CPU_ID_ENTRY_IN_PROCSTAT, &val ) == -1 )
return -1;
return (int)val;
#endif // -----------------------
#endif
}
int read_proc__self_stat( int field, int *ret_val )
/*
Fields of interest in /proc/self/stat (1-based, as used below
and in `man 5 proc`):
pid : 1
ppid : 4
utime : 14
cutime : 16
num_threads : 20
rss : 24
processor : 39
read the man page of proc for fully detailed info
*/
{
// not used, just mnemonic (0-based indices)
// char *table[ 52 ] = { [0]="pid", [3]="ppid", [13]="utime", [15]="cutime", [19]="num_threads", [23]="rss", [38]="processor"};
*ret_val = 0;
FILE *file = fopen( "/proc/self/stat", "r" );
if (file == NULL )
return -1;
char *line = NULL;
int ret;
size_t len = 0; // must be initialized for getline()
ret = getline( &line, &len, file );
fclose(file);
if( ret == -1 )
return -1;
char *savetoken = line;
char *token = strtok_r( line, " ", &savetoken);
// the first token is already consumed; advance to the requested
// (1-based) field
for( --field; (field > 0) && (token != NULL); field-- )
token = strtok_r( NULL, " ", &savetoken);
if( token == NULL ) {
free(line);
return -1; }
*ret_val = atoi(token);
free(line);
return 0;
}
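For orientation, a minimal driver sketch follows; it is not part of the commit. It assumes mpi.h and numa.h are visible, that init_numa() and shutdown_numa() are exposed (e.g. via proto.h), and that a plain barrier is sufficient synchronization for this toy read of a sibling's shared window.
/* driver.c -- hypothetical usage sketch, NOT part of this commit */
#include <mpi.h>
#include <stdio.h>
#include "numa.h"
int init_numa( int, int, MPI_Comm *, map_t * );
int shutdown_numa( int, int, MPI_Comm *, map_t * );
int main( int argc, char **argv )
{
int Rank, Size;
MPI_Init( &argc, &argv );
MPI_Comm_rank( MPI_COMM_WORLD, &Rank );
MPI_Comm_size( MPI_COMM_WORLD, &Size );
MPI_Comm world = MPI_COMM_WORLD;
init_numa( Rank, Size, &world, &Me );
int shl = Me.SHMEMl;
// write a marker at the start of my own shared window
((int*)Me.win.ptr)[0] = Me.Rank[shl];
// a barrier is enough for this toy example; production code
// may also need MPI_Win_sync for memory consistency
MPI_Barrier( *Me.COMM[shl] );
// read the marker of the next sibling on my host through the
// pointer obtained with MPI_Win_shared_query
int next = (Me.Rank[shl] + 1) % Me.Ntasks[shl];
if( next != Me.Rank[shl] )
printf("task %d sees sibling %d's marker: %d\n",
Me.Rank[shl], next, ((int*)Me.swins[next].ptr)[0] );
shutdown_numa( Rank, Size, &world, &Me );
MPI_Finalize();
return 0;
}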
numa.h 0 → 100644
#define NUMA 0 // my NUMA node communicator, includes all the sibling tasks that share memory
#define ISLAND 1 // something between the host and the NUMA nodes, if present
#define myHOST 2 // my host communicator, includes all the sibling tasks running on the same hosts
#define HOSTS 3 // the communicator that includes only the masters of the hosts
#define WORLD 4 // everybody is in (i.e. this is MPI_COMM_WORLD)
#define HLEVELS 5
extern char *LEVEL_NAMES[HLEVELS];
typedef struct
{
MPI_Win win;
MPI_Aint size;
void *ptr;
int disp;
} win_t;
typedef struct
{
int mycpu; // the core (hw thread) on which I'm running
int nthreads; // how many (OpenMP) threads I have
int myhost; // the host on which I'm running
int Nhosts;
int Ntasks[HLEVELS];
int *Ranks_to_host; // check if it is needed
int Rank[HLEVELS];
int MAXl; // the maximum level of the hierarchy
int SHMEMl; // the maximum hierarchy level that is in shared memory
MPI_Comm *COMM[HLEVELS];
// -----------------------
// not yet used
// int mynode; // the numa node on which i'm running
// int ntasks_in_my_node;
win_t win; // my shared-memory window
win_t *swins; // the shared-memory windows of the other tasks in my host
} map_t;
extern map_t Me;
extern MPI_Comm COMM[HLEVELS];
#define WIN_HOST_SIZE_DFLT 100 // in MB
#define WIN_HOST_MASTER_SIZE_DFLT 100 // in MB
extern MPI_Aint win_host_master_size;
extern MPI_Win win_host_master;
extern int win_host_master_disp;
extern void *win_host_master_ptr;
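A small probe of the resulting hierarchy might look as follows; this is a sketch, not part of the commit, assuming init_numa() has already populated the global Me. It relies only on the declarations above: Me.COMM[HOSTS] is set to NULL on tasks that are not host masters, and the global Me has static storage, so levels that were never set read as NULL.
/* hierarchy_probe.c -- hypothetical sketch, NOT part of this commit */
#include <mpi.h>
#include <stdio.h>
#include "numa.h"
void print_my_place( void )
{
// walk the hierarchy bottom-up and report my rank at every
// level for which a communicator exists
for( int l = NUMA; l <= WORLD; l++ )
if( Me.COMM[l] != NULL )
printf("task %d (cpu %d): level %-6s -> rank %d of %d\n",
Me.Rank[WORLD], Me.mycpu,
LEVEL_NAMES[l], Me.Rank[l], Me.Ntasks[l] );
}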