From 5f3d8132d8307962bd84fa1bd7830b6dc272a3db Mon Sep 17 00:00:00 2001
From: nandhanas
Date: Wed, 30 Mar 2022 14:41:31 +0200
Subject: [PATCH] Adding numa files

---
 numa.c | 393 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 numa.h |  52 ++++++++
 2 files changed, 445 insertions(+)
 create mode 100644 numa.c
 create mode 100644 numa.h

diff --git a/numa.c b/numa.c
new file mode 100644
index 0000000..1165894
--- /dev/null
+++ b/numa.c
@@ -0,0 +1,393 @@
+#include "allvars.h"
+#include "proto.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <limits.h>
+#include <sched.h>
+#include <sys/syscall.h>
+
+map_t Me;
+MPI_Comm COMM[HLEVELS];
+
+char *LEVEL_NAMES[HLEVELS] = {"NUMA", "ISLAND", "myHOST", "HOSTS", "WORLD"};
+
+MPI_Aint  win_host_master_size = 0;
+MPI_Win   win_host_master;
+int       win_host_master_disp;
+void     *win_host_master_ptr;
+
+
+int build_numa_mapping( int, int, MPI_Comm *, map_t *);
+int map_hostnames( MPI_Comm *, int, int, map_t *);
+int get_cpu_id( void );
+int compare_string_int_int( const void *, const void * );
+
+
+int init_numa( int Rank, int Size, MPI_Comm *MYWORLD, map_t *Me )
+{
+  // build up the numa hierarchy
+  //
+  build_numa_mapping( Rank, Size, MYWORLD, Me );
+
+  // initialize sizes for the persistent
+  // shared windows
+  //
+  win_host_master_size = WIN_HOST_MASTER_SIZE_DFLT*1024*1024;
+  MPI_Aint win_host_size = WIN_HOST_SIZE_DFLT*1024*1024;
+
+  // initialize the persistent shared windows
+  //
+  int SHMEMl = Me->SHMEMl;
+  MPI_Info winfo;
+  MPI_Info_create(&winfo);
+  MPI_Info_set(winfo, "alloc_shared_noncontig", "true");
+
+  Me->win.size = win_host_size;
+  MPI_Win_allocate_shared(Me->win.size, 1, winfo, *Me->COMM[SHMEMl], &(Me->win.ptr), &(Me->win.win));
+
+  // only the master of the shared-memory level books space in the master window
+  MPI_Aint size = ( Me->Rank[SHMEMl] == 0 ? win_host_master_size : 0);
+  MPI_Win_allocate_shared(size, 1, winfo, *Me->COMM[SHMEMl], &win_host_master_ptr, &win_host_master);
+
+  MPI_Info_free(&winfo);
+
+  Me->swins = (win_t*)malloc(Me->Ntasks[SHMEMl]*sizeof(win_t));
+
+  // get the addresses of all the windows from my siblings
+  // at my shared-memory level
+  //
+  for( int t = 0; t < Me->Ntasks[SHMEMl]; t++ )
+    if( t != Me->Rank[SHMEMl] )
+      MPI_Win_shared_query( Me->win.win, t, &(Me->swins[t].size), &(Me->swins[t].disp), &(Me->swins[t].ptr) );
+
+  if( Me->Rank[SHMEMl] != 0 )
+    MPI_Win_shared_query( win_host_master, 0, &win_host_master_size, &win_host_master_disp, &win_host_master_ptr );
+
+  return 0;
+}
+
+
+int shutdown_numa( int Rank, int Size, MPI_Comm *MYWORLD, map_t *Me )
+{
+  // free every shared memory and window
+  //
+  MPI_Win_free(&win_host_master);
+  MPI_Win_free(&(Me->win.win));
+
+  // free all the structures if needed
+  //
+  free(Me->Ranks_to_host);
+  free(Me->swins);
+
+  // anything else
+  //
+  // ...
+
+  return 0;
+}
+
+
+int build_numa_mapping( int Rank, int Size, MPI_Comm *MYWORLD, map_t *Me )
+{
+  COMM[WORLD] = *MYWORLD;
+
+  Me->Ntasks[WORLD] = Size;
+  Me->Rank[WORLD] = Rank;
+  Me->COMM[WORLD] = &COMM[WORLD];
+
+  Me->mycpu = get_cpu_id();
+
+  // --- find how many hosts we are running on;
+  //     that is needed to build the communicator
+  //     among the masters of each host
+  //
+  map_hostnames( &COMM[WORLD], Rank, Size, Me );
+
+  Me->MAXl = ( Me->Nhosts > 1 ? HOSTS : myHOST );
+
+  // --- create the communicator for each host
+  //
+  MPI_Comm_split( COMM[WORLD], Me->myhost, Me->Rank[WORLD], &COMM[myHOST]);
+  MPI_Comm_size( COMM[myHOST], &Size );
+  MPI_Comm_rank( COMM[myHOST], &Rank );
+
+  Me->COMM[myHOST]   = &COMM[myHOST];
+  Me->Rank[myHOST]   = Rank;
+  Me->Ntasks[myHOST] = Size;
+
+  // --- create the communicator for the
+  //     masters of each host
+  //
+  int Im_host_master = ( Me->Rank[myHOST] == 0 );
+  MPI_Comm_split( COMM[WORLD], Im_host_master, Me->Rank[WORLD], &COMM[HOSTS]);
+  //
+  // NOTE: by default, the Rank 0 in WORLD is also Rank 0 in HOSTS
+  //
+  if ( Im_host_master )
+    {
+      Me->COMM[HOSTS]   = &COMM[HOSTS];
+      Me->Ntasks[HOSTS] = Me->Nhosts;
+      MPI_Comm_rank( COMM[HOSTS], &(Me->Rank[HOSTS]));
+    }
+  else
+    {
+      Me->COMM[HOSTS]   = NULL;
+      Me->Ntasks[HOSTS] = 0;
+      Me->Rank[HOSTS]   = -1;
+    }
+
+  // --- create the communicator for the
+  //     numa node
+  //
+  MPI_Comm_split_type( COMM[myHOST], MPI_COMM_TYPE_SHARED, Me->Rank[myHOST], MPI_INFO_NULL, &COMM[NUMA]);
+  Me->COMM[NUMA] = &COMM[NUMA];
+  MPI_Comm_size( COMM[NUMA], &(Me->Ntasks[NUMA]));
+  MPI_Comm_rank( COMM[NUMA], &(Me->Rank[NUMA]));
+
+  // check whether NUMA == myHOST and determine
+  // the maximum level of shared memory in the
+  // topology
+  //
+  if ( Me->Ntasks[NUMA] == Me->Ntasks[myHOST] )
+    {
+      // collapse the levels from NUMA to myHOST
+      //
+      Me->Ntasks[ISLAND] = Me->Ntasks[NUMA];   // equate ISLAND to NUMA, whose ranks come from MPI_COMM_TYPE_SHARED
+      Me->Rank[ISLAND]   = Me->Rank[NUMA];
+      Me->COMM[ISLAND]   = Me->COMM[NUMA];
+
+      Me->Rank[myHOST] = Me->Rank[NUMA];
+      Me->COMM[myHOST] = Me->COMM[NUMA];
+      Me->SHMEMl = myHOST;
+    }
+  else
+    {
+      // actually we do not care for this case
+      // at this moment
+      printf(">>> It seems that rank %d runs on a host where the shared-memory "
+             "(NUMA) level does not span the whole host\n", Rank );
+      Me->SHMEMl = NUMA;
+    }
+
+  int check_SHMEM_level = 1;
+  int globalcheck_SHMEM_level;
+  int globalmax_SHMEM_level;
+  MPI_Allreduce( &(Me->SHMEMl), &globalmax_SHMEM_level, 1, MPI_INT, MPI_MAX, *MYWORLD );
+
+  check_SHMEM_level = ( (Me->SHMEMl == myHOST) && (globalmax_SHMEM_level == Me->SHMEMl) );
+
+  // the check must fail if it fails on ANY task, hence the MIN reduction
+  MPI_Allreduce( &check_SHMEM_level, &globalcheck_SHMEM_level, 1, MPI_INT, MPI_MIN, *MYWORLD );
+
+  if( globalcheck_SHMEM_level < 1 )
+    {
+      if( Rank == 0 )
+        printf("There was an error in determining the topology hierarchy: "
+               "the SHMEM level is different for different MPI tasks\n");
+      return -1;
+    }
+
+  return 0;
+}
+
+
+int map_hostnames( MPI_Comm *MY_WORLD,   // the communicator to refer to
+                   int Rank,             // the initial rank of the calling process in MY_WORLD
+                   int Ntasks,           // the number of tasks in MY_WORLD
+                   map_t *me )           // address of the info structure for the calling task
+{
+  // --------------------------------------------------
+  // --- init some global vars
+  me->Ranks_to_host = (int*)malloc(Ntasks*sizeof(int));
+  me->Nhosts = 0;
+  me->myhost = -1;
+
+  // --------------------------------------------------
+  // --- find how many hosts we are using
+
+  char myhostname[HOST_NAME_MAX+1];
+  gethostname( myhostname, HOST_NAME_MAX+1 );
+
+  // determine how much space to book for the hostnames
+  int myhostlen  = strlen(myhostname)+1;
+  int maxhostlen = 0;
+  MPI_Allreduce ( &myhostlen, &maxhostlen, 1, MPI_INT, MPI_MAX, *MY_WORLD );
+
+  // collect hostnames
+  //
+  typedef struct {
+    char hostname[maxhostlen];   // NB: variable-length array member, a GCC extension
+    int  rank;
+  } hostname_rank_t;
+
+  hostname_rank_t mydata;
+  hostname_rank_t *alldata = (hostname_rank_t*)calloc( Ntasks, sizeof(hostname_rank_t) );
+
+  mydata.rank = Rank;
+  sprintf( mydata.hostname, "%s", myhostname );
+
+  MPI_Allgather( &mydata, sizeof(hostname_rank_t), MPI_BYTE, alldata, sizeof(hostname_rank_t), MPI_BYTE, *MY_WORLD );
+
+  // sort the hostnames
+  // 1) set the offset at which the integer key (the original rank)
+  //    is found inside each entry
+  int dummy = (int)( (char*)&mydata.rank - (char*)&mydata );
+  compare_string_int_int( NULL, &dummy );
+
+  // 2) actually sort
+  qsort( alldata, Ntasks, sizeof(hostname_rank_t), compare_string_int_int );
+  // now the array alldata is sorted by hostname, and inside each hostname the
+  // entries are sorted by the original rank of the processes.
+  // As a direct consequence, the running index on the alldata array can be
+  // considered as the new global rank of each process
+
+  // --- count how many distinct hosts we have, and register each rank to its host,
+  //     so that we can always find all the tasks with their original rank
+
+  char *prev = alldata[0].hostname;
+  for ( int R = 0; R < Ntasks; R++ )
+    {
+      if ( strcmp(alldata[R].hostname, prev) != 0 ) {
+        me->Nhosts++; prev = alldata[R].hostname; }
+
+      if ( alldata[R].rank == Rank )      // it's me
+        me->myhost = me->Nhosts;          // remember my host
+    }
+  me->Nhosts++;
+
+  // with the following gathering we build up the mapping Ranks_to_host, so that
+  // we know which host each MPI rank (meaning the original rank) belongs to
+  //
+  MPI_Allgather( &me->myhost, sizeof(me->myhost), MPI_BYTE,
+                 me->Ranks_to_host, sizeof(me->myhost), MPI_BYTE, *MY_WORLD );
+
+  free( alldata );
+
+  return me->Nhosts;
+}
+
+
+int compare_string_int_int( const void *A, const void *B )
+// used to sort structures made as
+// { char s[LEN];
+//   int  b;
+//   ... }
+// The sorting is hierarchical: by s first, then by b
+// if necessary.
+// The offset at which the integers start is set by calling
+// compare_string_int_int( NULL, &offset )
+// before using this routine in qsort-like calls
+{
+  static int str_len = 0;
+  if ( A == NULL )
+    {
+      str_len = *(int*)B;
+      return 0;
+    }
+
+  // we deliberately use strcmp and not strncmp: with str_len = 0,
+  // i.e. when this function is used without being initialized,
+  // it still provides a plain sort on strings
+  int order = strcmp( (char*)A, (char*)B );
+
+  if ( str_len && (!order) )
+    {
+      int a = *(int*)((char*)A + str_len);
+      int b = *(int*)((char*)B + str_len);
+      order = a - b;
+      if( !order )
+        {
+          int a = *((int*)((char*)A + str_len)+1);
+          int b = *((int*)((char*)B + str_len)+1);
+          order = a - b;
+        }
+    }
+
+  return order;
+}
+
+
+#define CPU_ID_ENTRY_IN_PROCSTAT 39
+
+int read_proc__self_stat( int, int * );
+
+int get_cpu_id( void )
+{
+#if defined(_GNU_SOURCE)              // GNU SOURCE ------------
+
+  return sched_getcpu( );
+
+#else
+
+#ifdef SYS_getcpu                     // direct sys call -------
+
+  int cpuid;
+  if ( syscall( SYS_getcpu, &cpuid, NULL, NULL ) == -1 )
+    return -1;
+  else
+    return cpuid;
+
+#else                                 // fallback: parse /proc/self/stat
+
+  int val;
+  if ( read_proc__self_stat( CPU_ID_ENTRY_IN_PROCSTAT, &val ) == -1 )
+    return -1;
+
+  return val;
+
+#endif                                // -----------------------
+#endif
+
+}
+
+
+int read_proc__self_stat( int field, int *ret_val )
+/*
+  "field" is the 1-based field number in /proc/self/stat.
+  Other interesting fields (see proc(5) for the full list):
+
+  pid         :  1
+  ppid        :  4
+  utime       : 14
+  cutime      : 16
+  num_threads : 20
+  rss         : 24
+  processor   : 39   (the cpu id)
+ */
+{
+  *ret_val = 0;
+
+  FILE *file = fopen( "/proc/self/stat", "r" );
+  if ( file == NULL )
+    return -1;
+
+  char    *line = NULL;
+  size_t   len  = 0;
+  ssize_t  ret  = getline( &line, &len, file );
+  fclose(file);
+
+  if( ret == -1 ) {
+    free(line);
+    return -1; }
+
+  char *savetoken = line;
+  char *token = strtok_r( line, " ", &savetoken);
+  --field;
+  do { token = strtok_r( NULL, " ", &savetoken); field--; } while( field );
+
+  *ret_val = atoi(token);
+
+  free(line);
+
+  return 0;
+}
diff --git a/numa.h b/numa.h
new file mode 100644
index 0000000..985d872
--- /dev/null
+++ b/numa.h
@@ -0,0 +1,52 @@
+
+#define NUMA     0   // my NUMA node communicator, includes all the sibling tasks that share memory
+#define ISLAND   1   // something between the host and the NUMA nodes, if present
+#define myHOST   2   // my host communicator, includes all the sibling tasks running on the same host
+#define HOSTS    3   // the communicator that includes only the masters of the hosts
+#define WORLD    4   // everybody is in (i.e. this is MPI_COMM_WORLD)
+#define HLEVELS  5
+
+extern char *LEVEL_NAMES[HLEVELS];
+
+typedef struct
+{
+  MPI_Win   win;
+  MPI_Aint  size;
+  void     *ptr;
+  int       disp;
+} win_t;
+
+typedef struct
+{
+  int  mycpu;             // the core (hw thread) on which I'm running
+  int  nthreads;          // how many (omp) threads I have
+  int  myhost;            // the host on which I'm running
+  int  Nhosts;            // how many hosts we are running on
+  int  Ntasks[HLEVELS];   // how many tasks at each level of the hierarchy
+  int *Ranks_to_host;     // which host each WORLD rank belongs to (check if it is needed)
+  int  Rank[HLEVELS];     // my rank at each level of the hierarchy
+  int  MAXl;              // the maximum level of the hierarchy
+  int  SHMEMl;            // the maximum hierarchy level that is in shared memory
+  MPI_Comm *COMM[HLEVELS];
+  // -----------------------
+  // not yet used
+  // int mynode;           // the numa node on which I'm running
+  // int ntasks_in_my_node;
+  win_t  win;             // my shared-memory window
+  win_t *swins;           // the shared-memory windows of the other tasks in my host
+} map_t;
+
+
+extern map_t    Me;
+extern MPI_Comm COMM[HLEVELS];
+
+
+#define WIN_HOST_SIZE_DFLT        100   // in MB
+#define WIN_HOST_MASTER_SIZE_DFLT 100   // in MB
+
+extern MPI_Aint  win_host_master_size;
+extern MPI_Win   win_host_master;
+extern int       win_host_master_disp;
+extern void     *win_host_master_ptr;
+
--
GitLab
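
Reviewer note: below is a minimal sketch of how the routines added by this patch might be driven from an MPI program. It is illustrative only and not part of the patch: the main() driver is hypothetical, and it assumes that proto.h declares init_numa() and shutdown_numa() and that allvars.h pulls in mpi.h and numa.h, which this patch does not show.

/* hypothetical driver, for illustration only -- not part of the patch */
#include <stdio.h>
#include "allvars.h"
#include "proto.h"

int main( int argc, char **argv )
{
  int Rank, Size;
  MPI_Init( &argc, &argv );
  MPI_Comm_rank( MPI_COMM_WORLD, &Rank );
  MPI_Comm_size( MPI_COMM_WORLD, &Size );

  // build the NUMA-aware hierarchy and the shared-memory windows;
  // this fills the global map_t Me defined in numa.c
  MPI_Comm myworld = MPI_COMM_WORLD;
  init_numa( Rank, Size, &myworld, &Me );

  // after init_numa every task knows its host, its rank at each level
  // of the hierarchy, and owns a shared window of WIN_HOST_SIZE_DFLT MB
  printf( "task %d: host %d, %s rank %d, cpu %d, window %p (%ld bytes)\n",
          Me.Rank[WORLD], Me.myhost, LEVEL_NAMES[Me.SHMEMl], Me.Rank[Me.SHMEMl],
          Me.mycpu, Me.win.ptr, (long)Me.win.size );

  shutdown_numa( Rank, Size, &myworld, &Me );
  MPI_Finalize();
  return 0;
}

With the map in place, a task could exchange data with its in-host siblings by writing into its own segment (Me.win.ptr) and reading a neighbour's segment through Me.swins[t].ptr, which is what the MPI_Win_shared_query calls in init_numa() are for.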