numa.c

    #include "allvars.h"
    #include "proto.h"
    #include <stdio.h>
    #include <unistd.h>
    #include <limits.h>
    
    
    map_t     Me;                // the NUMA map of the calling task
    MPI_Comm  COMM[HLEVELS];     // one communicator per hierarchy level
    
    char *LEVEL_NAMES[HLEVELS] = {"NUMA", "ISLAND", "myHOST", "HOSTS", "WORLD"};
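
    /*
     * Note: the level indices NUMA, ISLAND, myHOST, HOSTS and WORLD are
     * assumed to be defined in allvars.h as an enum ordered from the
     * innermost to the outermost level (NUMA < ... < WORLD, HLEVELS
     * entries in total); the code below relies on this ordering, e.g.
     * in the MPI_MAX reduction over the shared-memory level.
     */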
    
    MPI_Aint  win_host_master_size = 0;
    
    // flow-control window owned by each host master
    MPI_Aint    win_ctrl_hostmaster_size;
    MPI_Win     win_ctrl_hostmaster;
    int         win_ctrl_hostmaster_disp;
    void       *win_ctrl_hostmaster_ptr;

    // data window owned by each host master
    MPI_Aint    win_hostmaster_size;
    MPI_Win     win_hostmaster;
    int         win_hostmaster_disp;
    void       *win_hostmaster_ptr;
    
    
    int numa_build_mapping( int, int, MPI_Comm *, map_t *);
    int numa_map_hostnames( MPI_Comm *, int, int, map_t *);
    int get_cpu_id( void );
    int compare_string_int_int( const void *, const void * );
    
    
    int numa_init( int Rank, int Size, MPI_Comm *MYWORLD, map_t *Me )
    {
    
      /* 
       * build up the numa hierarchy
       */
      numa_build_mapping( Rank, Size, MYWORLD, Me );
     
      /*
       * initialize the persistent shared windows
       */ 
    
      int SHMEMl = Me->SHMEMl;
      MPI_Info winfo;
      MPI_Info_create(&winfo);
      MPI_Info_set(winfo, "alloc_shared_noncontig", "true");
    
      // -----------------------------------
      // initialize the flow control windows
      // -----------------------------------
      Me->win_ctrl.size = sizeof(int);
      MPI_Win_allocate_shared(Me->win_ctrl.size, 1, winfo, *Me->COMM[SHMEMl],
    			  &(Me->win_ctrl.ptr), &(Me->win_ctrl.win));
    
      MPI_Aint wsize = sizeof(int);
      MPI_Win_allocate_shared(wsize, 1, winfo, *Me->COMM[SHMEMl],
    			  &win_ctrl_hostmaster_ptr, &win_ctrl_hostmaster);
      
      Me->scwins = (win_t*)malloc(Me->Ntasks[SHMEMl]*sizeof(win_t) );
      // get the addresses of all the windows from my siblings
      // at my shared-memory level
      //
      for( int t = 0; t < Me->Ntasks[SHMEMl]; t++ )
        {
          // query every sibling's window, own entry included
          MPI_Win_shared_query( Me->win_ctrl.win, t, &(Me->scwins[t].size),
                                &(Me->scwins[t].disp), &(Me->scwins[t].ptr) );
        }
    
      if( Me->Rank[SHMEMl] != 0 )
        MPI_Win_shared_query( win_ctrl_hostmaster, 0, &(win_ctrl_hostmaster_size),
                              &win_ctrl_hostmaster_disp, &win_ctrl_hostmaster_ptr );

      MPI_Info_free( &winfo );

      return 0;
    }
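
    /*
     * A minimal usage sketch (hypothetical caller: everything except the
     * names defined in this file is illustrative):
     *
     *   MPI_Init( &argc, &argv );
     *   int rank, size;
     *   MPI_Comm_rank( MPI_COMM_WORLD, &rank );
     *   MPI_Comm_size( MPI_COMM_WORLD, &size );
     *
     *   MPI_Comm myworld = MPI_COMM_WORLD;
     *   numa_init( rank, size, &myworld, &Me );
     *   numa_allocate_shared_windows( &Me, 0, 0 );  // 0,0 -> default sizes
     *   // ... use Me.win / Me.swins / Me.scwins ...
     *   numa_shutdown( rank, size, &myworld, &Me );
     *   MPI_Finalize();
     */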
    
    
    int numa_allocate_shared_windows(  map_t *me, MPI_Aint size, MPI_Aint host_size )
    {
    
      int SHMEMl = me->SHMEMl;
      MPI_Info winfo;
    
      MPI_Info_create(&winfo);
      MPI_Info_set(winfo, "alloc_shared_noncontig", "true");
    
      // -----------------------------------
      // initialize the data windows
      // -----------------------------------
      MPI_Aint win_host_size;
      
      if( host_size == 0 )
        win_hostmaster_size = WIN_HOST_MASTER_SIZE_DFLT*1024*1024;
      else
        win_hostmaster_size = host_size;
    
      if( size == 0 )
        win_host_size  = WIN_HOST_SIZE_DFLT*1024*1024;
      else
        win_host_size  = size;
    
    
      me->win.size = win_host_size;
      MPI_Win_allocate_shared(me->win.size, 1, winfo, *me->COMM[SHMEMl], &(me->win.ptr), &(me->win.win));
    
      MPI_Aint wsize = ( me->Rank[SHMEMl] == 0 ? win_hostmaster_size : 0);
      MPI_Win_allocate_shared(wsize, 1, winfo, *me->COMM[SHMEMl], &win_hostmaster_ptr, &win_hostmaster);
      
      me->swins = (win_t*)malloc( me->Ntasks[SHMEMl]*sizeof(win_t) );
      me->swins[me->Rank[SHMEMl]] = me->win;
      // get the addresses of all the windows from my siblings
      // at my shared-memory level
      //
      for( int t = 0; t < me->Ntasks[SHMEMl]; t++ )
        if( t != me->Rank[SHMEMl] )
    	MPI_Win_shared_query( me->win.win, t, &(me->swins[t].size), &(me->swins[t].disp), &(me->swins[t].ptr) );
    
      if( me->Rank[SHMEMl] != 0 )
        MPI_Win_shared_query( win_hostmaster, 0, &(win_hostmaster_size), &win_hostmaster_disp, &win_hostmaster_ptr );

      MPI_Info_free( &winfo );

      return 0;
    }
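
    /*
     * After the allocation above, every task can address the shared
     * segment of any sibling on the same host directly; a sketch (the
     * payload layout inside the window is an assumption, it is entirely
     * up to the caller):
     *
     *   int     target = (me->Rank[SHMEMl] + 1) % me->Ntasks[SHMEMl];
     *   double *remote = (double*)me->swins[target].ptr;
     *
     *   MPI_Win_lock( MPI_LOCK_SHARED, target, 0, me->win.win );
     *   double first = remote[0];        // load from the sibling's segment
     *   MPI_Win_unlock( target, me->win.win );
     */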
    
    int numa_shutdown( int Rank, int Size, MPI_Comm *MYWORLD, map_t *me )
    {
      // free every shared-memory window allocated in this file
      //
      MPI_Win_free(&(me->win.win));
      MPI_Win_free(&(me->win_ctrl.win));
      MPI_Win_free(&win_hostmaster);
      MPI_Win_free(&win_ctrl_hostmaster);

      // free all the structures if needed
      //
      free(me->Ranks_to_myhost);
      free(me->Ranks_to_host);
      free(me->swins);
      free(me->scwins);

      return 0;
    }
    
    int numa_build_mapping( int Rank, int Size, MPI_Comm *MYWORLD, map_t *me )
    {
      COMM[WORLD] = *MYWORLD;
      
      me->Ntasks[WORLD] = Size;
      me->Rank[WORLD]   = Rank;
      me->COMM[WORLD]   = &COMM[WORLD];
    
      me->mycpu = get_cpu_id();
    
      // --- find how many hosts we are running on;
      //     that is needed to build the communicator
      //     among the masters of each host
      //
      numa_map_hostnames( &COMM[WORLD], Rank, Size, me );
    
    
      me->MAXl = ( me->Nhosts > 1 ? HOSTS : myHOST );
    
      // --- create the communicator for each host
      //
      MPI_Comm_split( COMM[WORLD], me->myhost, me->Rank[WORLD], &COMM[myHOST]);
      MPI_Comm_size( COMM[myHOST], &Size );
      MPI_Comm_rank( COMM[myHOST], &Rank );
      
      me->COMM[myHOST] = &COMM[myHOST];
      me->Rank[myHOST]   = Rank;
      me->Ntasks[myHOST] = Size;
    
      // with the following gathering we build up the mapping Ranks_to_host,
      // so that we know to which host each MPI rank (meaning the original
      // rank in WORLD) belongs
      //
      
      MPI_Allgather( &me->myhost, sizeof(me->myhost), MPI_BYTE,
    		 me->Ranks_to_host, sizeof(me->myhost), MPI_BYTE, COMM[WORLD] );
    
      me -> Ranks_to_myhost = (int*)malloc(me->Ntasks[myHOST]*sizeof(int));
      MPI_Allgather( &(me->Rank[WORLD]), sizeof(me->Rank[WORLD]), MPI_BYTE,
                     me->Ranks_to_myhost, sizeof(me->Rank[WORLD]), MPI_BYTE, *me->COMM[myHOST]);
      
    
    
      // --- create the communicator for the
      //     masters of each host
      //
      int Im_host_master = ( me->Rank[myHOST] == 0 );
      MPI_Comm_split( COMM[WORLD], Im_host_master, me->Rank[WORLD], &COMM[HOSTS]);
      //
      // NOTE: by default, the Rank 0 in WORLD is also Rank 0 in HOSTS
      //
      if (Im_host_master)
      { 
        me->COMM[HOSTS] = &COMM[HOSTS];
        me->Ntasks[HOSTS] = me->Nhosts;
        MPI_Comm_rank( COMM[HOSTS], &(me->Rank[HOSTS]));
      }
      else 
      {
        me->COMM[HOSTS]  = NULL;
        me->Ntasks[HOSTS]  = 0;
        me->Rank[HOSTS]    = -1;
      }
      
      // --- create the communicator for the
      //     numa node
      //
      MPI_Comm_split_type( COMM[myHOST], MPI_COMM_TYPE_SHARED, me->Rank[myHOST], MPI_INFO_NULL, &COMM[NUMA]);
      me->COMM[NUMA] = &COMM[NUMA];
      MPI_Comm_size( COMM[NUMA], &(me->Ntasks[NUMA]));
      MPI_Comm_rank( COMM[NUMA], &(me->Rank[NUMA]));
      
      // check whether NUMA == myHOST and determine
      // the maximum level of shared memory in the
      // topology
      //
      if ( me->Ntasks[NUMA] == me->Ntasks[myHOST] )
        {
          // collapse levels from NUMA to myHOST; ISLAND is equated to
          // NUMA, whose ranks come directly from MPI_COMM_TYPE_SHARED
          //
          me->Ntasks[ISLAND] = me->Ntasks[NUMA];
          me->Rank[ISLAND]   = me->Rank[NUMA];
          me->COMM[ISLAND]   = me->COMM[NUMA];
          
          me->Rank[myHOST]   = me->Rank[NUMA];
          me->COMM[myHOST]   = me->COMM[NUMA];
          me->SHMEMl         = myHOST;
        }
      else
        {
          // we do not actually handle this case at the moment
          printf(">>> It seems that rank %d belongs to a node whose NUMA "
                 "topology does not coincide with the host topology\n",
                 me->Rank[WORLD] );
          me->SHMEMl = NUMA;
        }
    
      int check_SHMEM_level = 1;
      int globalcheck_SHMEM_level;
      int globalmax_SHMEM_level;
      MPI_Allreduce( &(me->SHMEMl), &globalmax_SHMEM_level, 1, MPI_INT, MPI_MAX, *MYWORLD );

      check_SHMEM_level = ( (me->SHMEMl == myHOST) && (globalmax_SHMEM_level == me->SHMEMl) );

      // every task must pass the check: reduce with MIN, so that a single
      // failing task is detected by everybody
      MPI_Allreduce( &check_SHMEM_level, &globalcheck_SHMEM_level, 1, MPI_INT, MPI_MIN, *MYWORLD );

      if( globalcheck_SHMEM_level < 1 )
        {
          if( me->Rank[WORLD] == 0 )
            printf("There was an error in determining the topology hierarchy: "
                   "the SHMEM level differs among MPI tasks\n");
          return -1;
        }
      
      return 0;  
    }
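
    /*
     * Illustrative example of the resulting mapping: with 8 tasks on 2
     * hosts of 4 cores each, every task ends up with
     *   Ntasks[WORLD] = 8, Ntasks[myHOST] = 4, Nhosts = 2, MAXl = HOSTS,
     *   SHMEMl = myHOST,
     * and only the two host masters (Rank[myHOST] == 0) keep a valid
     * COMM[HOSTS], of size 2.
     */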
    
    
    int numa_map_hostnames( MPI_Comm *MY_WORLD,    // the communicator to refer to
                            int Rank,              // the rank of the calling process in MY_WORLD
                            int Ntasks,            // the number of tasks in MY_WORLD
                            map_t *me )            // address of the info structure for the calling task
    
    {
      // --------------------------------------------------
      // --- init some global vars
      me -> Ranks_to_host = (int*)malloc(Ntasks*sizeof(int));
      me -> Nhosts = 0;
      me -> myhost = -1;
    
      // --------------------------------------------------
      // --- find how many hosts we are using
      
    
      char myhostname[HOST_NAME_MAX+1];
      gethostname( myhostname, HOST_NAME_MAX+1 );
    
    
      // determine how much space to book for the hostnames
      int myhostlen = strlen(myhostname)+1;
      int maxhostlen = 0;
      MPI_Allreduce ( &myhostlen, &maxhostlen, 1, MPI_INT, MPI_MAX, *MY_WORLD );
      // round up so that maxhostlen+1 is a multiple of the int alignment:
      // the comparator below locates the rank field at offset maxhostlen+1,
      // which must coincide with the (padded) offset of .rank inside
      // hostname_rank_t
      maxhostlen = (maxhostlen/sizeof(int) + 1)*sizeof(int) - 1;
    
      // collect hostnames
      //
      // (note: a struct with a variable-length array member is a GNU
      //  extension; hostname_rank_t is sized at run time)
      typedef struct {
        char hostname[maxhostlen];
        int rank;
      } hostname_rank_t;

      hostname_rank_t mydata;
      hostname_rank_t *alldata = (hostname_rank_t*)calloc( Ntasks, sizeof(hostname_rank_t) );

      memset( &mydata, 0, sizeof(hostname_rank_t) );   // zero the padding bytes too
      mydata.rank = Rank;
      snprintf( mydata.hostname, maxhostlen, "%s", myhostname );
      
      MPI_Allgather( &mydata, sizeof(hostname_rank_t), MPI_BYTE, alldata, sizeof(hostname_rank_t), MPI_BYTE, *MY_WORLD );
       
      // sort the hostnames
      //       1) set the length of the string used for comparison
      int dummy = maxhostlen;
      compare_string_int_int( NULL, &dummy );
    
    
      //       2) actually sort
      qsort( alldata, Ntasks, sizeof(hostname_rank_t), compare_string_int_int );
      // now the array alldata is sorted by hostname and, within each
      // hostname, by original rank.
      // As a direct consequence, the running index on the alldata array
      // can be considered as the new global rank of each process

      // --- count how many distinct hosts we have, and register each rank
      //     to its host, so that we can always find all the tasks with
      //     their original rank
    
          
      char *prev = alldata[0].hostname;
      for ( int R = 0; R < Ntasks; R++ )
      {	
        if ( strcmp(alldata[R].hostname, prev) != 0 ) {      
          me->Nhosts++; prev = alldata[R].hostname; }
    
        if ( alldata[R].rank == Rank )        // it's me
          me->myhost = me->Nhosts;            // remember my host
      }
      me->Nhosts++;
    
      free( alldata );
    
      return me->Nhosts;
    }
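
    /*
     * Illustrative example: with 4 tasks placed on hosts
     *   { nodeB, nodeA, nodeA, nodeB },
     * alldata sorts to
     *   [ (nodeA,1), (nodeA,2), (nodeB,0), (nodeB,3) ]
     * so the loop above assigns myhost = 0 to ranks 1 and 2, myhost = 1
     * to ranks 0 and 3, and returns Nhosts = 2.
     */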
    
    
    
    int compare_string_int_int( const void *A, const void *B )
    // used to sort structures laid out as
    // { char s[LEN];
    //   int b;
    //   ... }
    // The sorting is hierarchical: by s first, then by b
    //   if necessary
    // The length of s is set by calling
    //   compare_string_int_int( NULL, &len )
    // before using this routine in qsort-like calls
    {
      static int str_len = 0;
      if ( A == NULL )
        {
          str_len = *(int*)B + 1;
          return 0;
        }
    
      // we use strcmp instead of strncmp so that, with str_len == 0
      // (i.e. when the function has not been initialized), the routine
      // still sorts on the strings alone
      int order = strcmp( (char*)A, (char*)B );
      
      if ( str_len && (!order) )
        {
          int a = *(int*)((char*)A + str_len);
          int b = *(int*)((char*)B + str_len);
          order = a - b;
          if( !order )
    	{
    	  int a = *((int*)((char*)A + str_len)+1);
    	  int b = *((int*)((char*)B + str_len)+1);
    	  order = a - b;
    	}
        }
      
      return order;
    }
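
    /*
     * Usage sketch of the two-phase protocol (illustrative, fixed-size
     * record; rec_t and recs are hypothetical):
     *
     *   typedef struct { char name[8]; int key; int subkey; } rec_t;
     *   rec_t recs[16] = { ... };
     *
     *   int len = sizeof(((rec_t*)0)->name) - 1;  // len+1 == offset of .key
     *   compare_string_int_int( NULL, &len );     // 1) set the offset
     *   qsort( recs, 16, sizeof(rec_t), compare_string_int_int );  // 2) sort
     */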
    
    
    #define CPU_ID_ENTRY_IN_PROCSTAT 39
    
    int read_proc__self_stat( int, int * );
    
    int get_cpu_id( void )
    {
    #if defined(_GNU_SOURCE)                              // GNU SOURCE ------------
      
      return  sched_getcpu( );
    
    #else
    
    #ifdef SYS_getcpu                                     //     direct sys call ---
      
      int cpuid;
      if ( syscall( SYS_getcpu, &cpuid, NULL, NULL ) == -1 )
        return -1;
      else
        return cpuid;
      
    #else      
    
      int val;
      if ( read_proc__self_stat( CPU_ID_ENTRY_IN_PROCSTAT, &val ) == -1 )
        return -1;
    
      return (int)val;
    
    #endif                                                // -----------------------
    #endif
    
    }
    
    
    
    int read_proc__self_stat( int field, int *ret_val )
    /*
      Reads the given (1-based) field of /proc/self/stat.

      Other interesting fields (1-based, see the proc(5) man page for
      the full details):

      pid         :  1
      ppid        :  4
      utime       : 14
      cutime      : 16
      num_threads : 20
      rss         : 24
      cpuid       : 39
     */
    {
      // not used, just mnemonic (1-based, like the field argument)
      // char *table[ 40 ] = { [1]="pid", [4]="ppid", [14]="utime", [16]="cutime", [20]="num_threads", [24]="rss", [39]="cpuid"};
    
      *ret_val = 0;
    
      FILE *file = fopen( "/proc/self/stat", "r" );
      if (file == NULL )
        return -1;
    
      char   *line = NULL;
      size_t  len  = 0;
      ssize_t ret  = getline( &line, &len, file );
      fclose(file);

      if( ret == -1 )
        {
          free(line);
          return -1;
        }

      // walk the fields up to the requested one (1-based);
      // NOTE: this simple space-tokenization assumes the process name
      // (field 2) contains no spaces
      char *savetoken = NULL;
      char *token = strtok_r( line, " ", &savetoken );
      while( --field > 0 && token != NULL )
        token = strtok_r( NULL, " ", &savetoken );

      if( token == NULL )
        {
          free(line);
          return -1;
        }

      *ret_val = atoi(token);
    
      free(line);
    
      return 0;
    }
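
    /*
     * Usage sketch: retrieve the cpu the process is running on
     * (field 39 of /proc/self/stat, 1-based):
     *
     *   int cpu;
     *   if ( read_proc__self_stat( CPU_ID_ENTRY_IN_PROCSTAT, &cpu ) == 0 )
     *     printf( "running on cpu %d\n", cpu );
     */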