root/opal/mca/btl/usnic/btl_usnic_hwloc.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. get_distance_matrix
  2. find_numa_node
  3. find_my_numa_node
  4. find_device_numa
  5. opal_btl_usnic_hwloc_distance

   1 /*
   2  * Copyright (c) 2013-2019 Cisco Systems, Inc.  All rights reserved
   3  * Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
   4  * $COPYRIGHT$
   5  *
   6  * Additional copyrights may follow
   7  *
   8  * $HEADER$
   9  */
  10 
  11 #include "opal_config.h"
  12 
  13 #include "opal/mca/hwloc/base/base.h"
  14 #include "opal/constants.h"
  15 
  16 #include "opal/mca/btl/base/base.h"
  17 
  18 #include "btl_usnic_hwloc.h"
  19 
  20 /*
  21  * Local variables
  22  */
  23 static hwloc_obj_t my_numa_node = NULL;
  24 static int num_numa_nodes = 0;
  25 static struct hwloc_distances_s *matrix = NULL;
  26 #if HWLOC_API_VERSION >= 0x20000
  27 static unsigned int matrix_nr = 1;
  28 #endif
  29 
  30 /*
  31  * Get the hwloc distance matrix (if we don't already have it).
  32  */
  33 static int get_distance_matrix(void)
  34 {
  35 #if HWLOC_API_VERSION < 0x20000
  36     /* Note that the matrix data structure belongs to hwloc; we are not
  37      * responsible for freeing it. */
  38 
  39     if (NULL == matrix) {
  40         matrix = hwloc_get_whole_distance_matrix_by_type(opal_hwloc_topology,
  41                                                          HWLOC_OBJ_NODE);
  42     }
  43 
  44     return (NULL == matrix) ? OPAL_ERROR : OPAL_SUCCESS;
  45 #else
  46     if (0 != hwloc_distances_get_by_type(opal_hwloc_topology, HWLOC_OBJ_NODE,
  47                                          &matrix_nr, &matrix,
  48                                          HWLOC_DISTANCES_KIND_MEANS_LATENCY, 0) || 0 == matrix_nr) {
  49         return OPAL_ERROR;
  50     }
  51     return OPAL_SUCCESS;
  52 #endif
  53 }
  54 
  55 /*
  56  * Find the NUMA node that covers a given cpuset
  57  */
  58 static hwloc_obj_t find_numa_node(hwloc_bitmap_t cpuset)
  59 {
  60     hwloc_obj_t obj;
  61 
  62     obj =
  63         hwloc_get_first_largest_obj_inside_cpuset(opal_hwloc_topology, cpuset);
  64 
  65     /* Go upwards until we hit the NUMA node or run out of parents */
  66     while (obj->type > HWLOC_OBJ_NODE &&
  67            NULL != obj->parent) {
  68         obj = obj->parent;
  69     }
  70 
  71     /* Make sure we ended up on the NUMA node */
  72     if (obj->type != HWLOC_OBJ_NODE) {
  73         opal_output_verbose(5, USNIC_OUT,
  74                             "btl:usnic:filter_numa: could not find NUMA node where this process is bound; filtering by NUMA distance not possible");
  75         return NULL;
  76     }
  77 
  78     /* Finally, make sure that our cpuset doesn't span more than 1
  79        NUMA node */
  80     if (hwloc_get_nbobjs_inside_cpuset_by_type(opal_hwloc_topology,
  81                                                cpuset, HWLOC_OBJ_NODE) > 1) {
  82         opal_output_verbose(5, USNIC_OUT,
  83                             "btl:usnic:filter_numa: this process is bound to more than 1 NUMA node; filtering by NUMA distance not possible");
  84         return NULL;
  85     }
  86 
  87     return obj;
  88 }
  89 
  90 /*
  91  * Find my NUMA node in the hwloc topology.  This is a Cisco
  92  * UCS-specific BTL, so I know that I'll always have a NUMA node
  93  * (i.e., not some unknown server type that may not have or report a
  94  * NUMA node).
  95  *
  96  * Note that the my_numa_node value we find is just a handle; we
  97  * aren't responsible for freeing it.
  98  */
  99 static int find_my_numa_node(void)
 100 {
 101     hwloc_obj_t obj;
 102     hwloc_bitmap_t cpuset;
 103 
 104     if (NULL != my_numa_node) {
 105         return OPAL_SUCCESS;
 106     }
 107 
 108     /* Get this process' binding */
 109     cpuset = hwloc_bitmap_alloc();
 110     if (NULL == cpuset) {
 111         return OPAL_ERR_OUT_OF_RESOURCE;
 112     }
 113     if (0 != hwloc_get_cpubind(opal_hwloc_topology, cpuset, 0)) {
 114         hwloc_bitmap_free(cpuset);
 115         return OPAL_ERR_NOT_AVAILABLE;
 116     }
 117 
 118     /* Get the largest object type in the cpuset */
 119     obj = find_numa_node(cpuset);
 120     hwloc_bitmap_free(cpuset);
 121     if (NULL == obj) {
 122         return OPAL_ERR_NOT_AVAILABLE;
 123     }
 124 
 125     /* Happiness */
 126     my_numa_node = obj;
 127     num_numa_nodes = hwloc_get_nbobjs_by_type(opal_hwloc_topology,
 128                                               HWLOC_OBJ_NODE);
 129     return OPAL_SUCCESS;
 130 
 131 }
 132 
 133 /*
 134  * Find a NUMA node covering the device associated with this module
 135  */
 136 static hwloc_obj_t find_device_numa(opal_btl_usnic_module_t *module)
 137 {
 138     struct fi_usnic_info *uip;
 139     hwloc_obj_t obj;
 140 
 141     /* Bozo checks */
 142     assert(NULL != matrix);
 143     assert(NULL != my_numa_node);
 144 
 145     uip = &module->usnic_info;
 146 
 147     /* Look for the IP device name in the hwloc topology (the usnic
 148        device is simply an alternate API to reach the same device, so
 149        if we find the IP device name, we've found the usNIC device) */
 150     obj = NULL;
 151     while (NULL != (obj = hwloc_get_next_osdev(opal_hwloc_topology, obj))) {
 152         assert(HWLOC_OBJ_OS_DEVICE == obj->type);
 153         if (0 == strcmp(obj->name, uip->ui.v1.ui_ifname)) {
 154             break;
 155         }
 156     }
 157 
 158     /* Did not find it */
 159     if (NULL == obj) {
 160         return NULL;
 161     }
 162 
 163     /* Search upwards to find the device's NUMA node */
 164     /* Go upwards until we hit the NUMA node or run out of parents */
 165     while (obj->type > HWLOC_OBJ_NODE &&
 166            NULL != obj->parent) {
 167         obj = obj->parent;
 168     }
 169 
 170     /* Make sure we ended up on the NUMA node */
 171     if (obj->type != HWLOC_OBJ_NODE) {
 172         opal_output_verbose(5, USNIC_OUT,
 173                             "btl:usnic:filter_numa: could not find NUMA node for %s; filtering by NUMA distance not possible",
 174                             module->linux_device_name);
 175         return NULL;
 176     }
 177 
 178     return obj;
 179 }
 180 
 181 /*
 182  * Public entry point: find the hwloc NUMA distance from this process
 183  * to the usnic device in the specified module.
 184  */
 185 int opal_btl_usnic_hwloc_distance(opal_btl_usnic_module_t *module)
 186 {
 187     int ret;
 188     hwloc_obj_t dev_numa;
 189 
 190     /* Bozo check */
 191     assert(NULL != module);
 192 
 193     /* Is this process bound? */
 194     if (!proc_bound()) {
 195         opal_output_verbose(5, USNIC_OUT,
 196                             "btl:usnic:filter_numa: not sorting devices by NUMA distance (process not bound)");
 197         return OPAL_SUCCESS;
 198     }
 199 
 200     opal_output_verbose(5, USNIC_OUT,
 201                         "btl:usnic:filter_numa: filtering devices by NUMA distance");
 202 
 203     /* ensure we have the topology */
 204     if (OPAL_SUCCESS !=- opal_hwloc_base_get_topology()) {
 205         opal_output_verbose(5, USNIC_OUT,
 206                             "btl:usnic:filter_numa: not sorting devices by NUMA distance (topology not available)");
 207         return OPAL_SUCCESS;
 208     }
 209 
 210     /* Get the hwloc distance matrix for all NUMA nodes */
 211     if (OPAL_SUCCESS != (ret = get_distance_matrix())) {
 212         return ret;
 213     }
 214 
 215     /* Find my NUMA node */
 216     if (OPAL_SUCCESS != (ret = find_my_numa_node())) {
 217         return ret;
 218     }
 219     /* If my_numa_node is still NULL, that means we span more than 1
 220        NUMA node.  So... no sorting/pruning for you! */
 221     if (NULL == my_numa_node) {
 222         return OPAL_SUCCESS;
 223     }
 224 
 225     /* Find the NUMA node covering this module's device */
 226     dev_numa = find_device_numa(module);
 227 
 228     /* Lookup the distance between my NUMA node and the NUMA node of
 229        the device */
 230 #if HWLOC_API_VERSION < 0x20000
 231     if (NULL != dev_numa) {
 232         module->numa_distance =
 233             matrix->latency[dev_numa->logical_index * num_numa_nodes +
 234                             my_numa_node->logical_index];
 235 
 236         opal_output_verbose(5, USNIC_OUT,
 237                             "btl:usnic:filter_numa: %s is distance %d from me",
 238                             module->linux_device_name,
 239                             module->numa_distance);
 240     }
 241 #else
 242     if (NULL != dev_numa) {
 243         int myindex, devindex;
 244         unsigned int j;
 245         myindex = -1;
 246         for (j=0; j < matrix_nr; j++) {
 247             if (my_numa_node == matrix->objs[j]) {
 248                 myindex = j;
 249                 break;
 250             }
 251         }
 252         if (-1 == myindex) {
 253             return OPAL_SUCCESS;
 254         }
 255         devindex = -1;
 256         for (j=0; j < matrix_nr; j++) {
 257             if (dev_numa == matrix->objs[j]) {
 258                 devindex = j;
 259                 break;
 260             }
 261         }
 262         if (-1 == devindex) {
 263             return OPAL_SUCCESS;
 264         }
 265 
 266         module->numa_distance =
 267             matrix->values[(devindex * num_numa_nodes) + myindex];
 268 
 269         opal_output_verbose(5, USNIC_OUT,
 270                             "btl:usnic:filter_numa: %s is distance %d from me",
 271                             module->linux_device_name,
 272                             module->numa_distance);
 273     }
 274 #endif
 275 
 276     return OPAL_SUCCESS;
 277 }

/* [<][>][^][v][top][bottom][index][help] */