root/orte/mca/rmaps/base/rmaps_base_binding.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. reset_usage
  2. unbind_procs
  3. bind_generic
  4. bind_in_place
  5. bind_to_cpuset
  6. orte_rmaps_base_compute_bindings

   1 /*
   2  * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
   3  *                         University Research and Technology
   4  *                         Corporation.  All rights reserved.
   5  * Copyright (c) 2004-2011 The University of Tennessee and The University
   6  *                         of Tennessee Research Foundation.  All rights
   7  *                         reserved.
   8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
   9  *                         University of Stuttgart.  All rights reserved.
  10  * Copyright (c) 2004-2005 The Regents of the University of California.
  11  *                         All rights reserved.
  12  * Copyright (c) 2011-2014 Cisco Systems, Inc.  All rights reserved.
  13  * Copyright (c) 2011-2012 Los Alamos National Security, LLC.
  14  *                         All rights reserved.
  15  * Copyright (c) 2013-2018 Intel, Inc.  All rights reserved.
  16  * Copyright (c) 2015-2017 Research Organization for Information Science
  17  *                         and Technology (RIST). All rights reserved.
  18  * Copyright (c) 2018      Inria.  All rights reserved.
  19  * Copyright (c) 2019 IBM Corporation. All rights reserved.
  20  * $COPYRIGHT$
  21  *
  22  * Additional copyrights may follow
  23  *
  24  * $HEADER$
  25  */
  26 
  27 #include "orte_config.h"
  28 #include "orte/constants.h"
  29 
  30 #include <sys/types.h>
  31 #ifdef HAVE_UNISTD_H
  32 #include <unistd.h>
  33 #endif  /* HAVE_UNISTD_H */
  34 #include <string.h>
  35 
  36 #include "opal/util/if.h"
  37 #include "opal/util/output.h"
  38 #include "orte/mca/mca.h"
  39 #include "opal/mca/base/base.h"
  40 #include "opal/mca/hwloc/base/base.h"
  41 #include "opal/threads/tsd.h"
  42 
  43 #include "orte/types.h"
  44 #include "orte/util/show_help.h"
  45 #include "orte/util/name_fns.h"
  46 #include "orte/runtime/orte_globals.h"
  47 #include "orte/util/hostfile/hostfile.h"
  48 #include "orte/util/dash_host/dash_host.h"
  49 #include "orte/mca/errmgr/errmgr.h"
  50 #include "orte/mca/ess/ess.h"
  51 #include "orte/runtime/data_type_support/orte_dt_support.h"
  52 
  53 #include "orte/mca/rmaps/base/rmaps_private.h"
  54 #include "orte/mca/rmaps/base/base.h"
  55 
  56 static bool membind_warned=false;
  57 
  58 static void reset_usage(orte_node_t *node, orte_jobid_t jobid)
  59 {
  60     int j;
  61     orte_proc_t *proc;
  62     opal_hwloc_obj_data_t *data=NULL;
  63     hwloc_obj_t bound;
  64 
  65     opal_output_verbose(10, orte_rmaps_base_framework.framework_output,
  66                         "%s reset_usage: node %s has %d procs on it",
  67                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
  68                         node->name, node->num_procs);
  69 
  70     /* start by clearing any existing info */
  71     opal_hwloc_base_clear_usage(node->topology->topo);
  72 
  73     /* cycle thru the procs on the node and record
  74      * their usage in the topology
  75      */
  76     for (j=0; j < node->procs->size; j++) {
  77         if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
  78             continue;
  79         }
  80         /* ignore procs from this job */
  81         if (proc->name.jobid == jobid) {
  82             opal_output_verbose(10, orte_rmaps_base_framework.framework_output,
  83                                 "%s reset_usage: ignoring proc %s",
  84                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
  85                                 ORTE_NAME_PRINT(&proc->name));
  86             continue;
  87         }
  88         bound = NULL;
  89         if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_BOUND, (void**)&bound, OPAL_PTR) ||
  90             NULL == bound) {
  91             /* this proc isn't bound - ignore it */
  92             opal_output_verbose(10, orte_rmaps_base_framework.framework_output,
  93                                 "%s reset_usage: proc %s has no bind location",
  94                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
  95                                 ORTE_NAME_PRINT(&proc->name));
  96             continue;
  97         }
  98         data = (opal_hwloc_obj_data_t*)bound->userdata;
  99         if (NULL == data) {
 100             data = OBJ_NEW(opal_hwloc_obj_data_t);
 101             bound->userdata = data;
 102         }
 103         data->num_bound++;
 104         opal_output_verbose(10, orte_rmaps_base_framework.framework_output,
 105                             "%s reset_usage: proc %s is bound - total %d",
 106                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 107                             ORTE_NAME_PRINT(&proc->name), data->num_bound);
 108     }
 109 }
 110 
 111 static void unbind_procs(orte_job_t *jdata)
 112 {
 113     int j;
 114     orte_proc_t *proc;
 115 
 116     for (j=0; j < jdata->procs->size; j++) {
 117         if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
 118             continue;
 119         }
 120         orte_remove_attribute(&proc->attributes, ORTE_PROC_HWLOC_BOUND);
 121         orte_remove_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP);
 122     }
 123 }
 124 
 125 static int bind_generic(orte_job_t *jdata,
 126                         orte_node_t *node,
 127                         int target_depth)
 128 {
 129     int j, rc;
 130     orte_job_map_t *map;
 131     orte_proc_t *proc;
 132     hwloc_obj_t trg_obj, tmp_obj, nxt_obj;
 133     unsigned int ncpus;
 134     opal_hwloc_obj_data_t *data;
 135     int total_cpus;
 136     hwloc_cpuset_t totalcpuset;
 137     hwloc_obj_t locale;
 138     char *cpu_bitmap;
 139     unsigned min_bound;
 140 
 141     opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
 142                         "mca:rmaps: bind downward for job %s with bindings %s",
 143                         ORTE_JOBID_PRINT(jdata->jobid),
 144                         opal_hwloc_base_print_binding(jdata->map->binding));
 145     /* initialize */
 146     map = jdata->map;
 147     totalcpuset = hwloc_bitmap_alloc();
 148 
 149     /* cycle thru the procs */
 150     for (j=0; j < node->procs->size; j++) {
 151         if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
 152             continue;
 153         }
 154         /* ignore procs from other jobs */
 155         if (proc->name.jobid != jdata->jobid) {
 156             continue;
 157         }
 158         /* bozo check */
 159         locale = NULL;
 160         if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR) ||
 161             NULL == locale) {
 162             orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-locale", true, ORTE_NAME_PRINT(&proc->name));
 163             hwloc_bitmap_free(totalcpuset);
 164             return ORTE_ERR_SILENT;
 165         }
 166 
 167         /* use the min_bound object that intersects locale->cpuset at target_depth */
 168         tmp_obj = NULL;
 169         trg_obj = NULL;
 170         min_bound = UINT_MAX;
 171         while (NULL != (tmp_obj = hwloc_get_next_obj_by_depth(node->topology->topo, target_depth, tmp_obj))) {
 172             hwloc_obj_t root;
 173             opal_hwloc_topo_data_t *rdata;
 174             root = hwloc_get_root_obj(node->topology->topo);
 175             rdata = (opal_hwloc_topo_data_t*)root->userdata;
 176 
 177             if (!hwloc_bitmap_intersects(locale->cpuset, tmp_obj->cpuset))
 178                 continue;
 179 // From the old 3.x code trg_obj was picked via a call to
 180 // opal_hwloc_base_find_min_bound_target_under_obj() which
 181 // skiped over unavailable objects (via opal_hwloc_base_get_npus).
 182             if (rdata && rdata->available && !hwloc_bitmap_intersects(rdata->available, tmp_obj->cpuset))
 183                 continue;
 184 
 185             data = (opal_hwloc_obj_data_t*)tmp_obj->userdata;
 186             if (NULL == data) {
 187                 data = OBJ_NEW(opal_hwloc_obj_data_t);
 188                 tmp_obj->userdata = data;
 189             }
 190             if (data->num_bound < min_bound) {
 191                 min_bound = data->num_bound;
 192                 trg_obj = tmp_obj;
 193             }
 194         }
 195         if (NULL == trg_obj) {
 196             /* there aren't any such targets under this object */
 197             orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
 198             hwloc_bitmap_free(totalcpuset);
 199             return ORTE_ERR_SILENT;
 200         }
 201         /* record the location */
 202         orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_BOUND, ORTE_ATTR_LOCAL, trg_obj, OPAL_PTR);
 203 
 204         /* start with a clean slate */
 205         hwloc_bitmap_zero(totalcpuset);
 206         total_cpus = 0;
 207         nxt_obj = trg_obj;
 208         do {
 209             if (NULL == nxt_obj) {
 210                 /* could not find enough cpus to meet request */
 211                 orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
 212                 hwloc_bitmap_free(totalcpuset);
 213                 return ORTE_ERR_SILENT;
 214             }
 215             trg_obj = nxt_obj;
 216             /* get the number of cpus under this location */
 217             ncpus = opal_hwloc_base_get_npus(node->topology->topo, trg_obj);
 218             opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
 219                                 "%s GOT %d CPUS",
 220                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ncpus);
 221             /* track the number bound */
 222             if (NULL == (data = (opal_hwloc_obj_data_t*)trg_obj->userdata)) {
 223                 data = OBJ_NEW(opal_hwloc_obj_data_t);
 224                 trg_obj->userdata = data;
 225             }
 226             data->num_bound++;
 227             /* error out if adding a proc would cause overload and that wasn't allowed,
 228              * and it wasn't a default binding policy (i.e., the user requested it)
 229              */
 230             if (ncpus < data->num_bound &&
 231                 !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) {
 232                 if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
 233                     /* if the user specified a binding policy, then we cannot meet
 234                      * it since overload isn't allowed, so error out - have the
 235                      * message indicate that setting overload allowed will remove
 236                      * this restriction */
 237                     orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
 238                                    opal_hwloc_base_print_binding(map->binding), node->name,
 239                                    data->num_bound, ncpus);
 240                     hwloc_bitmap_free(totalcpuset);
 241                     return ORTE_ERR_SILENT;
 242                 } else {
 243                     /* if we have the default binding policy, then just don't bind */
 244                     OPAL_SET_BINDING_POLICY(map->binding, OPAL_BIND_TO_NONE);
 245                     unbind_procs(jdata);
 246                     hwloc_bitmap_zero(totalcpuset);
 247                     return ORTE_SUCCESS;
 248                 }
 249             }
 250             /* bind the proc here */
 251             hwloc_bitmap_or(totalcpuset, totalcpuset, trg_obj->cpuset);
 252             /* track total #cpus */
 253             total_cpus += ncpus;
 254             /* move to the next location, in case we need it */
 255             nxt_obj = trg_obj->next_cousin;
 256         } while (total_cpus < orte_rmaps_base.cpus_per_rank);
 257         hwloc_bitmap_list_asprintf(&cpu_bitmap, totalcpuset);
 258         opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
 259                             "%s PROC %s BITMAP %s",
 260                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 261                             ORTE_NAME_PRINT(&proc->name), cpu_bitmap);
 262         orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, ORTE_ATTR_GLOBAL, cpu_bitmap, OPAL_STRING);
 263         if (NULL != cpu_bitmap) {
 264             free(cpu_bitmap);
 265         }
 266         if (4 < opal_output_get_verbosity(orte_rmaps_base_framework.framework_output)) {
 267             char tmp1[1024], tmp2[1024];
 268             if (OPAL_ERR_NOT_BOUND == opal_hwloc_base_cset2str(tmp1, sizeof(tmp1),
 269                                                                node->topology->topo, totalcpuset)) {
 270                 opal_output(orte_rmaps_base_framework.framework_output,
 271                             "%s PROC %s ON %s IS NOT BOUND",
 272                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 273                             ORTE_NAME_PRINT(&proc->name), node->name);
 274             } else {
 275                 rc = opal_hwloc_base_cset2mapstr(tmp2, sizeof(tmp2), node->topology->topo, totalcpuset);
 276                 if (OPAL_SUCCESS != rc) {
 277                     ORTE_ERROR_LOG(rc);
 278                 }
 279                 opal_output(orte_rmaps_base_framework.framework_output,
 280                             "%s BOUND PROC %s[%s] TO %s: %s",
 281                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 282                             ORTE_NAME_PRINT(&proc->name), node->name,
 283                             tmp1, tmp2);
 284             }
 285         }
 286     }
 287     hwloc_bitmap_free(totalcpuset);
 288 
 289     return ORTE_SUCCESS;
 290 }
 291 
static int bind_in_place(orte_job_t *jdata,
                         hwloc_obj_type_t target,
                         unsigned cache_level)
{
    /* traverse the hwloc topology tree on each node downwards
     * until we find an unused object of type target - and then bind
     * the process to that target
     *
     * Binds each proc of the job to the hwloc object recorded as its
     * mapped locale (ORTE_PROC_HWLOC_LOCALE), shifting sideways among
     * same-depth cousins when the locale is already full. Sets the
     * ORTE_PROC_CPU_BITMAP and ORTE_PROC_HWLOC_BOUND attributes.
     * Returns ORTE_SUCCESS, or ORTE_ERR_SILENT after a show_help message.
     *
     * NOTE(review): the target and cache_level parameters are not
     * referenced anywhere in this body - placement appears to come
     * entirely from the stored locale attribute; confirm with callers.
     */
    int i, j;
    orte_job_map_t *map;
    orte_node_t *node;
    orte_proc_t *proc;
    unsigned int idx, ncpus;
    struct hwloc_topology_support *support;
    opal_hwloc_obj_data_t *data;
    hwloc_obj_t locale, sib;
    char *cpu_bitmap;
    bool found;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps: bind in place for job %s with bindings %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_print_binding(jdata->map->binding));
    /* initialize */
    map = jdata->map;

    for (i=0; i < map->nodes->size; i++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
            continue;
        }
        /* when a VM is in use, only process the node this daemon owns */
        if (!orte_no_vm && (int)ORTE_PROC_MY_NAME->vpid != node->index) {
            continue;
        }
        if (!orte_do_not_launch) {
            /* if we don't want to launch, then we are just testing the system,
             * so ignore questions about support capabilities
             */
            support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology->topo);
            /* check if topology supports cpubind - have to be careful here
             * as Linux doesn't currently support thread-level binding. This
             * may change in the future, though, and it isn't clear how hwloc
             * interprets the current behavior. So check both flags to be sure.
             */
            if (!support->cpubind->set_thisproc_cpubind &&
                !support->cpubind->set_thisthread_cpubind) {
                if (!OPAL_BINDING_REQUIRED(map->binding) ||
                    !OPAL_BINDING_POLICY_IS_SET(map->binding)) {
                    /* we are not required to bind, so ignore this */
                    continue;
                }
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name);
                return ORTE_ERR_SILENT;
            }
            /* check if topology supports membind - have to be careful here
             * as hwloc treats this differently than I (at least) would have
             * expected. Per hwloc, Linux memory binding is at the thread,
             * and not process, level. Thus, hwloc sets the "thisproc" flag
             * to "false" on all Linux systems, and uses the "thisthread" flag
             * to indicate binding capability - don't warn if the user didn't
             * specifically request binding
             */
            if (!support->membind->set_thisproc_membind &&
                !support->membind->set_thisthread_membind &&
                OPAL_BINDING_POLICY_IS_SET(map->binding)) {
                if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
                    /* warn only once per run (membind_warned is file-global) */
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name);
                    membind_warned = true;
                } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name);
                    return ORTE_ERR_SILENT;
                }
            }
        }

        /* some systems do not report cores, and so we can get a situation where our
         * default binding policy will fail for no necessary reason. So if we are
         * computing a binding due to our default policy, and no cores are found
         * on this node, just silently skip it - we will not bind
         */
        if (!OPAL_BINDING_POLICY_IS_SET(map->binding) &&
            HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology->topo, HWLOC_OBJ_CORE)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "Unable to bind-to core by default on node %s as no cores detected",
                                node->name);
            continue;
        }

        /* we share topologies in order
         * to save space, so we need to reset the usage info to reflect
         * our own current state
         */
        reset_usage(node, jdata->jobid);

        /* cycle thru the procs */
        for (j=0; j < node->procs->size; j++) {
            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                continue;
            }
            /* ignore procs from other jobs */
            if (proc->name.jobid != jdata->jobid) {
                continue;
            }
            /* bozo check - the mapper must have recorded a locale */
            locale = NULL;
            if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-locale", true, ORTE_NAME_PRINT(&proc->name));
                return ORTE_ERR_SILENT;
            }
            /* get the index of this location */
            if (UINT_MAX == (idx = opal_hwloc_base_get_obj_idx(node->topology->topo, locale, OPAL_HWLOC_AVAILABLE))) {
                ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
                return ORTE_ERR_SILENT;
            }
            /* get the number of cpus under this location */
            if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology->topo, locale))) {
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
                return ORTE_ERR_SILENT;
            }
            /* lazily create the usage record for this object */
            data = (opal_hwloc_obj_data_t*)locale->userdata;
            if (NULL == data) {
                data = OBJ_NEW(opal_hwloc_obj_data_t);
                locale->userdata = data;
            }
            /* if we don't have enough cpus to support this additional proc, try
             * shifting the location to a cousin that can support it - the important
             * thing is that we maintain the same level in the topology */
            if (ncpus < (data->num_bound+1)) {
                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "%s bind_in_place: searching right",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                sib = locale;
                found = false;
                /* scan cousins to the right for one with spare capacity */
                while (NULL != (sib = sib->next_cousin)) {
                    ncpus = opal_hwloc_base_get_npus(node->topology->topo, sib);
                    data = (opal_hwloc_obj_data_t*)sib->userdata;
                    if (NULL == data) {
                        data = OBJ_NEW(opal_hwloc_obj_data_t);
                        sib->userdata = data;
                    }
                    if (data->num_bound < ncpus) {
                        found = true;
                        locale = sib;
                        break;
                    }
                }
                if (!found) {
                    /* try the other direction */
                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                        "%s bind_in_place: searching left",
                                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                    sib = locale;
                    while (NULL != (sib = sib->prev_cousin)) {
                        ncpus = opal_hwloc_base_get_npus(node->topology->topo, sib);
                        data = (opal_hwloc_obj_data_t*)sib->userdata;
                        if (NULL == data) {
                            data = OBJ_NEW(opal_hwloc_obj_data_t);
                            sib->userdata = data;
                        }
                        if (data->num_bound < ncpus) {
                            found = true;
                            locale = sib;
                            break;
                        }
                    }
                }
                if (!found) {
                    /* no place to put this - see if overload is allowed */
                    if (!OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) {
                        if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
                            /* if the user specified a binding policy, then we cannot meet
                             * it since overload isn't allowed, so error out - have the
                             * message indicate that setting overload allowed will remove
                             * this restriction */
                            orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
                                           opal_hwloc_base_print_binding(map->binding), node->name,
                                           data->num_bound, ncpus);
                            return ORTE_ERR_SILENT;
                        } else {
                            /* if we have the default binding policy, then just don't bind */
                            OPAL_SET_BINDING_POLICY(map->binding, OPAL_BIND_TO_NONE);
                            unbind_procs(jdata);
                            return ORTE_SUCCESS;
                        }
                    }
                    /* overload allowed: fall through and bind to the
                     * original (full) locale anyway */
                }
            }
            /* track the number bound */
            data = (opal_hwloc_obj_data_t*)locale->userdata;  // just in case it changed
            if (NULL == data) {
                data = OBJ_NEW(opal_hwloc_obj_data_t);
                locale->userdata = data;
            }
            data->num_bound++;
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "BINDING PROC %s TO %s NUMBER %u",
                                ORTE_NAME_PRINT(&proc->name),
                                hwloc_obj_type_string(locale->type), idx);
            /* bind the proc here */
            hwloc_bitmap_list_asprintf(&cpu_bitmap, locale->cpuset);
            orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, ORTE_ATTR_GLOBAL, cpu_bitmap, OPAL_STRING);
            /* update the location, in case it changed */
            orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_BOUND, ORTE_ATTR_LOCAL, locale, OPAL_PTR);
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "%s BOUND PROC %s TO %s[%s:%u] on node %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&proc->name),
                                cpu_bitmap, hwloc_obj_type_string(locale->type),
                                idx, node->name);
            if (NULL != cpu_bitmap) {
                free(cpu_bitmap);
            }
        }
    }

    return ORTE_SUCCESS;
}
 508 
 509 static int bind_to_cpuset(orte_job_t *jdata)
 510 {
 511     /* bind each process to opal_hwloc_base_cpu_list */
 512     int i, j;
 513     orte_job_map_t *map;
 514     orte_node_t *node;
 515     orte_proc_t *proc;
 516     struct hwloc_topology_support *support;
 517     opal_hwloc_topo_data_t *sum;
 518     hwloc_obj_t root;
 519     char *cpu_bitmap;
 520     unsigned id;
 521     orte_local_rank_t lrank;
 522     hwloc_bitmap_t mycpuset, tset;
 523 
 524     opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
 525                         "mca:rmaps: bind job %s to cpus %s",
 526                         ORTE_JOBID_PRINT(jdata->jobid),
 527                         opal_hwloc_base_cpu_list);
 528     /* initialize */
 529     map = jdata->map;
 530     mycpuset = hwloc_bitmap_alloc();
 531 
 532     for (i=0; i < map->nodes->size; i++) {
 533         if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
 534             continue;
 535         }
 536         if (!orte_no_vm && (int)ORTE_PROC_MY_NAME->vpid != node->index) {
 537             continue;
 538         }
 539         if (!orte_do_not_launch) {
 540             /* if we don't want to launch, then we are just testing the system,
 541              * so ignore questions about support capabilities
 542              */
 543             support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology->topo);
 544             /* check if topology supports cpubind - have to be careful here
 545              * as Linux doesn't currently support thread-level binding. This
 546              * may change in the future, though, and it isn't clear how hwloc
 547              * interprets the current behavior. So check both flags to be sure.
 548              */
 549             if (!support->cpubind->set_thisproc_cpubind &&
 550                 !support->cpubind->set_thisthread_cpubind) {
 551                 if (!OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy)) {
 552                     /* we are not required to bind, so ignore this */
 553                     continue;
 554                 }
 555                 orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name);
 556                 hwloc_bitmap_free(mycpuset);
 557                 return ORTE_ERR_SILENT;
 558             }
 559             /* check if topology supports membind - have to be careful here
 560              * as hwloc treats this differently than I (at least) would have
 561              * expected. Per hwloc, Linux memory binding is at the thread,
 562              * and not process, level. Thus, hwloc sets the "thisproc" flag
 563              * to "false" on all Linux systems, and uses the "thisthread" flag
 564              * to indicate binding capability
 565              */
 566             if (!support->membind->set_thisproc_membind &&
 567                 !support->membind->set_thisthread_membind) {
 568                 if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
 569                     orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name);
 570                     membind_warned = true;
 571                 } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
 572                     orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name);
 573                     hwloc_bitmap_free(mycpuset);
 574                     return ORTE_ERR_SILENT;
 575                 }
 576             }
 577         }
 578         root = hwloc_get_root_obj(node->topology->topo);
 579         if (NULL == root->userdata) {
 580             /* something went wrong */
 581             ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
 582             hwloc_bitmap_free(mycpuset);
 583             return ORTE_ERR_NOT_FOUND;
 584         }
 585         sum = (opal_hwloc_topo_data_t*)root->userdata;
 586         if (NULL == sum->available) {
 587             /* another error */
 588             ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
 589             hwloc_bitmap_free(mycpuset);
 590             return ORTE_ERR_NOT_FOUND;
 591         }
 592         /* the cpu list in sum->available has already been filtered
 593          * to include _only_ the cpus defined by the user */
 594         for (j=0; j < node->procs->size; j++) {
 595             if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
 596                 continue;
 597             }
 598             /* ignore procs from other jobs */
 599             if (proc->name.jobid != jdata->jobid) {
 600                 continue;
 601             }
 602             if (OPAL_BIND_ORDERED_REQUESTED(jdata->map->binding)) {
 603                 /* assign each proc, in local rank order, to
 604                  * the corresponding cpu in the list */
 605                 id = hwloc_bitmap_first(sum->available);
 606                 lrank = 0;
 607                 while (lrank != proc->local_rank) {
 608                     id = hwloc_bitmap_next(sum->available, id);
 609                     if ((unsigned)-1 == id) {
 610                         break;
 611                     }
 612                     ++lrank;
 613                 }
 614                 if ((unsigned)-1 ==id) {
 615                     /* ran out of cpus - that's an error */
 616                     orte_show_help("help-orte-rmaps-base.txt", "rmaps:insufficient-cpus", true,
 617                                    node->name, (int)proc->local_rank, opal_hwloc_base_cpu_list);
 618                     hwloc_bitmap_free(mycpuset);
 619                     return ORTE_ERR_OUT_OF_RESOURCE;
 620                 }
 621                  /* set the bit of interest */
 622                 hwloc_bitmap_only(mycpuset, id);
 623                 tset = mycpuset;
 624             } else {
 625                 /* bind the proc to all assigned cpus */
 626                 tset = sum->available;
 627             }
 628             hwloc_bitmap_list_asprintf(&cpu_bitmap, tset);
 629             orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, ORTE_ATTR_GLOBAL, cpu_bitmap, OPAL_STRING);
 630             if (NULL != cpu_bitmap) {
 631                 free(cpu_bitmap);
 632             }
 633         }
 634     }
 635     hwloc_bitmap_free(mycpuset);
 636     return ORTE_SUCCESS;
 637 }
 638 
 639 int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
 640 {
 641     hwloc_obj_type_t hwb;
 642     unsigned clvl=0;
 643     opal_binding_policy_t bind;
 644     orte_mapping_policy_t map;
 645     orte_node_t *node;
 646     int i, rc;
 647     struct hwloc_topology_support *support;
 648     int bind_depth;
 649 
 650     opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
 651                         "mca:rmaps: compute bindings for job %s with policy %s[%x]",
 652                         ORTE_JOBID_PRINT(jdata->jobid),
 653                         opal_hwloc_base_print_binding(jdata->map->binding), jdata->map->binding);
 654 
 655     map = ORTE_GET_MAPPING_POLICY(jdata->map->mapping);
 656     bind = OPAL_GET_BINDING_POLICY(jdata->map->binding);
 657 
 658     if (ORTE_MAPPING_BYUSER == map) {
 659         /* user specified binding by rankfile - nothing for us to do */
 660         return ORTE_SUCCESS;
 661     }
 662 
 663     if (OPAL_BIND_TO_CPUSET == bind) {
 664         int rc;
 665         /* cpuset was given - setup the bindings */
 666         if (ORTE_SUCCESS != (rc = bind_to_cpuset(jdata))) {
 667             ORTE_ERROR_LOG(rc);
 668         }
 669         return rc;
 670     }
 671 
 672     if (OPAL_BIND_TO_NONE == bind) {
 673         /* no binding requested */
 674         return ORTE_SUCCESS;
 675     }
 676 
 677     if (OPAL_BIND_TO_BOARD == bind) {
 678         /* doesn't do anything at this time */
 679         return ORTE_SUCCESS;
 680     }
 681 
 682     /* binding requested - convert the binding level to the hwloc obj type */
 683     switch (bind) {
 684     case OPAL_BIND_TO_NUMA:
 685         hwb = HWLOC_OBJ_NODE;
 686         break;
 687     case OPAL_BIND_TO_SOCKET:
 688         hwb = HWLOC_OBJ_SOCKET;
 689         break;
 690     case OPAL_BIND_TO_L3CACHE:
 691         OPAL_HWLOC_MAKE_OBJ_CACHE(3, hwb, clvl);
 692         break;
 693     case OPAL_BIND_TO_L2CACHE:
 694         OPAL_HWLOC_MAKE_OBJ_CACHE(2, hwb, clvl);
 695         break;
 696     case OPAL_BIND_TO_L1CACHE:
 697         OPAL_HWLOC_MAKE_OBJ_CACHE(1, hwb, clvl);
 698         break;
 699     case OPAL_BIND_TO_CORE:
 700         hwb = HWLOC_OBJ_CORE;
 701         break;
 702     case OPAL_BIND_TO_HWTHREAD:
 703         hwb = HWLOC_OBJ_PU;
 704         break;
 705     default:
 706         ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
 707         return ORTE_ERR_BAD_PARAM;
 708     }
 709 
 710     /* if the job was mapped by the corresponding target, then
 711      * we bind in place
 712      *
 713      * otherwise, we have to bind either up or down the hwloc
 714      * tree. If we are binding upwards (e.g., mapped to hwthread
 715      * but binding to core), then we just climb the tree to find
 716      * the first matching object.
 717      *
 718      * if we are binding downwards (e.g., mapped to node and bind
 719      * to core), then we have to do a round-robin assigment of
 720      * procs to the resources below.
 721      */
 722 
 723     if (ORTE_MAPPING_BYDIST == map) {
 724         int rc = ORTE_SUCCESS;
 725         if (OPAL_BIND_TO_NUMA == bind) {
 726             opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
 727                                 "mca:rmaps: bindings for job %s - dist to numa",
 728                                 ORTE_JOBID_PRINT(jdata->jobid));
 729             if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_NODE, 0))) {
 730                 ORTE_ERROR_LOG(rc);
 731             }
 732         } else if (OPAL_BIND_TO_NUMA < bind) {
 733             /* bind every proc downwards */
 734             goto execute;
 735         }
 736         /* if the binding policy is less than numa, then we are unbound - so
 737          * just ignore this and return (should have been caught in prior
 738          * tests anyway as only options meeting that criteria are "none"
 739          * and "board")
 740          */
 741         return rc;
 742     }
 743 
 744     /* now deal with the remaining binding policies based on hardware */
 745     if (bind == map) {
 746         opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
 747                             "mca:rmaps: bindings for job %s - bind in place",
 748                             ORTE_JOBID_PRINT(jdata->jobid));
 749         if (ORTE_SUCCESS != (rc = bind_in_place(jdata, hwb, clvl))) {
 750             ORTE_ERROR_LOG(rc);
 751         }
 752         return rc;
 753     }
 754 
 755     /* we need to handle the remaining binding options on a per-node
 756      * basis because different nodes could potentially have different
 757      * topologies, with different relative depths for the two levels
 758      */
 759   execute:
 760     /* initialize */
 761     opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
 762                         "mca:rmaps: computing bindings for job %s",
 763                         ORTE_JOBID_PRINT(jdata->jobid));
 764 
 765     for (i=0; i < jdata->map->nodes->size; i++) {
 766         if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) {
 767             continue;
 768         }
 769         if (!orte_no_vm && !orte_do_not_launch &&
 770             (int)ORTE_PROC_MY_NAME->vpid != node->index) {
 771             continue;
 772         }
 773         if (!orte_do_not_launch) {
 774             /* if we don't want to launch, then we are just testing the system,
 775              * so ignore questions about support capabilities
 776              */
 777             support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology->topo);
 778             /* check if topology supports cpubind - have to be careful here
 779              * as Linux doesn't currently support thread-level binding. This
 780              * may change in the future, though, and it isn't clear how hwloc
 781              * interprets the current behavior. So check both flags to be sure.
 782              */
 783             if (!support->cpubind->set_thisproc_cpubind &&
 784                 !support->cpubind->set_thisthread_cpubind) {
 785                 if (!OPAL_BINDING_REQUIRED(jdata->map->binding) ||
 786                     !OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
 787                     /* we are not required to bind, so ignore this */
 788                     continue;
 789                 }
 790                 orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name);
 791                 return ORTE_ERR_SILENT;
 792             }
 793             /* check if topology supports membind - have to be careful here
 794              * as hwloc treats this differently than I (at least) would have
 795              * expected. Per hwloc, Linux memory binding is at the thread,
 796              * and not process, level. Thus, hwloc sets the "thisproc" flag
 797              * to "false" on all Linux systems, and uses the "thisthread" flag
 798              * to indicate binding capability - don't warn if the user didn't
 799              * specifically request binding
 800              */
 801             if (!support->membind->set_thisproc_membind &&
 802                 !support->membind->set_thisthread_membind &&
 803                 OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
 804                 if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
 805                     orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name);
 806                     membind_warned = true;
 807                 } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
 808                     orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name);
 809                     return ORTE_ERR_SILENT;
 810                 }
 811             }
 812         }
 813 
 814         /* some systems do not report cores, and so we can get a situation where our
 815          * default binding policy will fail for no necessary reason. So if we are
 816          * computing a binding due to our default policy, and no cores are found
 817          * on this node, just silently skip it - we will not bind
 818          */
 819         if (!OPAL_BINDING_POLICY_IS_SET(jdata->map->binding) &&
 820             HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology->topo, HWLOC_OBJ_CORE)) {
 821             opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
 822                                 "Unable to bind-to core by default on node %s as no cores detected",
 823                                 node->name);
 824             continue;
 825         }
 826 
 827         /* we share topologies in order
 828          * to save space, so we need to reset the usage info to reflect
 829          * our own current state
 830          */
 831         reset_usage(node, jdata->jobid);
 832 
 833         /* determine the relative depth on this node */
 834 #if HWLOC_API_VERSION < 0x20000
 835         if (HWLOC_OBJ_CACHE == hwb) {
 836             /* must use a unique function because blasted hwloc
 837              * just doesn't deal with caches very well...sigh
 838              */
 839             bind_depth = hwloc_get_cache_type_depth(node->topology->topo, clvl, (hwloc_obj_cache_type_t)-1);
 840         } else
 841 #endif
 842             bind_depth = hwloc_get_type_depth(node->topology->topo, hwb);
 843 #if HWLOC_API_VERSION < 0x20000
 844         if (0 > bind_depth)
 845 #else
 846         if (0 > bind_depth && HWLOC_TYPE_DEPTH_NUMANODE != bind_depth)
 847 #endif
 848         {
 849             /* didn't find such an object */
 850             orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-objects",
 851                            true, hwloc_obj_type_string(hwb), node->name);
 852             return ORTE_ERR_SILENT;
 853         }
 854         opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
 855                             "%s bind_depth: %d",
 856                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 857                             bind_depth);
 858         if (ORTE_SUCCESS != (rc = bind_generic(jdata, node, bind_depth))) {
 859             ORTE_ERROR_LOG(rc);
 860             return rc;
 861         }
 862     }
 863 
 864     return ORTE_SUCCESS;
 865 }

/* [<][>][^][v][top][bottom][index][help] */