root/orte/mca/rmaps/round_robin/rmaps_rr_mappers.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. orte_rmaps_rr_byslot
  2. orte_rmaps_rr_bynode
  3. orte_rmaps_rr_byobj
  4. byobj_span

   1 /*
   2  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
   3  *                         University Research and Technology
   4  *                         Corporation.  All rights reserved.
   5  * Copyright (c) 2004-2018 The University of Tennessee and The University
   6  *                         of Tennessee Research Foundation.  All rights
   7  *                         reserved.
   8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
   9  *                         University of Stuttgart.  All rights reserved.
  10  * Copyright (c) 2004-2005 The Regents of the University of California.
  11  *                         All rights reserved.
  12  * Copyright (c) 2009-2013 Cisco Systems, Inc.  All rights reserved.
  13  * Copyright (c) 2013-2018 Intel, Inc. All rights reserved.
  14  * Copyright (c) 2015      Research Organization for Information Science
  15  *                         and Technology (RIST). All rights reserved.
  16  * $COPYRIGHT$
  17  *
  18  * Additional copyrights may follow
  19  *
  20  * $HEADER$
  21  */
  22 
  23 #include "orte_config.h"
  24 #include "orte/constants.h"
  25 
  26 #include <string.h>
  27 
  28 #include "opal/util/output.h"
  29 #include "opal/mca/hwloc/base/base.h"
  30 
  31 #include "orte/util/show_help.h"
  32 #include "orte/util/name_fns.h"
  33 #include "orte/runtime/orte_globals.h"
  34 #include "orte/mca/errmgr/errmgr.h"
  35 
  36 #include "orte/mca/rmaps/base/rmaps_private.h"
  37 #include "orte/mca/rmaps/base/base.h"
  38 #include "rmaps_rr.h"
  39 
  40 int orte_rmaps_rr_byslot(orte_job_t *jdata,
  41                          orte_app_context_t *app,
  42                          opal_list_t *node_list,
  43                          orte_std_cntr_t num_slots,
  44                          orte_vpid_t num_procs)
  45 {
  46     int i, nprocs_mapped;
  47     orte_node_t *node;
  48     orte_proc_t *proc;
  49     int num_procs_to_assign, extra_procs_to_assign=0, nxtra_nodes=0;
  50     hwloc_obj_t obj=NULL;
  51     float balance;
  52     bool add_one=false;
  53 
  54     opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
  55                         "mca:rmaps:rr: mapping by slot for job %s slots %d num_procs %lu",
  56                         ORTE_JOBID_PRINT(jdata->jobid), (int)num_slots, (unsigned long)num_procs);
  57 
  58     /* check to see if we can map all the procs */
  59     if (num_slots < (int)app->num_procs) {
  60         if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
  61             orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
  62                            true, app->num_procs, app->app, orte_process_info.nodename);
  63             ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
  64             return ORTE_ERR_SILENT;
  65         }
  66     }
  67 
  68     /* first pass: map the number of procs to each node until we
  69      * map all specified procs or use all allocated slots
  70      */
  71     nprocs_mapped = 0;
  72     OPAL_LIST_FOREACH(node, node_list, orte_node_t) {
  73         opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
  74                             "mca:rmaps:rr:slot working node %s",
  75                             node->name);
  76         /* get the root object as we are not assigning
  77          * locale here except at the node level
  78          */
  79         if (NULL != node->topology && NULL != node->topology->topo) {
  80             obj = hwloc_get_root_obj(node->topology->topo);
  81         }
  82         if (node->slots <= node->slots_inuse) {
  83             opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
  84                                 "mca:rmaps:rr:slot node %s is full - skipping",
  85                                 node->name);
  86             continue;
  87         }
  88         if (orte_rmaps_base_pernode) {
  89             num_procs_to_assign = 1;
  90         } else if (0 < orte_rmaps_base_n_pernode) {
  91             num_procs_to_assign = orte_rmaps_base_n_pernode;
  92         } else if (0 < orte_rmaps_base_n_persocket) {
  93             if (NULL == node->topology) {
  94                 orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing",
  95                                true, node->name);
  96                 return ORTE_ERR_SILENT;
  97             }
  98             num_procs_to_assign = orte_rmaps_base_n_persocket * opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, HWLOC_OBJ_PACKAGE, 0, OPAL_HWLOC_AVAILABLE);
  99         } else {
 100             /* assign a number of procs equal to the number of available slots */
 101             num_procs_to_assign = node->slots - node->slots_inuse;
 102         }
 103         opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
 104                             "mca:rmaps:rr:slot assigning %d procs to node %s",
 105                             (int)num_procs_to_assign, node->name);
 106 
 107         for (i=0; i < num_procs_to_assign && nprocs_mapped < app->num_procs; i++) {
 108             /* add this node to the map - do it only once */
 109             if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
 110                 ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
 111                 OBJ_RETAIN(node);
 112                 opal_pointer_array_add(jdata->map->nodes, node);
 113                 ++(jdata->map->num_nodes);
 114             }
 115             if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) {
 116                 return ORTE_ERR_OUT_OF_RESOURCE;
 117             }
 118             nprocs_mapped++;
 119             orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
 120         }
 121     }
 122 
 123     if (nprocs_mapped == app->num_procs) {
 124         /* we are done */
 125         return ORTE_SUCCESS;
 126     }
 127 
 128     opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
 129                         "mca:rmaps:rr:slot job %s is oversubscribed - performing second pass",
 130                         ORTE_JOBID_PRINT(jdata->jobid));
 131 
 132     /* second pass: if we haven't mapped everyone yet, it is
 133      * because we are oversubscribed. Figure out how many procs
 134      * to add
 135      */
 136     balance = (float)((int)app->num_procs - nprocs_mapped) / (float)opal_list_get_size(node_list);
 137     extra_procs_to_assign = (int)balance;
 138     if (0 < (balance - (float)extra_procs_to_assign)) {
 139         /* compute how many nodes need an extra proc */
 140         nxtra_nodes = app->num_procs - nprocs_mapped - (extra_procs_to_assign * opal_list_get_size(node_list));
 141         /* add one so that we add an extra proc to the first nodes
 142          * until all procs are mapped
 143          */
 144         extra_procs_to_assign++;
 145         /* flag that we added one */
 146         add_one = true;
 147     }
 148 
 149     OPAL_LIST_FOREACH(node, node_list, orte_node_t) {
 150         opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
 151                             "mca:rmaps:rr:slot working node %s",
 152                             node->name);
 153         /* get the root object as we are not assigning
 154          * locale except at the node level
 155          */
 156         if (NULL != node->topology && NULL != node->topology->topo) {
 157             obj = hwloc_get_root_obj(node->topology->topo);
 158         }
 159 
 160         /* add this node to the map - do it only once */
 161         if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
 162             ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
 163             OBJ_RETAIN(node);
 164             opal_pointer_array_add(jdata->map->nodes, node);
 165             ++(jdata->map->num_nodes);
 166         }
 167         if (add_one) {
 168             if (0 == nxtra_nodes) {
 169                 --extra_procs_to_assign;
 170                 add_one = false;
 171             } else {
 172                 --nxtra_nodes;
 173             }
 174         }
 175         if(node->slots <= node->slots_inuse) {
 176             /* nodes are already oversubscribed */
 177             num_procs_to_assign = extra_procs_to_assign;
 178         }
 179         else {
 180             /* nodes have some room */
 181             num_procs_to_assign = node->slots - node->slots_inuse + extra_procs_to_assign;
 182         }
 183         opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
 184                             "mca:rmaps:rr:slot adding up to %d procs to node %s",
 185                             num_procs_to_assign, node->name);
 186         for (i=0; i < num_procs_to_assign && nprocs_mapped < app->num_procs; i++) {
 187             if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) {
 188                 return ORTE_ERR_OUT_OF_RESOURCE;
 189             }
 190             nprocs_mapped++;
 191             orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
 192         }
 193         /* not all nodes are equal, so only set oversubscribed for
 194          * this node if it is in that state
 195          */
 196         if (node->slots < (int)node->num_procs) {
 197             /* flag the node as oversubscribed so that sched-yield gets
 198              * properly set
 199              */
 200             ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED);
 201             ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_OVERSUBSCRIBED);
 202             /* check for permission */
 203             if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
 204                 /* if we weren't given a directive either way, then we will error out
 205                  * as the #slots were specifically given, either by the host RM or
 206                  * via hostfile/dash-host */
 207                 if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping))) {
 208                     orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
 209                                    true, app->num_procs, app->app, orte_process_info.nodename);
 210                     ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
 211                     return ORTE_ERR_SILENT;
 212                 } else if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
 213                     /* if we were explicitly told not to oversubscribe, then don't */
 214                     orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
 215                                    true, app->num_procs, app->app, orte_process_info.nodename);
 216                     ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
 217                     return ORTE_ERR_SILENT;
 218                 }
 219             }
 220         }
 221         /* if we have mapped everything, then we are done */
 222         if (nprocs_mapped == app->num_procs) {
 223             break;
 224         }
 225     }
 226     return ORTE_SUCCESS;
 227 }
 228 
 229 int orte_rmaps_rr_bynode(orte_job_t *jdata,
 230                          orte_app_context_t *app,
 231                          opal_list_t *node_list,
 232                          orte_std_cntr_t num_slots,
 233                          orte_vpid_t num_procs)
 234 {
 235     int j, nprocs_mapped, nnodes;
 236     orte_node_t *node;
 237     orte_proc_t *proc;
 238     int num_procs_to_assign, navg;
 239     int extra_procs_to_assign=0, nxtra_nodes=0;
 240     hwloc_obj_t obj=NULL;
 241     float balance;
 242     bool add_one=false;
 243     bool oversubscribed=false;
 244 
 245     opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
 246                         "mca:rmaps:rr: mapping by node for job %s app %d slots %d num_procs %lu",
 247                         ORTE_JOBID_PRINT(jdata->jobid), (int)app->idx,
 248                         (int)num_slots, (unsigned long)num_procs);
 249 
 250     /* quick check to see if we can map all the procs */
 251     if (num_slots < (int)app->num_procs) {
 252         if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
 253             orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
 254                            true, app->num_procs, app->app, orte_process_info.nodename);
 255             ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
 256             return ORTE_ERR_SILENT;
 257         }
 258         oversubscribed = true;
 259     }
 260 
 261     nnodes = opal_list_get_size(node_list);
 262     nprocs_mapped = 0;
 263 
 264     do {
 265         /* divide the procs evenly across all nodes - this is the
 266          * average we have to maintain as we go, but we adjust
 267          * the number on each node to reflect its available slots.
 268          * Obviously, if all nodes have the same number of slots,
 269          * then the avg is what we get on each node - this is
 270          * the most common situation.
 271          */
 272         navg = ((int)app->num_procs - nprocs_mapped) / nnodes;
 273         if (0 == navg) {
 274             /* if there are less procs than nodes, we have to
 275              * place at least one/node
 276              */
 277             navg = 1;
 278         }
 279 
 280         /* compute how many extra procs to put on each node */
 281         balance = (float)(((int)app->num_procs - nprocs_mapped) - (navg * nnodes)) / (float)nnodes;
 282         extra_procs_to_assign = (int)balance;
 283         nxtra_nodes = 0;
 284         add_one = false;
 285         if (0 < (balance - (float)extra_procs_to_assign)) {
 286             /* compute how many nodes need an extra proc */
 287             nxtra_nodes = ((int)app->num_procs - nprocs_mapped) - ((navg + extra_procs_to_assign) * nnodes);
 288             /* add one so that we add an extra proc to the first nodes
 289              * until all procs are mapped
 290              */
 291             extra_procs_to_assign++;
 292             /* flag that we added one */
 293             add_one = true;
 294         }
 295 
 296         opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
 297                             "mca:rmaps:rr: mapping by node navg %d extra_procs %d extra_nodes %d",
 298                             navg, extra_procs_to_assign, nxtra_nodes);
 299 
 300         nnodes = 0;
 301         OPAL_LIST_FOREACH(node, node_list, orte_node_t) {
 302             /* get the root object as we are not assigning
 303              * locale except at the node level
 304              */
 305             if (NULL != node->topology && NULL != node->topology->topo) {
 306                 obj = hwloc_get_root_obj(node->topology->topo);
 307             }
 308             /* add this node to the map, but only do so once */
 309             if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
 310                 ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
 311                 OBJ_RETAIN(node);
 312                 opal_pointer_array_add(jdata->map->nodes, node);
 313                 ++(jdata->map->num_nodes);
 314             }
 315             if (orte_rmaps_base_pernode) {
 316                 num_procs_to_assign = 1;
 317             } else if (0 < orte_rmaps_base_n_pernode) {
 318                 num_procs_to_assign = orte_rmaps_base_n_pernode;
 319             } else if (0 < orte_rmaps_base_n_persocket) {
 320                 num_procs_to_assign = orte_rmaps_base_n_persocket * opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, HWLOC_OBJ_PACKAGE, 0, OPAL_HWLOC_AVAILABLE);
 321             } else  if (oversubscribed) {
 322                 /* compute the number of procs to go on this node */
 323                 if (add_one) {
 324                     if (0 == nxtra_nodes) {
 325                         --extra_procs_to_assign;
 326                         add_one = false;
 327                     } else {
 328                         --nxtra_nodes;
 329                     }
 330                 }
 331                 /* everybody just takes their share */
 332                 num_procs_to_assign = navg + extra_procs_to_assign;
 333             } else if (node->slots <= node->slots_inuse) {
 334                 /* since we are not oversubcribed, ignore this node */
 335                 continue;
 336             } else {
 337                 /* if we are not oversubscribed, then there are enough
 338                  * slots to handle all the procs. However, not every
 339                  * node will have the same number of slots, so we
 340                  * have to track how many procs to "shift" elsewhere
 341                  * to make up the difference
 342                  */
 343 
 344                 /* compute the number of procs to go on this node */
 345                 if (add_one) {
 346                     if (0 == nxtra_nodes) {
 347                         --extra_procs_to_assign;
 348                         add_one = false;
 349                     } else {
 350                         --nxtra_nodes;
 351                     }
 352                 }
 353                 /* if slots < avg + extra (adjusted for cpus/proc), then try to take all */
 354                 if ((node->slots - node->slots_inuse) < (navg + extra_procs_to_assign)) {
 355                     num_procs_to_assign = node->slots - node->slots_inuse;
 356                     /* if we can't take any proc, skip following steps */
 357                     if (num_procs_to_assign == 0) {
 358                         continue;
 359                     }
 360                 } else {
 361                 /* take the avg + extra */
 362                     num_procs_to_assign = navg + extra_procs_to_assign;
 363                 }
 364                 OPAL_OUTPUT_VERBOSE((20, orte_rmaps_base_framework.framework_output,
 365                                      "%s NODE %s AVG %d ASSIGN %d EXTRA %d",
 366                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name,
 367                                      navg, num_procs_to_assign, extra_procs_to_assign));
 368             }
 369             nnodes++; // track how many nodes remain available
 370             OPAL_OUTPUT_VERBOSE((20, orte_rmaps_base_framework.framework_output,
 371                                  "%s NODE %s ASSIGNING %d",
 372                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name,
 373                                  num_procs_to_assign));
 374             for (j=0; j < num_procs_to_assign && nprocs_mapped < app->num_procs; j++) {
 375                 if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) {
 376                     return ORTE_ERR_OUT_OF_RESOURCE;
 377                 }
 378                 nprocs_mapped++;
 379                 orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
 380             }
 381             /* not all nodes are equal, so only set oversubscribed for
 382              * this node if it is in that state
 383              */
 384             if (node->slots < (int)node->num_procs) {
 385                 /* flag the node as oversubscribed so that sched-yield gets
 386                  * properly set
 387                  */
 388                 ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED);
 389                 ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_OVERSUBSCRIBED);
 390                 /* check for permission */
 391                 if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
 392                     /* if we weren't given a directive either way, then we will error out
 393                      * as the #slots were specifically given, either by the host RM or
 394                      * via hostfile/dash-host */
 395                     if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping))) {
 396                         orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
 397                                        true, app->num_procs, app->app, orte_process_info.nodename);
 398                         ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
 399                         return ORTE_ERR_SILENT;
 400                     } else if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
 401                         /* if we were explicitly told not to oversubscribe, then don't */
 402                         orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
 403                                        true, app->num_procs, app->app, orte_process_info.nodename);
 404                         ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
 405                         return ORTE_ERR_SILENT;
 406                     }
 407                 }
 408             }
 409             if (nprocs_mapped == app->num_procs) {
 410                 /* we are done */
 411                 break;
 412             }
 413         }
 414     } while (nprocs_mapped < app->num_procs && 0 < nnodes);
 415 
 416     /* now fillin as required until fully mapped */
 417     while (nprocs_mapped < app->num_procs) {
 418         OPAL_LIST_FOREACH(node, node_list, orte_node_t) {
 419             /* get the root object as we are not assigning
 420              * locale except at the node level
 421              */
 422             if (NULL != node->topology && NULL != node->topology->topo) {
 423                 obj = hwloc_get_root_obj(node->topology->topo);
 424             }
 425 
 426            OPAL_OUTPUT_VERBOSE((20, orte_rmaps_base_framework.framework_output,
 427                                  "%s ADDING PROC TO NODE %s",
 428                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name));
 429             if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) {
 430                 return ORTE_ERR_OUT_OF_RESOURCE;
 431             }
 432             nprocs_mapped++;
 433             orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
 434             /* not all nodes are equal, so only set oversubscribed for
 435              * this node if it is in that state
 436              */
 437             if (node->slots < (int)node->num_procs) {
 438                 /* flag the node as oversubscribed so that sched-yield gets
 439                  * properly set
 440                  */
 441                 ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED);
 442                 ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_OVERSUBSCRIBED);
 443             }
 444             if (nprocs_mapped == app->num_procs) {
 445                 /* we are done */
 446                 break;
 447             }
 448         }
 449     }
 450 
 451     return ORTE_SUCCESS;
 452 }
 453 
 454 static  int byobj_span(orte_job_t *jdata,
 455                        orte_app_context_t *app,
 456                        opal_list_t *node_list,
 457                        orte_std_cntr_t num_slots,
 458                        orte_vpid_t num_procs,
 459                        hwloc_obj_type_t target, unsigned cache_level);
 460 
 461 /* mapping by hwloc object looks a lot like mapping by node,
 462  * but has the added complication of possibly having different
 463  * numbers of objects on each node
 464  */
 465 int orte_rmaps_rr_byobj(orte_job_t *jdata,
 466                         orte_app_context_t *app,
 467                         opal_list_t *node_list,
 468                         orte_std_cntr_t num_slots,
 469                         orte_vpid_t num_procs,
 470                         hwloc_obj_type_t target, unsigned cache_level)
 471 {
 472     int i, nmapped, nprocs_mapped;
 473     orte_node_t *node;
 474     orte_proc_t *proc;
 475     int nprocs, start;
 476     hwloc_obj_t obj=NULL;
 477     unsigned int nobjs;
 478     bool add_one;
 479     bool second_pass;
 480 
 481     /* there are two modes for mapping by object: span and not-span. The
 482      * span mode essentially operates as if there was just a single
 483      * "super-node" in the system - i.e., it balances the load across
 484      * all objects of the indicated type regardless of their location.
 485      * In essence, it acts as if we placed one proc on each object, cycling
 486      * across all objects on all nodes, and then wrapped around to place
 487      * another proc on each object, doing so until all procs were placed.
 488      *
 489      * In contrast, the non-span mode operates similar to byslot mapping.
 490      * All slots on each node are filled, assigning each proc to an object
 491      * on that node in a balanced fashion, and then the mapper moves on
 492      * to the next node. Thus, procs tend to be "front loaded" onto the
 493      * list of nodes, as opposed to being "load balanced" in the span mode
 494      */
 495     if (ORTE_MAPPING_SPAN & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
 496         return byobj_span(jdata, app, node_list, num_slots,
 497                           num_procs, target, cache_level);
 498     }
 499 
 500     opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
 501                         "mca:rmaps:rr: mapping no-span by %s for job %s slots %d num_procs %lu",
 502                         hwloc_obj_type_string(target),
 503                         ORTE_JOBID_PRINT(jdata->jobid),
 504                         (int)num_slots, (unsigned long)num_procs);
 505 
 506     /* quick check to see if we can map all the procs */
 507     if (num_slots < app->num_procs) {
 508         if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
 509             orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
 510                            true, app->num_procs, app->app, orte_process_info.nodename);
 511             ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
 512             return ORTE_ERR_SILENT;
 513         }
 514     }
 515 
 516     /* we know we have enough slots, or that oversubscrption is allowed, so
 517      * start mapping procs onto objects, filling each object as we go until
 518      * all procs are mapped. If one pass doesn't catch all the required procs,
 519      * then loop thru the list again to handle the oversubscription
 520      */
 521     nprocs_mapped = 0;
 522     second_pass = false;
 523     do {
 524         add_one = false;
 525         OPAL_LIST_FOREACH(node, node_list, orte_node_t) {
 526             if (NULL == node->topology || NULL == node->topology->topo) {
 527                 orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing",
 528                                true, node->name);
 529                 return ORTE_ERR_SILENT;
 530             }
 531             start = 0;
 532             /* get the number of objects of this type on this node */
 533             nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, target, cache_level, OPAL_HWLOC_AVAILABLE);
 534             if (0 == nobjs) {
 535                 continue;
 536             }
 537             opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
 538                                 "mca:rmaps:rr: found %u %s objects on node %s",
 539                                 nobjs, hwloc_obj_type_string(target), node->name);
 540 
 541             /* if this is a comm_spawn situation, start with the object
 542              * where the parent left off and increment */
 543             if (ORTE_JOBID_INVALID != jdata->originator.jobid) {
 544                 start = (jdata->bkmark_obj + 1) % nobjs;
 545             }
 546             /* compute the number of procs to go on this node */
 547             if (orte_rmaps_base_pernode) {
 548                 nprocs = 1;
 549             } else if (0 < orte_rmaps_base_n_pernode) {
 550                 nprocs = orte_rmaps_base_n_pernode;
 551             } else if (0 < orte_rmaps_base_n_persocket) {
 552                 if (HWLOC_OBJ_PACKAGE == target) {
 553                     nprocs = orte_rmaps_base_n_persocket * nobjs;
 554                 } else {
 555                     nprocs = orte_rmaps_base_n_persocket * opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, HWLOC_OBJ_PACKAGE, 0, OPAL_HWLOC_AVAILABLE);
 556                 }
 557             } else {
 558                 nprocs = node->slots - node->slots_inuse;
 559             }
 560             opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
 561                                 "mca:rmaps:rr: calculated nprocs %d", nprocs);
 562             if (nprocs < 1) {
 563                 if (second_pass) {
 564                     /* already checked for oversubscription permission, so at least put
 565                      * one proc on it
 566                      */
 567                     nprocs = 1;
 568                     /* offset our starting object position to avoid always
 569                      * hitting the first one
 570                      */
 571                     start = node->num_procs % nobjs;
 572                 } else {
 573                     continue;
 574                 }
 575             }
 576             /* add this node to the map, if reqd */
 577             if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
 578                 ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
 579                 OBJ_RETAIN(node);
 580                 opal_pointer_array_add(jdata->map->nodes, node);
 581                 ++(jdata->map->num_nodes);
 582             }
 583             nmapped = 0;
 584             opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
 585                                 "mca:rmaps:rr: assigning nprocs %d", nprocs);
 586             do {
 587                 /* loop through the number of objects */
 588                 for (i=0; i < (int)nobjs && nmapped < nprocs && nprocs_mapped < (int)app->num_procs; i++) {
 589                     opal_output_verbose(20, orte_rmaps_base_framework.framework_output,
 590                                         "mca:rmaps:rr: assigning proc to object %d", (i+start) % nobjs);
 591                     /* get the hwloc object */
 592                     if (NULL == (obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, target, cache_level, (i+start) % nobjs, OPAL_HWLOC_AVAILABLE))) {
 593                         ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
 594                         return ORTE_ERR_NOT_FOUND;
 595                     }
 596                     if (orte_rmaps_base.cpus_per_rank > (int)opal_hwloc_base_get_npus(node->topology->topo, obj)) {
 597                         orte_show_help("help-orte-rmaps-base.txt", "mapping-too-low", true,
 598                                        orte_rmaps_base.cpus_per_rank, opal_hwloc_base_get_npus(node->topology->topo, obj),
 599                                        orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
 600                         return ORTE_ERR_SILENT;
 601                     }
 602                     if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) {
 603                         return ORTE_ERR_OUT_OF_RESOURCE;
 604                     }
 605                     nprocs_mapped++;
 606                     nmapped++;
 607                     orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
 608                 }
 609             } while (nmapped < nprocs && nprocs_mapped < (int)app->num_procs);
 610             add_one = true;
 611             /* not all nodes are equal, so only set oversubscribed for
 612              * this node if it is in that state
 613              */
 614             if (node->slots < (int)node->num_procs) {
 615                 /* flag the node as oversubscribed so that sched-yield gets
 616                  * properly set
 617                  */
 618                 ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED);
 619                 ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_OVERSUBSCRIBED);
 620                 /* check for permission */
 621                 if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
 622                     /* if we weren't given a directive either way, then we will error out
 623                      * as the #slots were specifically given, either by the host RM or
 624                      * via hostfile/dash-host */
 625                     if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping))) {
 626                         orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
 627                                        true, app->num_procs, app->app, orte_process_info.nodename);
 628                         ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
 629                         return ORTE_ERR_SILENT;
 630                     } else if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
 631                         /* if we were explicitly told not to oversubscribe, then don't */
 632                         orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
 633                                        true, app->num_procs, app->app, orte_process_info.nodename);
 634                         ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
 635                         return ORTE_ERR_SILENT;
 636                     }
 637                 }
 638             }
 639             if (nprocs_mapped == app->num_procs) {
 640                 /* we are done */
 641                 break;
 642             }
 643         }
 644         second_pass = true;
 645     } while (add_one && nprocs_mapped < app->num_procs);
 646 
 647     if (nprocs_mapped < app->num_procs) {
 648         /* usually means there were no objects of the requested type */
 649         return ORTE_ERR_NOT_FOUND;
 650     }
 651 
 652     return ORTE_SUCCESS;
 653 }
 654 
 /*
  * Map the procs of one app context across the given nodes, spreading them
  * over hwloc objects of type `target` while treating the objects of ALL
  * nodes in the list as a single pool.
  *
  * The function first totals the objects of the requested type over every
  * node, derives an average number of procs per object (navg, minimum 1)
  * and a remainder (nxtra_objs), and then walks node by node, object by
  * object, placing procs and recording the object as each proc's
  * ORTE_PROC_HWLOC_LOCALE attribute.
  *
  * jdata       - job being mapped; nodes are added to jdata->map, and
  *               jdata->bookmark and the job's oversubscribed flag are
  *               updated here
  * app         - app context; num_procs, idx and app are read
  * node_list   - list of orte_node_t candidates, visited in list order
  * num_slots   - used for the up-front oversubscription check and logging
  * num_procs   - only used in the verbose log message in this function
  * target      - hwloc object type to map by
  * cache_level - passed through to the opal_hwloc_base_* lookups
  *
  * Returns ORTE_SUCCESS once the node walk finishes, or:
  *   ORTE_ERR_SILENT          - a help message was already shown (too few
  *                              slots with no-oversubscribe set, a node
  *                              without topology, or cpus_per_rank larger
  *                              than the PUs under an object)
  *   ORTE_ERR_NOT_FOUND       - no objects of the requested type at all, or
  *                              an object lookup returned NULL
  *   ORTE_ERR_OUT_OF_RESOURCE - orte_rmaps_base_setup_proc returned NULL
  *
  * NOTE(review): ORTE_SUCCESS is returned even if the walk ends with
  * nprocs_mapped < app->num_procs; the function just above this one in the
  * file returns ORTE_ERR_NOT_FOUND in that situation - confirm whether the
  * difference is intended.
  */
 655 static int byobj_span(orte_job_t *jdata,
 656                       orte_app_context_t *app,
 657                       opal_list_t *node_list,
 658                       orte_std_cntr_t num_slots,
 659                       orte_vpid_t num_procs,
 660                       hwloc_obj_type_t target, unsigned cache_level)
 661 {
 662     int i, j, nprocs_mapped, navg;
 663     orte_node_t *node;
 664     orte_proc_t *proc;
 665     int nprocs, nxtra_objs;
 666     hwloc_obj_t obj=NULL;
         /* holds the pool-wide object total first, then is reused as the
          * per-node object count inside the mapping loop below
          */
 667     unsigned int nobjs;
 668
 669     opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
 670                         "mca:rmaps:rr: mapping span by %s for job %s slots %d num_procs %lu",
 671                         hwloc_obj_type_string(target),
 672                         ORTE_JOBID_PRINT(jdata->jobid),
 673                         (int)num_slots, (unsigned long)num_procs);
 674
 675     /* quick check to see if we can map all the procs */
 676     if (num_slots < (int)app->num_procs) {
             /* too few slots is only fatal when the mapping directive
              * forbids oversubscription; otherwise we fall through
              */
 677         if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
 678             orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
 679                            true, app->num_procs, app->app, orte_process_info.nodename);
 680             ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
 681             return ORTE_ERR_SILENT;
 682         }
 683     }
 684
 685     /* we know we have enough slots, or that oversubscription is allowed, so
 686      * next determine how many total objects we have to work with
 687      */
 688     nobjs = 0;
 689     OPAL_LIST_FOREACH(node, node_list, orte_node_t) {
             /* every node must carry a topology; this is the only place it
              * is checked, the mapping loop below dereferences it directly
              */
 690         if (NULL == node->topology || NULL == node->topology->topo) {
 691             orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing",
 692                            true, node->name);
 693             return ORTE_ERR_SILENT;
 694         }
 695         /* get the number of objects of this type on this node */
 696         nobjs += opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, target, cache_level, OPAL_HWLOC_AVAILABLE);
 697     }
 698
         /* also guards the division just below against a zero divisor */
 699     if (0 == nobjs) {
 700         return ORTE_ERR_NOT_FOUND;
 701     }
 702
 703     /* divide the procs evenly across all objects */
 704     navg = app->num_procs / nobjs;
 705     if (0 == navg) {
 706         /* if there are fewer procs than objects, we have to
 707          * place at least one/obj
 708          */
 709         navg = 1;
 710     }
 711
 712     /* compute how many objs need an extra proc */
         /* the result goes negative when navg was bumped up to 1 above, so
          * it is clamped to zero in that case.
          * NOTE(review): nobjs is unsigned, so the subtraction is presumably
          * carried out in unsigned arithmetic and relies on the conversion
          * back to the int nxtra_objs to produce a negative value - confirm
          */
 713     if (0 > (nxtra_objs = app->num_procs - (navg * nobjs))) {
 714         nxtra_objs = 0;
 715     }
 716
 717     opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
 718                         "mca:rmaps:rr: mapping by %s navg %d extra_objs %d",
 719                         hwloc_obj_type_string(target),
 720                         navg, nxtra_objs);
 721
 722     nprocs_mapped = 0;
 723     OPAL_LIST_FOREACH(node, node_list, orte_node_t) {
 724         /* add this node to the map, if reqd */
 725         if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
 726             ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
                 /* a reference is taken before the node pointer is stored
                  * in the map's node array
                  */
 727             OBJ_RETAIN(node);
 728             opal_pointer_array_add(jdata->map->nodes, node);
 729             ++(jdata->map->num_nodes);
 730         }
 731         /* get the number of objects of this type on this node */
             /* from here on nobjs is the count for this node only, no
              * longer the total computed above
              */
 732         nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, target, cache_level, OPAL_HWLOC_AVAILABLE);
             /* NOTE(review): nobjs is unsigned but is printed with %d */
 733         opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
 734                             "mca:rmaps:rr:byobj: found %d objs on node %s", nobjs, node->name);
 735         /* loop through the number of objects */
 736         for (i=0; i < (int)nobjs && nprocs_mapped < (int)app->num_procs; i++) {
 737             /* get the hwloc object */
 738             if (NULL == (obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, target, cache_level, i, OPAL_HWLOC_AVAILABLE))) {
 739                 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
 740                 return ORTE_ERR_NOT_FOUND;
 741             }
                 /* refuse to map onto an object that has fewer PUs than a
                  * single rank has been told to use
                  */
 742             if (orte_rmaps_base.cpus_per_rank > (int)opal_hwloc_base_get_npus(node->topology->topo, obj)) {
 743                 orte_show_help("help-orte-rmaps-base.txt", "mapping-too-low", true,
 744                                orte_rmaps_base.cpus_per_rank, opal_hwloc_base_get_npus(node->topology->topo, obj),
 745                                orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
 746                 return ORTE_ERR_SILENT;
 747             }
 748             /* determine how many to map */
                 /* the pernode / n_pernode / n_persocket settings take
                  * precedence over the computed average; note that the
                  * value chosen here is the count placed on THIS object,
                  * since it bounds the inner loop below
                  */
 749             if (orte_rmaps_base_pernode) {
 750                 nprocs = 1;
 751             } else if (0 < orte_rmaps_base_n_pernode) {
 752                 nprocs = orte_rmaps_base_n_pernode;
 753             } else if (0 < orte_rmaps_base_n_persocket) {
 754                 if (HWLOC_OBJ_PACKAGE == target) {
                         /* nobjs already is this node's package count */
 755                     nprocs = orte_rmaps_base_n_persocket * nobjs;
 756                 } else {
 757                     nprocs = orte_rmaps_base_n_persocket * opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, HWLOC_OBJ_PACKAGE, 0, OPAL_HWLOC_AVAILABLE);
 758                 }
 759             } else {
 760                 nprocs = navg;
 761             }
                 /* the leftover procs go one apiece to the first nxtra_objs
                  * objects visited, whichever branch above set nprocs
                  */
 762             if (0 < nxtra_objs) {
 763                 nprocs++;
 764                 nxtra_objs--;
 765             }
 766             /* map the reqd number of procs */
 767             for (j=0; j < nprocs && nprocs_mapped < app->num_procs; j++) {
 768                 if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) {
 769                     return ORTE_ERR_OUT_OF_RESOURCE;
 770                 }
 771                 nprocs_mapped++;
                     /* remember which hwloc object this proc was placed on */
 772                 orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
 773             }
 774             /* keep track of the node we last used */
 775             jdata->bookmark = node;
 776         }
 777         /* not all nodes are equal, so only set oversubscribed for
 778          * this node if it is in that state
 779          */
             /* this block only sets flags; unlike the check at the top of
              * the function, it never returns an error
              */
 780         if (node->slots < (int)node->num_procs) {
 781             /* flag the node as oversubscribed so that sched-yield gets
 782              * properly set
 783              */
 784             ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED);
 785             ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_OVERSUBSCRIBED);
 786         }
 787         if (nprocs_mapped == app->num_procs) {
 788             /* we are done */
 789             break;
 790         }
 791     }
 792
         /* reached whether or not every requested proc was placed */
 793     return ORTE_SUCCESS;
 794 }

/* [<][>][^][v][top][bottom][index][help] */