root/orte/mca/rmaps/ppr/rmaps_ppr.c

DEFINITIONS

This source file includes the following definitions:
  1. ppr_mapper
  2. find_split
  3. prune
  4. assign_locations

/*
 * Copyright (c) 2011      Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2011      Los Alamos National Security, LLC.
 *                         All rights reserved.
 * Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
 * Copyright (c) 2015-2017 Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"

#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif  /* HAVE_UNISTD_H */
#include <string.h>

#include "opal/mca/hwloc/base/base.h"
#include "opal/util/argv.h"

#include "orte/util/show_help.h"
#include "orte/mca/errmgr/errmgr.h"

#include "orte/mca/rmaps/base/rmaps_private.h"
#include "orte/mca/rmaps/base/base.h"
#include "rmaps_ppr.h"

static int ppr_mapper(orte_job_t *jdata);
static int assign_locations(orte_job_t *jdata);

orte_rmaps_base_module_t orte_rmaps_ppr_module = {
    .map_job = ppr_mapper,
    .assign_locations = assign_locations
};

/* RHC: will eventually remove this
 * definition as it is no longer reqd
 * in the rest of OMPI system.
 *
 * Define a hierarchical level value that
 * helps resolve the hwloc behavior of
 * treating caches as a single type of
 * entity - must always be available
 */
typedef enum {
    OPAL_HWLOC_NODE_LEVEL=0,
    OPAL_HWLOC_NUMA_LEVEL,
    OPAL_HWLOC_SOCKET_LEVEL,
    OPAL_HWLOC_L3CACHE_LEVEL,
    OPAL_HWLOC_L2CACHE_LEVEL,
    OPAL_HWLOC_L1CACHE_LEVEL,
    OPAL_HWLOC_CORE_LEVEL,
    OPAL_HWLOC_HWTHREAD_LEVEL
} opal_hwloc_level_t;

static void prune(orte_jobid_t jobid,
                  orte_app_idx_t app_idx,
                  orte_node_t *node,
                  opal_hwloc_level_t *level,
                  orte_vpid_t *nmapped);

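/* number of procs to place on each object at a given hwloc level,
 * indexed by opal_hwloc_level_t - an entry left at zero means no
 * limit was specified for that level */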
static int ppr[OPAL_HWLOC_HWTHREAD_LEVEL+1];

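/* map each app_context of the job across the available nodes,
 * placing the requested number of procs on every instance of the
 * specified resource (node, socket, cache, core, hwthread, ...).
 * When more than one level is given, procs are first laid down at
 * the deepest level and then pruned upward so the limits at the
 * coarser levels are honored as well */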
static int ppr_mapper(orte_job_t *jdata)
{
    int rc = ORTE_SUCCESS, j, n;
    mca_base_component_t *c=&mca_rmaps_ppr_component.base_version;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_app_context_t *app;
    orte_vpid_t total_procs, nprocs_mapped;
    opal_hwloc_level_t start=OPAL_HWLOC_NODE_LEVEL;
    hwloc_obj_t obj;
    hwloc_obj_type_t lowest;
    unsigned cache_level=0;
    unsigned int nobjs, i;
    bool pruning_reqd = false;
    opal_hwloc_level_t level;
    opal_list_t node_list;
    opal_list_item_t *item;
    orte_std_cntr_t num_slots;
    orte_app_idx_t idx;
    char **ppr_req, **ck;
    size_t len;
    bool initial_map=true;

    /* only handle the initial launch of loadbalanced
     * or NPERxxx jobs - restarts of failed apps are
     * left to other mappers
     */
    if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:ppr: job %s being restarted - ppr cannot map",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (NULL != jdata->map->req_mapper &&
        0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) {
        /* a mapper has been specified, and it isn't me */
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:ppr: job %s not using ppr mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (NULL == jdata->map->ppr ||
        ORTE_MAPPING_PPR != ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        /* not for us */
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:ppr: job %s not using ppr mapper PPR %s policy %s",
                            ORTE_JOBID_PRINT(jdata->jobid),
                            (NULL == jdata->map->ppr) ? "NULL" : jdata->map->ppr,
                            (ORTE_MAPPING_PPR == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) ? "PPRSET" : "PPR NOTSET");
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:ppr: mapping job %s with ppr %s",
                        ORTE_JOBID_PRINT(jdata->jobid), jdata->map->ppr);

    /* flag that I did the mapping */
    if (NULL != jdata->map->last_mapper) {
        free(jdata->map->last_mapper);
    }
    jdata->map->last_mapper = strdup(c->mca_component_name);

    /* initialize */
    memset(ppr, 0, sizeof(ppr));

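    /* the ppr directive is a comma-separated list of "<count>:<resource>"
     * pairs - for example, "2:socket" places two procs on every socket.
     * Recognized resources: node, hwthread/thread, core, socket/skt,
     * l1cache, l2cache, l3cache, and numa */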
    /* parse option */
    n=0;
    ppr_req = opal_argv_split(jdata->map->ppr, ',');
    for (j=0; NULL != ppr_req[j]; j++) {
        /* split on the colon */
        ck = opal_argv_split(ppr_req[j], ':');
        if (2 != opal_argv_count(ck)) {
            /* must provide a specification */
            orte_show_help("help-orte-rmaps-ppr.txt", "invalid-ppr", true, jdata->map->ppr);
            opal_argv_free(ppr_req);
            opal_argv_free(ck);
            return ORTE_ERR_SILENT;
        }
        len = strlen(ck[1]);
        if (0 == strncasecmp(ck[1], "node", len)) {
            ppr[OPAL_HWLOC_NODE_LEVEL] = strtol(ck[0], NULL, 10);
            ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYNODE);
            start = OPAL_HWLOC_NODE_LEVEL;
            n++;
        } else if (0 == strncasecmp(ck[1], "hwthread", len) ||
                   0 == strncasecmp(ck[1], "thread", len)) {
            ppr[OPAL_HWLOC_HWTHREAD_LEVEL] = strtol(ck[0], NULL, 10);
            start = OPAL_HWLOC_HWTHREAD_LEVEL;
            ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYHWTHREAD);
            n++;
        } else if (0 == strncasecmp(ck[1], "core", len)) {
            ppr[OPAL_HWLOC_CORE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_CORE_LEVEL) {
                start = OPAL_HWLOC_CORE_LEVEL;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYCORE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "socket", len) ||
                   0 == strncasecmp(ck[1], "skt", len)) {
            ppr[OPAL_HWLOC_SOCKET_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_SOCKET_LEVEL) {
                start = OPAL_HWLOC_SOCKET_LEVEL;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSOCKET);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "l1cache", len)) {
            ppr[OPAL_HWLOC_L1CACHE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_L1CACHE_LEVEL) {
                start = OPAL_HWLOC_L1CACHE_LEVEL;
                cache_level = 1;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL1CACHE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "l2cache", len)) {
            ppr[OPAL_HWLOC_L2CACHE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_L2CACHE_LEVEL) {
                start = OPAL_HWLOC_L2CACHE_LEVEL;
                cache_level = 2;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL2CACHE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "l3cache", len)) {
            ppr[OPAL_HWLOC_L3CACHE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_L3CACHE_LEVEL) {
                start = OPAL_HWLOC_L3CACHE_LEVEL;
                cache_level = 3;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL3CACHE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "numa", len)) {
            ppr[OPAL_HWLOC_NUMA_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_NUMA_LEVEL) {
                start = OPAL_HWLOC_NUMA_LEVEL;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYNUMA);
            }
            n++;
        } else {
            /* unknown spec */
            orte_show_help("help-orte-rmaps-ppr.txt", "unrecognized-ppr-option", true, ck[1], jdata->map->ppr);
            opal_argv_free(ppr_req);
            opal_argv_free(ck);
            return ORTE_ERR_SILENT;
        }
        opal_argv_free(ck);
    }
    opal_argv_free(ppr_req);
    /* if nothing was given, that's an error */
    if (0 == n) {
        opal_output(0, "NOTHING GIVEN");
        return ORTE_ERR_SILENT;
    }
    /* if more than one level was specified, then pruning will be reqd */
    if (1 < n) {
        pruning_reqd = true;
    }

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:ppr: job %s assigned policy %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        orte_rmaps_base_print_mapping(jdata->map->mapping));

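    /* 'start' now identifies the deepest hwloc level named in the ppr
     * spec - procs are initially mapped at that level and, if more than
     * one level was given, pruned upward afterwards */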
    /* convenience */
    level = start;
    lowest = opal_hwloc_levels[start];

    for (idx=0; idx < (orte_app_idx_t)jdata->apps->size; idx++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, idx))) {
            continue;
        }

        /* if the number of total procs was given, set that
         * limit - otherwise, set to max so we simply fill
         * all the nodes with the pattern
         */
        if (0 < app->num_procs) {
            total_procs = app->num_procs;
        } else {
            total_procs = ORTE_VPID_MAX;
        }

        /* get the available nodes */
        OBJ_CONSTRUCT(&node_list, opal_list_t);
        if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                                  jdata->map->mapping, initial_map, false))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }
        /* flag that all subsequent requests should not reset the node->mapped flag */
        initial_map = false;

        /* if a bookmark exists from some prior mapping, set us to start there */
        jdata->bookmark = orte_rmaps_base_get_starting_point(&node_list, jdata);

        /* cycle across the nodes */
        nprocs_mapped = 0;
        for (item = opal_list_get_first(&node_list);
             item != opal_list_get_end(&node_list);
             item = opal_list_get_next(item)) {
            node = (orte_node_t*)item;
            /* bozo check */
            if (NULL == node->topology || NULL == node->topology->topo) {
                orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing",
                               true, node->name);
                rc = ORTE_ERR_SILENT;
                goto error;
            }
            /* add the node to the map, if needed */
            if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
                ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
                OBJ_RETAIN(node);
                opal_pointer_array_add(jdata->map->nodes, node);
                jdata->map->num_nodes++;
            }
            /* if we are mapping solely at the node level, just put
             * that many procs on this node
             */
            if (OPAL_HWLOC_NODE_LEVEL == start) {
                obj = hwloc_get_root_obj(node->topology->topo);
                for (j=0; j < ppr[start] && nprocs_mapped < total_procs; j++) {
                    if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, idx))) {
                        rc = ORTE_ERR_OUT_OF_RESOURCE;
                        goto error;
                    }
                    nprocs_mapped++;
                    orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
                }
            } else {
                /* get the number of lowest resources on this node */
                nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
                                                           lowest, cache_level,
                                                           OPAL_HWLOC_AVAILABLE);

                /* map the specified number of procs to each such resource on this node,
                 * recording the locale of each proc so we know its cpuset
                 */
                for (i=0; i < nobjs; i++) {
                    obj = opal_hwloc_base_get_obj_by_type(node->topology->topo,
                                                          lowest, cache_level,
                                                          i, OPAL_HWLOC_AVAILABLE);
                    for (j=0; j < ppr[start] && nprocs_mapped < total_procs; j++) {
                        if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, idx))) {
                            rc = ORTE_ERR_OUT_OF_RESOURCE;
                            goto error;
                        }
                        nprocs_mapped++;
                        orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
                    }
                }

                if (pruning_reqd) {
                    /* go up the ladder and prune the procs according to
                     * the specification, adjusting the count of procs on the
                     * node as we go
                     */
                    level--;
                    prune(jdata->jobid, idx, node, &level, &nprocs_mapped);
                }
            }

            if (!(ORTE_MAPPING_DEBUGGER & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping))) {
                /* set the total slots used */
                if ((int)node->num_procs <= node->slots) {
                    node->slots_inuse = (int)node->num_procs;
                } else {
                    node->slots_inuse = node->slots;
                }

                /* if no-oversubscribe was specified, check to see if
                 * we have violated the total slot specification - regardless,
                 * if slots_max was given, we are not allowed to violate it!
                 */
                if ((node->slots < (int)node->num_procs) ||
                    (0 < node->slots_max && node->slots_max < (int)node->num_procs)) {
                    if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
                        orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                                       true, node->num_procs, app->app);
                        ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
                        rc = ORTE_ERR_SILENT;
                        goto error;
                    }
                    /* flag the node as oversubscribed so that sched-yield gets
                     * properly set
                     */
                    ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED);
                    ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_OVERSUBSCRIBED);
                    /* check for permission */
                    if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
                        /* if we weren't given a directive either way, then we will error out
                         * as the #slots were specifically given, either by the host RM or
                         * via hostfile/dash-host */
                        if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping))) {
                            orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                                           true, app->num_procs, app->app);
                            ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
                            rc = ORTE_ERR_SILENT;
                            goto error;
                        } else if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
                            /* if we were explicitly told not to oversubscribe, then don't */
                            orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                                           true, app->num_procs, app->app);
                            ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
                            rc = ORTE_ERR_SILENT;
                            goto error;
                        }
                    }
                }
            }

            /* if all the procs have been mapped, we are done -
             * otherwise, continue on to the next node
             */
            if (total_procs == nprocs_mapped) {
                break;
            }
        }
        if (0 == app->num_procs) {
            app->num_procs = nprocs_mapped;
        }
        if (ORTE_VPID_MAX != total_procs && nprocs_mapped < total_procs) {
            /* couldn't map them all */
            orte_show_help("help-orte-rmaps-ppr.txt", "ppr-too-many-procs",
                           true, app->app, app->num_procs, jdata->map->ppr);
            rc = ORTE_ERR_SILENT;
            goto error;
        }

        /* track the total number of processes we mapped - must update
         * this AFTER we compute vpids so that computation is done
         * correctly
         */
        jdata->num_procs += app->num_procs;

        while (NULL != (item = opal_list_remove_first(&node_list))) {
            OBJ_RELEASE(item);
        }
        OBJ_DESTRUCT(&node_list);
    }
    return ORTE_SUCCESS;

  error:
    while (NULL != (item = opal_list_remove_first(&node_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&node_list);
    return rc;
}

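/* starting at "obj", descend the topology and return the first object
 * (possibly obj itself) that has more than one child - returns NULL if
 * no such object exists at or below this point */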
static hwloc_obj_t find_split(hwloc_topology_t topo, hwloc_obj_t obj)
{
    unsigned k;
    hwloc_obj_t nxt;

    if (1 < obj->arity) {
        return obj;
    }
    for (k=0; k < obj->arity; k++) {
        nxt = find_split(topo, obj->children[k]);
        if (NULL != nxt) {
            return nxt;
        }
    }
    return NULL;
}

/* recursively climb the topology, pruning procs beyond that allowed
 * by the given ppr
 */
static void prune(orte_jobid_t jobid,
                  orte_app_idx_t app_idx,
                  orte_node_t *node,
                  opal_hwloc_level_t *level,
                  orte_vpid_t *nmapped)
{
    hwloc_obj_t obj, top;
    unsigned int i, nobjs;
    hwloc_obj_type_t lvl;
    unsigned cache_level = 0, k;
    int nprocs;
    hwloc_cpuset_t avail;
    int n, limit, nmax, nunder, idx, idxmax = 0;
    orte_proc_t *proc, *pptr, *procmax;
    opal_hwloc_level_t ll;
    char dang[64];
    hwloc_obj_t locale;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:ppr: pruning level %d",
                        *level);

    /* convenience */
    ll = *level;

    /* convenience */
    lvl = opal_hwloc_levels[ll];
    limit = ppr[ll];

    if (0 == limit) {
        /* no limit at this level, so move up if necessary */
        if (0 == ll) {
            /* done */
            return;
        }
        --(*level);
        prune(jobid, app_idx, node, level, nmapped);
        return;
    }

    /* handle the darn cache thing again */
    if (OPAL_HWLOC_L3CACHE_LEVEL == ll) {
        cache_level = 3;
    } else if (OPAL_HWLOC_L2CACHE_LEVEL == ll) {
        cache_level = 2;
    } else if (OPAL_HWLOC_L1CACHE_LEVEL == ll) {
        cache_level = 1;
    }

    /* get the number of resources at this level on this node */
    nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
                                               lvl, cache_level,
                                               OPAL_HWLOC_AVAILABLE);

    /* for each resource, compute the number of procs sitting
     * underneath it and check against the limit
     */
    for (i=0; i < nobjs; i++) {
        obj = opal_hwloc_base_get_obj_by_type(node->topology->topo,
                                              lvl, cache_level,
                                              i, OPAL_HWLOC_AVAILABLE);
        /* get the available cpuset */
        avail = obj->cpuset;

        /* look at the intersection of this object's cpuset and that
         * of each proc in the job/app - if they intersect, then count this proc
         * against the limit
         */
        nprocs = 0;
        for (n=0; n < node->procs->size; n++) {
            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, n))) {
                continue;
            }
            if (proc->name.jobid != jobid ||
                proc->app_idx != app_idx) {
                continue;
            }
            locale = NULL;
            if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                return;
            }
            if (hwloc_bitmap_intersects(avail, locale->cpuset)) {
                nprocs++;
            }
        }
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:ppr: found %d procs limit %d",
                            nprocs, limit);

        /* check against the limit */
        while (limit < nprocs) {
            /* need to remove procs - do this in a semi-intelligent
             * manner to provide a little load balancing by cycling
             * across the objects beneath this one, removing procs
             * in a round-robin fashion until the limit is satisfied
             *
             * NOTE: I'm sure someone more knowledgeable with hwloc
             * will come up with a more efficient way to do this, so
             * consider this a starting point
             */

            /* find the first level below this object that has more
             * than one child - if no such level exists, find_split
             * returns NULL and we cannot prune any further
             */
            top = find_split(node->topology->topo, obj);
            if (NULL == top) {
                goto error;
            }
            hwloc_obj_type_snprintf(dang, 64, top, 1);
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:ppr: SPLIT AT LEVEL %s", dang);

            /* cycle across the children of this object */
            nmax = 0;
            procmax = NULL;
            idx = 0;
            /* find the child with the most procs underneath it */
            for (k=0; k < top->arity && limit < nprocs; k++) {
                /* get this object's available cpuset */
                nunder = 0;
                pptr = NULL;
                for (n=0; n < node->procs->size; n++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, n))) {
                        continue;
                    }
                    if (proc->name.jobid != jobid ||
                        proc->app_idx != app_idx) {
                        continue;
                    }
                    locale = NULL;
                    if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
                        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                        return;
                    }
                    if (hwloc_bitmap_intersects(top->children[k]->cpuset, locale->cpuset)) {
                        nunder++;
                        if (NULL == pptr) {
                            /* save the location of the first proc under this object */
                            pptr = proc;
                            idx = n;
                        }
                    }
                }
                if (nmax < nunder) {
                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                        "mca:rmaps:ppr: PROCS UNDER CHILD %d %d MAX %d",
                                        k, nunder, nmax);
                    nmax = nunder;
                    procmax = pptr;
                    idxmax = idx;
                }
            }
            if (NULL == procmax) {
                /* can't find anything to remove - error out */
                goto error;
            }
            /* remove it */
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:ppr: removing proc at posn %d",
                                idxmax);
            opal_pointer_array_set_item(node->procs, idxmax, NULL);
            node->num_procs--;
            node->slots_inuse--;
            if (node->slots_inuse < 0) {
                node->slots_inuse = 0;
            }
            nprocs--;
            *nmapped -= 1;
            OBJ_RELEASE(procmax);
        }
    }
    /* finished with this level - move up if necessary */
    if (0 == ll) {
        return;
    }
    --(*level);
    prune(jobid, app_idx, node, level, nmapped);
    return;

 error:
    opal_output(0, "INFINITE LOOP");
}

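/* second-pass entry point: procs have already been placed on nodes by
 * the ppr mapper, so here we simply record a hwloc locale for each one,
 * filling each object at the mapping level with up to the requested
 * number of procs before moving on to the next object */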
static int assign_locations(orte_job_t *jdata)
{
    int i, j, m, n;
    mca_base_component_t *c=&mca_rmaps_ppr_component.base_version;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_app_context_t *app;
    hwloc_obj_type_t level;
    hwloc_obj_t obj;
    unsigned int cache_level=0;
    int ppr, cnt, nobjs, nprocs_mapped;
    char **ppr_req, **ck;

    if (NULL == jdata->map->last_mapper ||
        0 != strcasecmp(jdata->map->last_mapper, c->mca_component_name)) {
        /* the mapping was not done by us, so it isn't ours to handle */
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:ppr: job %s not using ppr assign: %s",
                            ORTE_JOBID_PRINT(jdata->jobid),
                            (NULL == jdata->map->last_mapper) ? "NULL" : jdata->map->last_mapper);
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:ppr: assigning locations for job %s with ppr %s policy %s",
                        ORTE_JOBID_PRINT(jdata->jobid), jdata->map->ppr,
                        orte_rmaps_base_print_mapping(jdata->map->mapping));

    /* pickup the object level */
    if (ORTE_MAPPING_BYNODE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        level = HWLOC_OBJ_MACHINE;
    } else if (ORTE_MAPPING_BYHWTHREAD == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        level = HWLOC_OBJ_PU;
    } else if (ORTE_MAPPING_BYCORE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        level = HWLOC_OBJ_CORE;
    } else if (ORTE_MAPPING_BYSOCKET == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        level = HWLOC_OBJ_SOCKET;
    } else if (ORTE_MAPPING_BYL1CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        level = HWLOC_OBJ_L1CACHE;
        cache_level = 1;
    } else if (ORTE_MAPPING_BYL2CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        level = HWLOC_OBJ_L2CACHE;
        cache_level = 2;
    } else if (ORTE_MAPPING_BYL3CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        level = HWLOC_OBJ_L3CACHE;
        cache_level = 3;
    } else if (ORTE_MAPPING_BYNUMA == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        level = HWLOC_OBJ_NUMANODE;
    } else {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

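    /* note: only the count from the first "<count>:<resource>" pair in the
     * ppr string is used here - the object level itself was already derived
     * from the mapping policy above */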
    /* get the ppr value */
    ppr_req = opal_argv_split(jdata->map->ppr, ',');
    ck = opal_argv_split(ppr_req[0], ':');
    ppr = strtol(ck[0], NULL, 10);
    opal_argv_free(ck);
    opal_argv_free(ppr_req);

    /* start assigning procs to objects, filling each object as we go until
     * all procs are assigned. */
    for (n=0; n < jdata->apps->size; n++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) {
            continue;
        }
        nprocs_mapped = 0;
        for (m=0; m < jdata->map->nodes->size; m++) {
            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) {
                continue;
            }
            if (NULL == node->topology || NULL == node->topology->topo) {
                orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing",
                               true, node->name);
                return ORTE_ERR_SILENT;
            }
            if (HWLOC_OBJ_MACHINE == level) {
                obj = hwloc_get_root_obj(node->topology->topo);
                for (j=0; j < node->procs->size; j++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                        continue;
                    }
                    if (proc->name.jobid != jdata->jobid) {
                        continue;
                    }
                    orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
                }
            } else {
                /* get the number of resources on this node at this level */
                nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
                                                           level, cache_level,
                                                           OPAL_HWLOC_AVAILABLE);

                /* map the specified number of procs to each such resource on this node,
                 * recording the locale of each proc so we know its cpuset
                 */
                for (i=0; i < nobjs; i++) {
                    cnt = 0;
                    obj = opal_hwloc_base_get_obj_by_type(node->topology->topo,
                                                          level, cache_level,
                                                          i, OPAL_HWLOC_AVAILABLE);
                    for (j=0; j < node->procs->size && cnt < ppr && nprocs_mapped < app->num_procs; j++) {
                        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                            continue;
                        }
                        if (proc->name.jobid != jdata->jobid) {
                            continue;
                        }
                        /* if we already assigned it, then skip */
                        if (orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, NULL, OPAL_PTR)) {
                            continue;
                        }
                        nprocs_mapped++;
                        cnt++;
                        orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
                    }
                }
            }
        }
    }
    return ORTE_SUCCESS;
}
