This source file includes the following definitions.
- mindist_map
- assign_locations
   1 
   2 
   3 
   4 
   5 
   6 
   7 
   8 
   9 
  10 
  11 
  12 
  13 
  14 
  15 
  16 
  17 
  18 
  19 
  20 
  21 
  22 
  23 
  24 
  25 
  26 
  27 #include "orte_config.h"
  28 #include "orte/constants.h"
  29 #include "orte/types.h"
  30 
  31 #include <errno.h>
  32 #ifdef HAVE_UNISTD_H
  33 #include <unistd.h>
  34 #endif  
  35 #include <string.h>
  36 
  37 #include "opal/mca/base/mca_base_var.h"
  38 
  39 #include "orte/util/show_help.h"
  40 #include "orte/mca/errmgr/errmgr.h"
  41 #include "orte/util/error_strings.h"
  42 
  43 #include "orte/mca/rmaps/base/rmaps_private.h"
  44 #include "orte/mca/rmaps/base/base.h"
  45 #include "orte/mca/rmaps/mindist/rmaps_mindist.h"
  46 
  47 static int mindist_map(orte_job_t *jdata);
  48 static int assign_locations(orte_job_t *jdata);
  49 
  50 orte_rmaps_base_module_t orte_rmaps_mindist_module = {
  51     .map_job = mindist_map,
  52     .assign_locations = assign_locations
  53 };
  54 
  55 
  56 
  57 
  58 static int mindist_map(orte_job_t *jdata)
  59 {
  60     orte_app_context_t *app;
  61     int i, j;
  62     unsigned int k;
  63     hwloc_obj_t obj = NULL;
  64     opal_list_t node_list;
  65     opal_list_t numa_list;
  66     opal_list_item_t *item;
  67     opal_list_item_t *numa_item;
  68     opal_rmaps_numa_node_t *numa;
  69     orte_node_t *node;
  70     orte_proc_t *proc;
  71     int nprocs_mapped;
  72     int navg=0, nextra=0;
  73     orte_std_cntr_t num_nodes, num_slots;
  74     unsigned int npus, total_npus, num_procs_to_assign=0, required;
  75     int rc;
  76     mca_base_component_t *c = &mca_rmaps_mindist_component.base_version;
  77     bool initial_map=true;
  78     bool bynode = false;
  79     float balance;
  80     int extra_procs_to_assign=0, nxtra_nodes=0;
  81     bool add_one=false;
  82     bool oversubscribed=false;
  83     int ret;
  84 
  85     
  86 
  87 
  88     if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
  89         opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
  90                             "mca:rmaps:mindist: job %s is being restarted - mindist cannot map",
  91                             ORTE_JOBID_PRINT(jdata->jobid));
  92         return ORTE_ERR_TAKE_NEXT_OPTION;
  93     }
  94     if (NULL != jdata->map->req_mapper &&
  95         0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) {
  96         
  97         opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
  98                             "mca:rmaps:mindist: job %s not using mindist mapper",
  99                             ORTE_JOBID_PRINT(jdata->jobid));
 100         return ORTE_ERR_TAKE_NEXT_OPTION;
 101     }
 102     if (ORTE_MAPPING_BYDIST != ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
 103         
 104         opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
 105                             "mca:rmaps:mindist: job %s not using mindist mapper",
 106                             ORTE_JOBID_PRINT(jdata->jobid));
 107         return ORTE_ERR_TAKE_NEXT_OPTION;
 108     }
 109 
 110     
 111 
 112 
 113 
 114 
 115 
 116 
 117 
 118 
 119 
 120 
 121 
 122 
 123 
 124 
 125     if (ORTE_MAPPING_SPAN & jdata->map->mapping) {
 126         
 127         bynode = true;
 128     } else {
 129         
 130         bynode = false;
 131     }
 132 
 133     opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
 134                         "mca:rmaps:mindist: mapping job %s",
 135                         ORTE_JOBID_PRINT(jdata->jobid));
 136 
 137     
 138     if (NULL != jdata->map->last_mapper) {
 139         free(jdata->map->last_mapper);
 140     }
 141     jdata->map->last_mapper = strdup(c->mca_component_name);
 142 
 143     
 144     jdata->num_procs = 0;
 145 
 146     
 147     for(i=0; i < jdata->apps->size; i++) {
 148         if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
 149             continue;
 150         }
 151 
 152         
 153         OBJ_CONSTRUCT(&node_list, opal_list_t);
 154 
 155         
 156 
 157 
 158 
 159         if (0 == app->num_procs && 1 < jdata->num_apps) {
 160             orte_show_help("help-orte-rmaps-md.txt", "multi-apps-and-zero-np",
 161                            true, jdata->num_apps, NULL);
 162             rc = ORTE_ERR_SILENT;
 163             goto error;
 164         }
 165 
 166         
 167 
 168 
 169 
 170         if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
 171                                                                   jdata->map->mapping, initial_map, false))) {
 172             ORTE_ERROR_LOG(rc);
 173             goto error;
 174         }
 175 
 176         
 177         if (num_slots < (int)app->num_procs) {
 178             if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
 179                 orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
 180                                true, app->num_procs, app->app, orte_process_info.nodename);
 181                 ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
 182                 return ORTE_ERR_SILENT;
 183             }
 184             oversubscribed = true;
 185         }
 186 
 187         num_nodes = (orte_std_cntr_t)opal_list_get_size(&node_list);
 188         
 189         initial_map = false;
 190 
 191         
 192         jdata->bookmark = orte_rmaps_base_get_starting_point(&node_list, jdata);
 193 
 194         if (0 == app->num_procs) {
 195             
 196             app->num_procs = num_slots;
 197         }
 198 
 199         nprocs_mapped = 0;
 200         if (!num_nodes) {
 201             rc = ORTE_ERR_SILENT;
 202             goto error;
 203         }
 204         do {
 205             if (bynode || (app->num_procs > num_slots)) {
 206                 
 207                 bynode = true;
 208                 
 209                 navg = ((int)app->num_procs - nprocs_mapped) / num_nodes;
 210                 nextra = app->num_procs - navg * num_nodes;
 211                 num_procs_to_assign = navg;
 212                 if (nextra > 0) {
 213                     num_procs_to_assign++;
 214                 }
 215                 
 216                 balance = (float)(((int)app->num_procs - nprocs_mapped) - (navg * num_nodes)) / (float)num_nodes;
 217                 extra_procs_to_assign = (int)balance;
 218                 nxtra_nodes = 0;
 219                 add_one = false;
 220                 if (0 < (balance - (float)extra_procs_to_assign)) {
 221                     
 222                     nxtra_nodes = ((int)app->num_procs - nprocs_mapped) - ((navg + extra_procs_to_assign) * num_nodes);
 223                     
 224 
 225 
 226                     extra_procs_to_assign++;
 227                     
 228                     add_one = true;
 229                 }
 230             }
 231 
 232             num_nodes = 0;
 233             
 234             for (item = opal_list_get_first(&node_list);
 235                     item != opal_list_get_end(&node_list);
 236                     item = opal_list_get_next(item)) {
 237                 node = (orte_node_t*)item;
 238 
 239                 if (NULL == node->topology || NULL == node->topology->topo) {
 240                     orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-topology",
 241                             true, node->name);
 242                     rc = ORTE_ERR_SILENT;
 243                     goto error;
 244                 }
 245                 
 246 
 247 
 248                 obj = hwloc_get_root_obj(node->topology->topo);
 249                 if (NULL == obj) {
 250                     orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-topology",
 251                             true, node->name);
 252                     rc = ORTE_ERR_SILENT;
 253                     goto error;
 254                 }
 255 
 256                 num_nodes++;
 257 
 258                 
 259                 if (opal_hwloc_use_hwthreads_as_cpus) {
 260                     total_npus = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, HWLOC_OBJ_PU, 0, OPAL_HWLOC_AVAILABLE);
 261                 } else {
 262                     total_npus = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, HWLOC_OBJ_CORE, 0, OPAL_HWLOC_AVAILABLE);
 263                 }
 264 
 265                 if (bynode) {
 266                     if (oversubscribed) {
 267                         
 268                         if (add_one) {
 269                             if (0 == nxtra_nodes) {
 270                                 --extra_procs_to_assign;
 271                                 add_one = false;
 272                             } else {
 273                                 --nxtra_nodes;
 274                             }
 275                         }
 276                         
 277                         num_procs_to_assign = navg + extra_procs_to_assign;
 278                     }else if (node->slots <= node->slots_inuse) {
 279                         
 280                         continue;
 281                     } else {
 282                         
 283 
 284 
 285 
 286 
 287 
 288 
 289                         
 290                         if (add_one) {
 291                             if (0 == nxtra_nodes) {
 292                                 --extra_procs_to_assign;
 293                                 add_one = false;
 294                             } else {
 295                                 --nxtra_nodes;
 296                             }
 297                         }
 298                         
 299                         if ((node->slots - node->slots_inuse) < (navg + extra_procs_to_assign)) {
 300                             num_procs_to_assign = node->slots - node->slots_inuse;
 301                             
 302                             if (num_procs_to_assign == 0) {
 303                                 continue;
 304                             }
 305                         } else {
 306                         
 307                             num_procs_to_assign = navg + extra_procs_to_assign;
 308                         }
 309                         opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
 310                                             "mca:rmaps:mindist: %s node %s avg %d assign %d extra %d",
 311                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name,
 312                                             navg, num_procs_to_assign, extra_procs_to_assign);
 313                     }
 314                 } else {
 315                     num_procs_to_assign = ((int)app->num_procs - nprocs_mapped) > node->slots ?
 316                             node->slots : ((int)app->num_procs - nprocs_mapped);
 317                 }
 318 
 319                 if (bynode) {
 320                     if (total_npus < num_procs_to_assign) {
 321                         
 322                         if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
 323                             orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
 324                                     true, app->num_procs, app->app);
 325                             rc = ORTE_ERR_SILENT;
 326                             ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
 327                             goto error;
 328                         } else {
 329                             ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED);
 330                             ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_OVERSUBSCRIBED);
 331                         }
 332                     }
 333                 }
 334                 
 335 
 336                 opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, HWLOC_OBJ_NODE, 0, OPAL_HWLOC_AVAILABLE);
 337                 OBJ_CONSTRUCT(&numa_list, opal_list_t);
 338                 ret = opal_hwloc_get_sorted_numa_list(node->topology->topo, orte_rmaps_base.device, &numa_list);
 339                 if (ret > 1) {
 340                     orte_show_help("help-orte-rmaps-md.txt", "orte-rmaps-mindist:several-devices",
 341                                    true, orte_rmaps_base.device, ret, node->name);
 342                     ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT);
 343                     rc = ORTE_ERR_TAKE_NEXT_OPTION;
 344                     goto error;
 345                 } else if (ret < 0) {
 346                     orte_show_help("help-orte-rmaps-md.txt", "orte-rmaps-mindist:device-not-found",
 347                             true, orte_rmaps_base.device, node->name);
 348                     ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT);
 349                     rc = ORTE_ERR_TAKE_NEXT_OPTION;
 350                     goto error;
 351                 }
 352                 if (opal_list_get_size(&numa_list) > 0) {
 353                     j = 0;
 354                     required = 0;
 355                     OPAL_LIST_FOREACH(numa, &numa_list, opal_rmaps_numa_node_t) {
 356                         
 357                         if (NULL == (obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, HWLOC_OBJ_NODE, 0, numa->index, OPAL_HWLOC_AVAILABLE))) {
 358                             ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
 359                             return ORTE_ERR_NOT_FOUND;
 360                         }
 361                         npus = opal_hwloc_base_get_npus(node->topology->topo, obj);
 362                         if (bynode) {
 363                             required = num_procs_to_assign;
 364                         } else {
 365                             required = (num_procs_to_assign-j) > npus ? npus : (num_procs_to_assign-j);
 366                         }
 367                         for (k = 0; (k < required) && (nprocs_mapped < app->num_procs); k++) {
 368                             if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, i))) {
 369                                 rc = ORTE_ERR_OUT_OF_RESOURCE;
 370                                 goto error;
 371                             }
 372                             nprocs_mapped++;
 373                             j++;
 374                             orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
 375                         }
 376                         if ((nprocs_mapped == (int)app->num_procs) || ((int)num_procs_to_assign == j)) {
 377                             break;
 378                         }
 379                     }
 380                     if (0 != j) {
 381                         
 382                         if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
 383                             ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
 384                             OBJ_RETAIN(node);  
 385                             jdata->map->num_nodes++;
 386                             opal_pointer_array_add(jdata->map->nodes, node);
 387                         }
 388                         opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
 389                                 "mca:rmaps:mindist: assigned %d procs to node %s",
 390                                 j, node->name);
 391                     }
 392                 } else {
 393                     if (hwloc_get_nbobjs_by_type(node->topology->topo, HWLOC_OBJ_SOCKET) > 1) {
 394                         
 395                         orte_show_help("help-orte-rmaps-md.txt", "orte-rmaps-mindist:no-pci-locality-info",
 396                                 true, node->name);
 397                     }
 398                     
 399                     ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT);
 400                     rc = ORTE_ERR_TAKE_NEXT_OPTION;
 401                     goto error;
 402                 }
 403                 while (NULL != (numa_item = opal_list_remove_first(&numa_list))) {
 404                     OBJ_RELEASE(numa_item);
 405                 }
 406                 OBJ_DESTRUCT(&numa_list);
 407                 if (bynode) {
 408                     nextra--;
 409                     if (nextra == 0) {
 410                         num_procs_to_assign--;
 411                     }
 412                 }
 413             }
 414         } while(bynode && nprocs_mapped < app->num_procs && 0 < num_nodes);
 415 
 416         
 417 
 418 
 419 
 420         jdata->num_procs += app->num_procs;
 421 
 422         
 423 
 424 
 425         while (NULL != (item = opal_list_remove_first(&node_list))) {
 426             OBJ_RELEASE(item);
 427         }
 428         OBJ_DESTRUCT(&node_list);
 429     }
 430     free(orte_rmaps_base.device);
 431 
 432     return ORTE_SUCCESS;
 433 
 434 error:
 435     while(NULL != (item = opal_list_remove_first(&node_list))) {
 436         OBJ_RELEASE(item);
 437     }
 438     OBJ_DESTRUCT(&node_list);
 439 
 440     return rc;
 441 }
 442 
 443 static int assign_locations(orte_job_t *jdata)
 444 {
 445     int j, k, m, n, npus;
 446     orte_app_context_t *app;
 447     orte_node_t *node;
 448     orte_proc_t *proc;
 449     hwloc_obj_t obj=NULL;
 450     mca_base_component_t *c = &mca_rmaps_mindist_component.base_version;
 451     int rc;
 452     opal_list_t numa_list;
 453     opal_rmaps_numa_node_t *numa;
 454 
 455     if (NULL == jdata->map->last_mapper||
 456         0 != strcasecmp(jdata->map->last_mapper, c->mca_component_name)) {
 457         
 458         opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
 459                             "mca:rmaps:mindist: job %s not using mindist mapper",
 460                             ORTE_JOBID_PRINT(jdata->jobid));
 461         return ORTE_ERR_TAKE_NEXT_OPTION;
 462     }
 463 
 464     opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
 465                         "mca:rmaps:mindist: assign locations for job %s",
 466                         ORTE_JOBID_PRINT(jdata->jobid));
 467 
 468     
 469 
 470 
 471 
 472     for (n=0; n < jdata->apps->size; n++) {
 473         if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) {
 474             continue;
 475         }
 476         for (m=0; m < jdata->map->nodes->size; m++) {
 477             if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) {
 478                 continue;
 479             }
 480             if (NULL == node->topology || NULL == node->topology->topo) {
 481                 orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing",
 482                                true, node->name);
 483                 return ORTE_ERR_SILENT;
 484             }
 485 
 486             
 487 
 488             opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, HWLOC_OBJ_NODE, 0, OPAL_HWLOC_AVAILABLE);
 489             OBJ_CONSTRUCT(&numa_list, opal_list_t);
 490             rc = opal_hwloc_get_sorted_numa_list(node->topology->topo, orte_rmaps_base.device, &numa_list);
 491             if (rc > 1) {
 492                 orte_show_help("help-orte-rmaps-md.txt", "orte-rmaps-mindist:several-devices",
 493                                true, orte_rmaps_base.device, rc, node->name);
 494                 ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT);
 495                 OPAL_LIST_DESTRUCT(&numa_list);
 496                 return ORTE_ERR_TAKE_NEXT_OPTION;
 497             } else if (rc < 0) {
 498                 orte_show_help("help-orte-rmaps-md.txt", "orte-rmaps-mindist:device-not-found",
 499                         true, orte_rmaps_base.device, node->name);
 500                 ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT);
 501                 OPAL_LIST_DESTRUCT(&numa_list);
 502                 return ORTE_ERR_TAKE_NEXT_OPTION;
 503             }
 504             j = 0;
 505             OPAL_LIST_FOREACH(numa, &numa_list, opal_rmaps_numa_node_t) {
 506                 
 507                 if (NULL == (obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, HWLOC_OBJ_NODE, 0, numa->index, OPAL_HWLOC_AVAILABLE))) {
 508                     ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
 509                     OPAL_LIST_DESTRUCT(&numa_list);
 510                     return ORTE_ERR_NOT_FOUND;
 511                 }
 512                 npus = opal_hwloc_base_get_npus(node->topology->topo, obj);
 513                 
 514 
 515                 for (k = j; k < node->procs->size && 0 < npus; k++) {
 516                     if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, k))) {
 517                         continue;
 518                     }
 519                     if (proc->name.jobid != jdata->jobid) {
 520                         continue;
 521                     }
 522                     orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
 523                     opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
 524                         "mca:rmaps:mindist: assigning proc %d to numa %d", k, numa->index);
 525                     ++j;
 526                     --npus;
 527                 }
 528             }
 529             OPAL_LIST_DESTRUCT(&numa_list);
 530         }
 531     }
 532 
 533     return ORTE_SUCCESS;
 534 }