This source file includes the following definitions.
- orte_ras_base_display_alloc
- orte_ras_base_allocate
- orte_ras_base_add_hosts
   1 
   2 
   3 
   4 
   5 
   6 
   7 
   8 
   9 
  10 
  11 
  12 
  13 
  14 
  15 
  16 
  17 
  18 
  19 
  20 
  21 
  22 
  23 
  24 #include "orte_config.h"
  25 
  26 #include <string.h>
  27 
  28 #include "orte/constants.h"
  29 #include "orte/types.h"
  30 
  31 #include "orte/mca/mca.h"
  32 #include "opal/mca/base/base.h"
  33 #include "opal/class/opal_list.h"
  34 #include "opal/util/output.h"
  35 #include "opal/util/printf.h"
  36 #include "opal/dss/dss.h"
  37 #include "opal/util/argv.h"
  38 #include "opal/mca/if/if.h"
  39 
  40 #include "orte/util/show_help.h"
  41 #include "orte/mca/errmgr/errmgr.h"
  42 #include "orte/mca/rmaps/base/base.h"
  43 #include "orte/util/name_fns.h"
  44 #include "orte/runtime/orte_globals.h"
  45 #include "orte/runtime/orte_wait.h"
  46 #include "orte/util/hostfile/hostfile.h"
  47 #include "orte/util/dash_host/dash_host.h"
  48 #include "orte/util/proc_info.h"
  49 #include "orte/util/comm/comm.h"
  50 #include "orte/util/error_strings.h"
  51 #include "orte/util/threads.h"
  52 #include "orte/mca/state/state.h"
  53 #include "orte/runtime/orte_quit.h"
  54 
  55 #include "orte/mca/ras/base/ras_private.h"
  56 
  57 
  58 void orte_ras_base_display_alloc(void)
  59 {
  60     char *tmp=NULL, *tmp2, *tmp3;
  61     int i, istart;
  62     orte_node_t *alloc;
  63 
  64     if (orte_xml_output) {
  65         opal_asprintf(&tmp, "<allocation>\n");
  66     } else {
  67         opal_asprintf(&tmp, "\n======================   ALLOCATED NODES   ======================\n");
  68     }
  69     if (orte_hnp_is_allocated) {
  70             istart = 0;
  71     } else {
  72         istart = 1;
  73     }
  74     for (i=istart; i < orte_node_pool->size; i++) {
  75         if (NULL == (alloc = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
  76             continue;
  77         }
  78         if (orte_xml_output) {
  79             
  80             opal_asprintf(&tmp2, "\t<host name=\"%s\" slots=\"%d\" max_slots=\"%d\" slots_inuse=\"%d\">\n",
  81                      (NULL == alloc->name) ? "UNKNOWN" : alloc->name,
  82                      (int)alloc->slots, (int)alloc->slots_max, (int)alloc->slots_inuse);
  83         } else {
  84             opal_asprintf(&tmp2, "\t%s: flags=0x%02x slots=%d max_slots=%d slots_inuse=%d state=%s\n",
  85                      (NULL == alloc->name) ? "UNKNOWN" : alloc->name, alloc->flags,
  86                      (int)alloc->slots, (int)alloc->slots_max, (int)alloc->slots_inuse,
  87                      orte_node_state_to_str(alloc->state));
  88         }
  89         if (NULL == tmp) {
  90             tmp = tmp2;
  91         } else {
  92             opal_asprintf(&tmp3, "%s%s", tmp, tmp2);
  93             free(tmp);
  94             free(tmp2);
  95             tmp = tmp3;
  96         }
  97     }
  98     if (orte_xml_output) {
  99         fprintf(orte_xml_fp, "%s</allocation>\n", tmp);
 100         fflush(orte_xml_fp);
 101     } else {
 102         opal_output(orte_clean_output, "%s=================================================================\n", tmp);
 103     }
 104     free(tmp);
 105 }
 106 
 107 
 108 
 109 
 110 
 111 void orte_ras_base_allocate(int fd, short args, void *cbdata)
 112 {
 113     int rc;
 114     orte_job_t *jdata;
 115     opal_list_t nodes;
 116     orte_node_t *node;
 117     orte_std_cntr_t i;
 118     orte_app_context_t *app;
 119     orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
 120     char *hosts=NULL;
 121 
 122     ORTE_ACQUIRE_OBJECT(caddy);
 123 
 124     OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
 125                          "%s ras:base:allocate",
 126                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 127 
 128     
 129     jdata = caddy->jdata;
 130 
 131     
 132 
 133 
 134     if (orte_ras_base.allocation_read) {
 135 
 136         OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
 137                              "%s ras:base:allocate allocation already read",
 138                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 139         goto next_state;
 140     }
 141     orte_ras_base.allocation_read = true;
 142 
 143     
 144 
 145 
 146 
 147 
 148 
 149 
 150 
 151 
 152     
 153     OBJ_CONSTRUCT(&nodes, opal_list_t);
 154 
 155     
 156 
 157 
 158     if (NULL != orte_ras_base.active_module)  {
 159         
 160         if (ORTE_SUCCESS != (rc = orte_ras_base.active_module->allocate(jdata, &nodes))) {
 161             if (ORTE_ERR_ALLOCATION_PENDING == rc) {
 162                 
 163                 OBJ_DESTRUCT(&nodes);
 164                 OBJ_RELEASE(caddy);
 165                 return;
 166             }
 167             if (ORTE_ERR_SYSTEM_WILL_BOOTSTRAP == rc) {
 168                 
 169 
 170 
 171 
 172                 goto addlocal;
 173             }
 174             if (ORTE_ERR_TAKE_NEXT_OPTION == rc) {
 175                 
 176 
 177 
 178 
 179 
 180                 if (orte_allocation_required) {
 181                     
 182                     OBJ_DESTRUCT(&nodes);
 183                     orte_show_help("help-ras-base.txt", "ras-base:no-allocation", true);
 184                     ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 185                     OBJ_RELEASE(caddy);
 186                     return;
 187                 } else {
 188                     
 189 
 190 
 191                     goto addlocal;
 192                 }
 193             }
 194             ORTE_ERROR_LOG(rc);
 195             OBJ_DESTRUCT(&nodes);
 196             ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 197             OBJ_RELEASE(caddy);
 198             return;
 199         }
 200     }
 201     
 202     if (!opal_list_is_empty(&nodes)) {
 203         
 204         orte_managed_allocation = true;
 205         
 206 
 207         opal_if_do_not_resolve = true;
 208         
 209 
 210 
 211         if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
 212             ORTE_ERROR_LOG(rc);
 213             OBJ_DESTRUCT(&nodes);
 214             ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 215             OBJ_RELEASE(caddy);
 216             return;
 217         }
 218         OBJ_DESTRUCT(&nodes);
 219         goto DISPLAY;
 220     } else if (orte_allocation_required) {
 221         
 222 
 223 
 224         OBJ_DESTRUCT(&nodes);
 225         orte_show_help("help-ras-base.txt", "ras-base:no-allocation", true);
 226         ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 227         OBJ_RELEASE(caddy);
 228         return;
 229     }
 230 
 231     OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
 232                          "%s ras:base:allocate nothing found in module - proceeding to hostfile",
 233                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 234 
 235     
 236 
 237 
 238     if (NULL != orte_rankfile) {
 239         OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
 240                              "%s ras:base:allocate parsing rankfile %s",
 241                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 242                              orte_rankfile));
 243 
 244         
 245         if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes,
 246                                                                orte_rankfile))) {
 247             OBJ_DESTRUCT(&nodes);
 248             ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 249             OBJ_RELEASE(caddy);
 250             return;
 251         }
 252     }
 253 
 254     
 255 
 256 
 257     if (!opal_list_is_empty(&nodes)) {
 258         
 259 
 260 
 261         if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
 262             ORTE_ERROR_LOG(rc);
 263             ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 264             OBJ_RELEASE(caddy);
 265             return;
 266         }
 267         
 268         if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
 269             ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
 270         }
 271         
 272         OBJ_DESTRUCT(&nodes);
 273         goto DISPLAY;
 274     }
 275 
 276     
 277 
 278 
 279     for (i=0; i < jdata->apps->size; i++) {
 280         if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
 281             continue;
 282         }
 283         if (!orte_soft_locations &&
 284             orte_get_attribute(&app->attributes, ORTE_APP_DASH_HOST, (void**)&hosts, OPAL_STRING)) {
 285             
 286 
 287 
 288 
 289 
 290 
 291 
 292             OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
 293                                  "%s ras:base:allocate adding dash_hosts",
 294                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 295             if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&nodes, hosts, true))) {
 296                 free(hosts);
 297                 OBJ_DESTRUCT(&nodes);
 298                 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 299                 OBJ_RELEASE(caddy);
 300                 return;
 301             }
 302             free(hosts);
 303         }
 304     }
 305 
 306     
 307 
 308 
 309     if (!opal_list_is_empty(&nodes)) {
 310         
 311 
 312 
 313         if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
 314             ORTE_ERROR_LOG(rc);
 315             ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 316             OBJ_RELEASE(caddy);
 317             return;
 318         }
 319         
 320         OBJ_DESTRUCT(&nodes);
 321         goto DISPLAY;
 322     }
 323 
 324     
 325 
 326 
 327 
 328 
 329 
 330 
 331 
 332 
 333 
 334 
 335 
 336 
 337 
 338 
 339     for (i=0; i < jdata->apps->size; i++) {
 340         if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
 341             continue;
 342         }
 343         if (orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, (void**)&hosts, OPAL_STRING)) {
 344             OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
 345                                  "%s ras:base:allocate adding hostfile %s",
 346                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hosts));
 347 
 348             
 349             if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes, hosts))) {
 350                 free(hosts);
 351                 OBJ_DESTRUCT(&nodes);
 352                 
 353                 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 354                 OBJ_RELEASE(caddy);
 355                 return;
 356             }
 357             free(hosts);
 358         }
 359     }
 360 
 361     
 362 
 363 
 364     if (!opal_list_is_empty(&nodes)) {
 365         
 366 
 367 
 368         if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
 369             ORTE_ERROR_LOG(rc);
 370             ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 371             OBJ_RELEASE(caddy);
 372             return;
 373         }
 374         
 375         OBJ_DESTRUCT(&nodes);
 376         goto DISPLAY;
 377     }
 378 
 379     
 380     if (NULL != orte_default_hostfile) {
 381         OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
 382                              "%s ras:base:allocate parsing default hostfile %s",
 383                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 384                              orte_default_hostfile));
 385 
 386         
 387         if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes,
 388                                                                orte_default_hostfile))) {
 389             OBJ_DESTRUCT(&nodes);
 390             ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 391             OBJ_RELEASE(caddy);
 392             return;
 393         }
 394     }
 395 
 396     
 397 
 398 
 399     if (!opal_list_is_empty(&nodes)) {
 400         
 401 
 402 
 403         if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
 404             ORTE_ERROR_LOG(rc);
 405             ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 406             OBJ_RELEASE(caddy);
 407             return;
 408         }
 409         
 410         OBJ_DESTRUCT(&nodes);
 411         goto DISPLAY;
 412     }
 413 
 414     OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
 415                          "%s ras:base:allocate nothing found in hostfiles - inserting current node",
 416                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 417 
 418   addlocal:
 419     
 420 
 421 
 422     node = OBJ_NEW(orte_node_t);
 423     if (NULL == node) {
 424         ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
 425         OBJ_DESTRUCT(&nodes);
 426         ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 427         OBJ_RELEASE(caddy);
 428         return;
 429     }
 430     
 431 
 432 
 433     node->name = strdup(orte_process_info.nodename);
 434     node->state = ORTE_NODE_STATE_UP;
 435     node->slots_inuse = 0;
 436     node->slots_max = 0;
 437     node->slots = 1;
 438     opal_list_append(&nodes, &node->super);
 439     
 440     orte_hnp_is_allocated = true;
 441 
 442     
 443 
 444 
 445     if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
 446         ORTE_ERROR_LOG(rc);
 447         OBJ_DESTRUCT(&nodes);
 448         ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 449         OBJ_RELEASE(caddy);
 450         return;
 451     }
 452     OBJ_DESTRUCT(&nodes);
 453 
 454   DISPLAY:
 455     
 456     if (4 < opal_output_get_verbosity(orte_ras_base_framework.framework_output)) {
 457         orte_ras_base_display_alloc();
 458     }
 459 
 460   next_state:
 461     
 462     if (orte_report_events) {
 463         if (ORTE_SUCCESS != (rc = orte_util_comm_report_event(ORTE_COMM_EVENT_ALLOCATE))) {
 464             ORTE_ERROR_LOG(rc);
 465             ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
 466             OBJ_RELEASE(caddy);
 467         }
 468     }
 469 
 470     
 471     jdata->total_slots_alloc = orte_ras_base.total_slots_alloc;
 472 
 473     
 474     ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOCATION_COMPLETE);
 475 
 476     
 477     OBJ_RELEASE(caddy);
 478 }
 479 
 480 int orte_ras_base_add_hosts(orte_job_t *jdata)
 481 {
 482     int rc;
 483     opal_list_t nodes;
 484     int i, n;
 485     orte_app_context_t *app;
 486     orte_node_t *node, *next, *nptr;
 487     char *hosts;
 488 
 489     
 490     OBJ_CONSTRUCT(&nodes, opal_list_t);
 491 
 492     
 493 
 494 
 495 
 496 
 497 
 498 
 499 
 500 
 501 
 502 
 503 
 504 
 505     for (i=0; i < jdata->apps->size; i++) {
 506         if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
 507             continue;
 508         }
 509         if (orte_get_attribute(&app->attributes, ORTE_APP_ADD_HOSTFILE, (void**)&hosts, OPAL_STRING)) {
 510             OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
 511                                  "%s ras:base:add_hosts checking add-hostfile %s",
 512                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hosts));
 513 
 514             
 515             if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes, hosts))) {
 516                 ORTE_ERROR_LOG(rc);
 517                 OBJ_DESTRUCT(&nodes);
 518                 free(hosts);
 519                 return rc;
 520             }
 521             
 522             orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, ORTE_ATTR_LOCAL, (void**)hosts, OPAL_STRING);
 523             orte_remove_attribute(&app->attributes, ORTE_APP_ADD_HOSTFILE);
 524             free(hosts);
 525         }
 526     }
 527 
 528     
 529 
 530 
 531 
 532 
 533 
 534 
 535 
 536 
 537     for (i=0; i < jdata->apps->size; i++) {
 538         if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
 539             continue;
 540         }
 541         if (orte_get_attribute(&app->attributes, ORTE_APP_ADD_HOST, (void**)&hosts, OPAL_STRING)) {
 542             opal_output_verbose(5, orte_ras_base_framework.framework_output,
 543                                 "%s ras:base:add_hosts checking add-host %s",
 544                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hosts);
 545             if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&nodes, hosts, true))) {
 546                 ORTE_ERROR_LOG(rc);
 547                 OBJ_DESTRUCT(&nodes);
 548                 free(hosts);
 549                 return rc;
 550             }
 551             
 552             orte_set_attribute(&app->attributes, ORTE_APP_DASH_HOST, ORTE_ATTR_LOCAL, hosts, OPAL_STRING);
 553             orte_remove_attribute(&app->attributes, ORTE_APP_ADD_HOST);
 554             free(hosts);
 555         }
 556     }
 557 
 558     
 559     if (!opal_list_is_empty(&nodes)) {
 560         
 561 
 562 
 563         OPAL_LIST_FOREACH_SAFE(node, next, &nodes, orte_node_t) {
 564             node->state = ORTE_NODE_STATE_ADDED;
 565             for (n=0; n < orte_node_pool->size; n++) {
 566                 if (NULL == (nptr = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, n))) {
 567                     continue;
 568                 }
 569                 if (0 == strcmp(node->name, nptr->name)) {
 570                     opal_list_remove_item(&nodes, &node->super);
 571                     OBJ_RELEASE(node);
 572                     break;
 573                 }
 574             }
 575         }
 576         if (!opal_list_is_empty(&nodes)) {
 577             
 578 
 579 
 580             if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
 581                 ORTE_ERROR_LOG(rc);
 582             }
 583             
 584             orte_nidmap_communicated = false;
 585         }
 586     }
 587     
 588     OPAL_LIST_DESTRUCT(&nodes);
 589 
 590     
 591     if (0 < opal_output_get_verbosity(orte_ras_base_framework.framework_output)) {
 592         orte_ras_base_display_alloc();
 593     }
 594 
 595     return ORTE_SUCCESS;
 596 }